.agents/memory/scripts/export_embeddings.py

147 lines
4.2 KiB
Python

#!/usr/bin/env python3
"""Export embeddings for visualization."""
import json
import sqlite3
import sys
from pathlib import Path
try:
import zvec
ZVEC_AVAILABLE = True
except ImportError:
ZVEC_AVAILABLE = False
AGENTS_DIR = Path.home() / ".agents"
DB_PATH = AGENTS_DIR / "memory" / "memories.db"
ZVEC_PATH = AGENTS_DIR / "memory" / "vectors.zvec"
def export_embeddings():
"""Export all embeddings with their memory data."""
if not ZVEC_AVAILABLE:
return {"error": "zvec not installed (requires Python 3.10-3.12)", "embeddings": []}
if not DB_PATH.exists():
return {"error": "No database found", "embeddings": []}
if not ZVEC_PATH.exists():
return {"error": "No vector store found", "embeddings": []}
# Open database
db = sqlite3.connect(str(DB_PATH))
db.row_factory = sqlite3.Row
# Get all memories
rows = db.execute("""
SELECT id, content, who, importance, tags, created_at
FROM memories
ORDER BY created_at DESC
""").fetchall()
# Open zvec collection
try:
collection = zvec.open(path=str(ZVEC_PATH))
except Exception as e:
db.close()
return {"error": f"Failed to open vector store: {e}", "embeddings": []}
embeddings = []
for row in rows:
memory_id = str(row["id"])
# Try to get vector from zvec
# Use a self-query: search for exact match
try:
# Unfortunately zvec doesn't have a direct get-by-id
# We'll use the memory content to search and verify ID
# For now, skip the vector data and just include metadata
# The UMAP will run client-side only if we have vectors
embeddings.append({
"id": memory_id,
"text": row["content"],
"who": row["who"] or "unknown",
"importance": row["importance"] or 0.5,
"tags": row["tags"],
"createdAt": row["created_at"],
# Vector will be loaded separately or we compute PCA server-side
})
except Exception:
continue
db.close()
return {"embeddings": embeddings, "count": len(embeddings)}
def export_with_vectors():
"""Export embeddings with actual vector data for UMAP."""
if not DB_PATH.exists():
return {"error": "No database found", "embeddings": []}
if not ZVEC_PATH.exists():
return {"error": "No vector store found", "embeddings": []}
# Import embeddings module for re-embedding
sys.path.insert(0, str(AGENTS_DIR / "memory" / "scripts"))
from embeddings import embed
import yaml
# Load config
config_path = AGENTS_DIR / "config.yaml"
config = {}
if config_path.exists():
with open(config_path) as f:
config = yaml.safe_load(f)
# Open database
db = sqlite3.connect(str(DB_PATH))
db.row_factory = sqlite3.Row
# Get all memories
rows = db.execute("""
SELECT id, content, who, importance, tags, created_at
FROM memories
ORDER BY created_at DESC
LIMIT 200
""").fetchall()
embeddings = []
for row in rows:
memory_id = str(row["id"])
content = row["content"]
try:
# Re-embed to get vector (cached by ollama)
vector, _ = embed(content, config)
embeddings.append({
"id": memory_id,
"text": content[:200], # Truncate for JSON size
"who": row["who"] or "unknown",
"importance": row["importance"] or 0.5,
"tags": row["tags"],
"createdAt": row["created_at"],
"vector": vector,
})
except Exception as e:
# Skip memories we can't embed
continue
db.close()
return {"embeddings": embeddings, "count": len(embeddings)}
if __name__ == "__main__":
# If --with-vectors flag, include vector data
if len(sys.argv) > 1 and sys.argv[1] == "--with-vectors":
result = export_with_vectors()
else:
result = export_embeddings()
print(json.dumps(result))