import numpy as np
import ollama
from typing import List, Dict, Tuple, Optional
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans


class SemanticAnalyzer:
    """Generates, stores, and analyzes embeddings for the sections,
    sentences, and variables held in a database backend."""

    def __init__(self, db):
        self.db = db
        self.embedding_dim = 768  # nomic-embed-text produces 768-dim vectors

    def generate_embedding(self, text: str) -> Optional[np.ndarray]:
        """Embed one text with the local Ollama model; returns None on failure."""
        try:
            response = ollama.embeddings(model="nomic-embed-text", prompt=text)
            return np.array(response["embedding"], dtype=np.float32)
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return None

    def chunk_text(
        self, text: str, max_tokens: int = 500, overlap: int = 50
    ) -> List[Tuple[str, int, int]]:
        """Split text into overlapping word-based chunks.

        Returns (chunk_text, char_start, char_end) triples. Character
        offsets are relative to the whitespace-normalized text, i.e.
        " ".join(text.split()).
        """
        words = text.split()
        chunks = []
        start_idx = 0
        while start_idx < len(words):
            end_idx = min(start_idx + max_tokens, len(words))
            chunk_words = words[start_idx:end_idx]
            chunk_text = " ".join(chunk_words)
            # The +1 accounts for the space separating this chunk from the
            # preceding words in the normalized text.
            char_start = len(" ".join(words[:start_idx])) + (1 if start_idx else 0)
            char_end = char_start + len(chunk_text)
            chunks.append((chunk_text, char_start, char_end))
            start_idx += max_tokens - overlap
            if end_idx >= len(words):
                break
        return chunks

    def generate_all_embeddings(self):
        """Embed sections, sentences, and variables and persist the results."""
        sections = self.db.get_sections_with_embeddings()
        sentences = self.db.get_sentences()
        variables = self.db.get_variables()

        print("Generating document-level embeddings...")
        for section in sections:
            if not section["content"]:
                continue
            text = section["content"]
            if len(text) > 2000:
                # Long sections are chunked; only the first chunk is
                # embedded to represent the section.
                chunks = self.chunk_text(text, max_tokens=500, overlap=50)
                if not chunks:
                    continue
                text = chunks[0][0]
            embedding = self.generate_embedding(text)
            if embedding is not None and np.any(embedding):  # skip all-zero vectors
                self.db.add_embedding(section["id"], "section", embedding)

        print("Generating sentence-level embeddings...")
        for sent in sentences[:100]:  # cap at the first 100 sentences
            embedding = self.generate_embedding(sent["text"])
            if embedding is not None and np.any(embedding):
                self.db.add_embedding(sent["id"], "sentence", embedding)

        print("Generating variable-level embeddings...")
        for var in variables:
            text = f"{var['name']}: {var.get('description', '')}"
            embedding = self.generate_embedding(text)
            if embedding is not None and np.any(embedding):
                self.db.add_embedding(var["id"], "variable", embedding)

        print("Embeddings generated and stored in database.")

    def compute_similarities(self, top_k: int = 10):
        """Compute and store the top-k most similar sections for every section."""
        sections = self.db.get_sections_with_embeddings()
        embeddings = []
        section_ids = []
        for section in sections:
            embedding = self.db.get_embedding(section["id"])
            if embedding is not None:
                embeddings.append(embedding)
                section_ids.append(section["id"])

        if len(embeddings) < 2:
            print("Not enough embeddings to compute similarities.")
            return

        embeddings_matrix = np.array(embeddings)
        similarity_matrix = cosine_similarity(embeddings_matrix)

        print(f"Computing top-{top_k} similarities...")
        for i, section_id_1 in enumerate(section_ids):
            similarities = similarity_matrix[i]
            # argsort is ascending; the final entry is the section's
            # self-similarity (1.0), so take the top_k entries just before
            # it, reversed into descending order.
            top_indices = np.argsort(similarities)[-top_k - 1 : -1][::-1]
            for j in top_indices:
                section_id_2 = section_ids[j]
                similarity_score = float(similarities[j])
                if section_id_1 != section_id_2:
                    self.db.add_similarity(section_id_1, section_id_2, similarity_score)

        print("Similarities computed and stored.")

    def find_similar_sections(self, query: str, top_k: int = 5) -> List[Dict]:
        """Rank stored sections by cosine similarity to a free-text query."""
        query_embedding = self.generate_embedding(query)
        if query_embedding is None:
            return []

        sections = self.db.get_sections_with_embeddings()
        similarities = []
        for section in sections:
            embedding = self.db.get_embedding(section["id"])
            if embedding is not None:
                similarity = float(
                    cosine_similarity([query_embedding], [embedding])[0][0]
                )
                similarities.append({"section": section, "similarity": similarity})

        similarities.sort(key=lambda x: x["similarity"], reverse=True)
        return similarities[:top_k]

    def cluster_variables(self, n_clusters: int = 5) -> Dict[int, List[int]]:
        """K-means cluster the variable embeddings; returns cluster id -> variable ids."""
        variables = self.db.get_variables()
        embeddings = []
        var_ids = []
        for var in variables:
            embedding = self.db.get_embedding(var["id"])
            if embedding is not None:
                embeddings.append(embedding)
                var_ids.append(var["id"])

        # Guard against an empty fit, which KMeans would reject.
        if not embeddings:
            print("No variable embeddings available; nothing to cluster.")
            return {}

        if len(embeddings) < n_clusters:
            print(
                f"Not enough variables for {n_clusters} clusters. "
                f"Using {len(embeddings)} clusters."
            )
            n_clusters = len(embeddings)

        embeddings_matrix = np.array(embeddings)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
        cluster_labels = kmeans.fit_predict(embeddings_matrix)

        clusters: Dict[int, List[int]] = {}
        for var_id, label in zip(var_ids, cluster_labels):
            clusters.setdefault(int(label), []).append(var_id)
        return clusters

    def compute_cluster_centers(
        self, clusters: Dict[int, List[int]]
    ) -> Dict[int, np.ndarray]:
        """Compute the mean embedding (centroid) of each cluster."""
        cluster_centers = {}
        for cluster_id, var_ids in clusters.items():
            embeddings = []
            for var_id in var_ids:
                embedding = self.db.get_embedding(var_id)
                if embedding is not None:
                    embeddings.append(embedding)
            if embeddings:
                cluster_centers[cluster_id] = np.mean(embeddings, axis=0)
        return cluster_centers

    def get_embeddings_metadata(self) -> Dict:
        """Count stored embeddings by type and report model settings."""
        sections = self.db.get_sections_with_embeddings()
        sentences = self.db.get_sentences()
        variables = self.db.get_variables()

        section_embeddings = sum(
            1 for s in sections if self.db.get_embedding(s["id"]) is not None
        )
        sentence_embeddings = sum(
            1 for s in sentences if self.db.get_embedding(s["id"]) is not None
        )
        variable_embeddings = sum(
            1 for v in variables if self.db.get_embedding(v["id"]) is not None
        )

        return {
            "total_embeddings": section_embeddings
            + sentence_embeddings
            + variable_embeddings,
            "section_embeddings": section_embeddings,
            "sentence_embeddings": sentence_embeddings,
            "variable_embeddings": variable_embeddings,
            "embedding_dimension": self.embedding_dim,
            "embedding_model": "nomic-embed-text",
        }
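

# --- Usage sketch (illustrative only) ---
# A minimal driver showing how the class above might be wired together.
# `DocumentDB` is a hypothetical storage backend, assumed to implement
# the interface used throughout (get_sections_with_embeddings,
# get_sentences, get_variables, get_embedding, add_embedding,
# add_similarity). A local Ollama server with the nomic-embed-text
# model pulled is also assumed.
if __name__ == "__main__":
    from document_db import DocumentDB  # hypothetical module

    db = DocumentDB("corpus.sqlite")  # hypothetical constructor
    analyzer = SemanticAnalyzer(db)

    # Embed sections, sentences, and variables, then persist the
    # pairwise section similarities.
    analyzer.generate_all_embeddings()
    analyzer.compute_similarities(top_k=10)

    # Ad-hoc semantic search over the stored section embeddings.
    for hit in analyzer.find_similar_sections("sampling methodology", top_k=3):
        print(f"{hit['similarity']:.3f}  {hit['section']['id']}")

    # Group variables into semantic clusters and summarize coverage.
    clusters = analyzer.cluster_variables(n_clusters=5)
    centers = analyzer.compute_cluster_centers(clusters)
    print(analyzer.get_embeddings_metadata())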