#!/usr/bin/env python3
"""Run the Claude's Constitution analysis pipeline and export its data files."""
import sys
import json

import numpy as np
from pathlib import Path

from data_processor import DocumentProcessor
from db_manager import DatabaseManager
from quantitative import QuantitativeAnalyzer
from semantic_analyzer import SemanticAnalyzer
from metadata_builder import MetadataBuilder


def main():
    print("Starting Claude's Constitution Analysis...")

    # Locate the source document relative to this script.
    base_dir = Path(__file__).parent.parent.parent
    constitution_path = base_dir / "claudes-constitution.md"
    if not constitution_path.exists():
        print(f"Error: Constitution file not found at {constitution_path}")
        sys.exit(1)

    # Create the output directory (and any missing parents) before writing.
    analysis_dir = base_dir / "constitution_analysis"
    data_dir = analysis_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    db_path = data_dir / "constitution.db"

    print("\n1. Parsing document...")
    processor = DocumentProcessor(str(constitution_path))
    sections = processor.parse()
    sentences = processor.extract_sentences(sections)
    variables = processor.extract_variables()
    constraints = processor.classify_constraints()
    print(f" - Found {len(sections)} sections")
    print(f" - Found {len(sentences)} sentences")
    print(f" - Found {len(variables)} variables")
    print(f" - Found {len(constraints)} constraints")

    print("\n2. Building database...")
    db = DatabaseManager(str(db_path))
    db.create_tables()
    db.populate(sections, sentences, variables, constraints)
    print(" - Database created and populated")

    print("\n3. Performing quantitative analysis...")
    quant_analyzer = QuantitativeAnalyzer(db)
    tfidf_scores = quant_analyzer.compute_tfidf()
    centrality = quant_analyzer.compute_centrality()
    statistics = quant_analyzer.generate_statistics()
    section_stats = quant_analyzer.compute_section_statistics()
    with open(data_dir / "tfidf_scores.json", "w") as f:
        json.dump(tfidf_scores, f, indent=2)
    with open(data_dir / "centrality.json", "w") as f:
        json.dump(centrality, f, indent=2)
    with open(data_dir / "statistics.json", "w") as f:
        json.dump(statistics, f, indent=2)
    with open(data_dir / "section_stats.json", "w") as f:
        json.dump(section_stats, f, indent=2)
    print(" - Quantitative analysis complete")

    print("\n4. Generating semantic embeddings...")
    print("   (This may take several minutes...)")
    semantic_analyzer = SemanticAnalyzer(db)
    try:
        semantic_analyzer.generate_all_embeddings()
        semantic_analyzer.compute_similarities()
        clusters = semantic_analyzer.cluster_variables()
        with open(data_dir / "clusters.json", "w") as f:
            json.dump(clusters, f, indent=2)
        embeddings_meta = semantic_analyzer.get_embeddings_metadata()
        with open(data_dir / "embeddings_meta.json", "w") as f:
            json.dump(embeddings_meta, f, indent=2)
        print(" - Semantic embeddings generated")
    except Exception as e:
        # Embeddings are optional; the rest of the pipeline can still run without them.
        print(f" - Warning: Could not generate embeddings. Error: {e}")
        print(" - Continuing without embeddings...")

    print("\n5. Building metadata...")
    metadata_builder = MetadataBuilder(db, quant_analyzer, semantic_analyzer)
    variables_meta = metadata_builder.build_variable_metadata()
    sections_meta = metadata_builder.build_section_metadata()
    with open(data_dir / "variables.json", "w") as f:
        json.dump(convert_to_json_serializable(variables_meta), f, indent=2)
    with open(data_dir / "sections.json", "w") as f:
        json.dump(convert_to_json_serializable(sections_meta), f, indent=2)
    print(" - Metadata built and exported")

    print("\n6. Preparing web data...")
Preparing web data...") graph_data = build_graph_data(variables_meta, centrality) with open(data_dir / "graph.json", "w") as f: json.dump(graph_data, f, indent=2) charts_data = prepare_charts_data(statistics, sections_meta) with open(data_dir / "charts.json", "w") as f: json.dump(charts_data, f, indent=2) print(" - Web data prepared") db.close() print("\n" + "=" * 50) print("✓ Analysis complete!") print("=" * 50) print(f"\nData files created in {data_dir}:") for f in data_dir.glob("*.json"): size_kb = f.stat().st_size / 1024 print(f" - {f.name} ({size_kb:.1f} KB)") print(f" - constitution.db ({(db_path.stat().st_size / (1024 * 1024)):.1f} MB)") print(f"\nNext steps:") print(f" 1. Generate HTML interface:") print(f" python analysis/generate_html.py") print(f" 2. Open in browser:") print(f" firefox web/index.html") def convert_to_json_serializable(obj): if hasattr(obj, "dtype"): if isinstance(obj, (int, np.integer)): return int(obj) elif isinstance(obj, (float, np.floating)): return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() elif isinstance(obj, dict): return {k: convert_to_json_serializable(v) for k, v in obj.items()} elif isinstance(obj, list): return [convert_to_json_serializable(item) for item in obj] return obj def build_graph_data(variables_meta: list, centrality: dict) -> dict: nodes = [] edges = [] node_map = {} for var in variables_meta: node_id = f"var_{var['id']}" node_map[var["id"]] = node_id color = get_priority_color(var["priority_level"]) nodes.append( { "id": node_id, "name": var["name"], "category": var["category"], "priority_level": var["priority_level"], "is_hard_constraint": var["is_hard_constraint"], "frequency": var["frequency"], "coefficient": var["coefficient_score"], "color": color, "size": max(5, min(30, var["frequency"] / 2)), } ) for edge in centrality.get("edges", []): source_id = edge["source"] target_id = edge["target"] if source_id in node_map and target_id in node_map: edges.append( { "source": node_map[source_id], "target": node_map[target_id], "weight": edge["weight"], } ) return {"nodes": nodes, "edges": edges} def get_priority_color(priority_level: int) -> str: colors = {1: "#ef4444", 2: "#f59e0b", 3: "#10b981", 4: "#3b82f6", None: "#6b7280"} return colors.get(priority_level, "#6b7280") def prepare_charts_data(statistics: dict, sections_meta: list) -> dict: charts = { "priority_distribution": { "labels": ["Priority 1", "Priority 2", "Priority 3", "Priority 4"], "data": [ statistics["priority_distribution"].get("priority_1", 0), statistics["priority_distribution"].get("priority_2", 0), statistics["priority_distribution"].get("priority_3", 0), statistics["priority_distribution"].get("priority_4", 0), ], }, "constraint_distribution": { "labels": ["Hard Constraints", "Soft Constraints"], "data": [ statistics["constraint_distribution"].get("hard", 0), statistics["constraint_distribution"].get("soft", 0), ], }, "variable_categories": { "labels": list(statistics.get("variable_categories", {}).keys()), "data": list(statistics.get("variable_categories", {}).values()), }, "sentence_length_distribution": { "min": statistics["sentence_length_stats"]["min"], "max": statistics["sentence_length_stats"]["max"], "mean": statistics["sentence_length_stats"]["mean"], "median": statistics["sentence_length_stats"]["median"], }, "overview_stats": { "total_variables": statistics["total_variables"], "total_sentences": statistics["sentences"], "total_tokens": statistics["total_tokens"], "unique_tokens": statistics["unique_tokens"], "avg_sentence_length": 
statistics["avg_sentence_length"], }, } return charts if __name__ == "__main__": main()