
#!/usr/bin/env python3
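"""Run the full analysis pipeline for claudes-constitution.md.

Parses the document into sections, sentences, variables, and constraints;
stores them in a SQLite database; runs quantitative (TF-IDF, centrality,
statistics) and semantic (embeddings, clustering) analyses; and exports
JSON data files consumed by the web interface.
"""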
import sys
import json
import os
import numpy as np
from pathlib import Path
from data_processor import DocumentProcessor
from db_manager import DatabaseManager
from quantitative import QuantitativeAnalyzer
from semantic_analyzer import SemanticAnalyzer
from metadata_builder import MetadataBuilder


def main():
    print("Starting Claude's Constitution Analysis...")

    base_dir = Path(__file__).parent.parent.parent
    constitution_path = base_dir / "claudes-constitution.md"
    if not constitution_path.exists():
        print(f"Error: Constitution file not found at {constitution_path}")
        sys.exit(1)
    analysis_dir = base_dir / "constitution_analysis"
    data_dir = analysis_dir / "data"
    # parents=True so a fresh checkout doesn't fail on the missing parent dir.
    data_dir.mkdir(parents=True, exist_ok=True)
    db_path = data_dir / "constitution.db"
print("\n1. Parsing document...")
processor = DocumentProcessor(str(constitution_path))
sections = processor.parse()
sentences = processor.extract_sentences(sections)
variables = processor.extract_variables()
constraints = processor.classify_constraints()
print(f" - Found {len(sections)} sections")
print(f" - Found {len(sentences)} sentences")
print(f" - Found {len(variables)} variables")
print(f" - Found {len(constraints)} constraints")
print("\n2. Building database...")
db = DatabaseManager(str(db_path))
db.create_tables()
db.populate(sections, sentences, variables, constraints)
print(" - Database created and populated")
print("\n3. Performing quantitative analysis...")
quant_analyzer = QuantitativeAnalyzer(db)
tfidf_scores = quant_analyzer.compute_tfidf()
centrality = quant_analyzer.compute_centrality()
statistics = quant_analyzer.generate_statistics()
section_stats = quant_analyzer.compute_section_statistics()
with open(data_dir / "tfidf_scores.json", "w") as f:
json.dump(tfidf_scores, f, indent=2)
with open(data_dir / "centrality.json", "w") as f:
json.dump(centrality, f, indent=2)
with open(data_dir / "statistics.json", "w") as f:
json.dump(statistics, f, indent=2)
with open(data_dir / "section_stats.json", "w") as f:
json.dump(section_stats, f, indent=2)
print(" - Quantitative analysis complete")
print("\n4. Generating semantic embeddings...")
print(" (This may take several minutes...)")
semantic_analyzer = SemanticAnalyzer(db)
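    # Best-effort: embedding generation can fail (e.g. if a model dependency
    # is unavailable), in which case the rest of the pipeline still runs.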
    try:
        semantic_analyzer.generate_all_embeddings()
        semantic_analyzer.compute_similarities()
        clusters = semantic_analyzer.cluster_variables()
        with open(data_dir / "clusters.json", "w") as f:
            json.dump(clusters, f, indent=2)
        embeddings_meta = semantic_analyzer.get_embeddings_metadata()
        with open(data_dir / "embeddings_meta.json", "w") as f:
            json.dump(embeddings_meta, f, indent=2)
        print(" - Semantic embeddings generated")
    except Exception as e:
        print(f" - Warning: Could not generate embeddings. Error: {e}")
        print(" - Continuing without embeddings...")
print("\n5. Building metadata...")
metadata_builder = MetadataBuilder(db, quant_analyzer, semantic_analyzer)
variables_meta = metadata_builder.build_variable_metadata()
sections_meta = metadata_builder.build_section_metadata()
with open(data_dir / "variables.json", "w") as f:
json.dump(convert_to_json_serializable(variables_meta), f, indent=2)
with open(data_dir / "sections.json", "w") as f:
json.dump(convert_to_json_serializable(sections_meta), f, indent=2)
print(" - Metadata built and exported")
print("\n6. Preparing web data...")
graph_data = build_graph_data(variables_meta, centrality)
with open(data_dir / "graph.json", "w") as f:
json.dump(graph_data, f, indent=2)
charts_data = prepare_charts_data(statistics, sections_meta)
with open(data_dir / "charts.json", "w") as f:
json.dump(charts_data, f, indent=2)
print(" - Web data prepared")
    db.close()

    print("\n" + "=" * 50)
    print("✓ Analysis complete!")
    print("=" * 50)
    print(f"\nData files created in {data_dir}:")
    for f in data_dir.glob("*.json"):
        size_kb = f.stat().st_size / 1024
        print(f" - {f.name} ({size_kb:.1f} KB)")
    print(f" - constitution.db ({(db_path.stat().st_size / (1024 * 1024)):.1f} MB)")
print(f"\nNext steps:")
print(f" 1. Generate HTML interface:")
print(f" python analysis/generate_html.py")
print(f" 2. Open in browser:")
print(f" firefox web/index.html")
def convert_to_json_serializable(obj):
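    """Recursively convert NumPy scalars and arrays into JSON-serializable
    Python types; anything else is returned unchanged."""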
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {k: convert_to_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_json_serializable(item) for item in obj]
    return obj

def build_graph_data(variables_meta: list, centrality: dict) -> dict:
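    """Build the node/edge graph rendered by the web interface.

    Nodes are variables (sized by frequency, colored by priority level);
    edges come from the centrality results, keeping only those whose
    endpoints are both known variables.
    """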
    nodes = []
    edges = []
    node_map = {}
    for var in variables_meta:
        node_id = f"var_{var['id']}"
        node_map[var["id"]] = node_id
        color = get_priority_color(var["priority_level"])
        nodes.append(
            {
                "id": node_id,
                "name": var["name"],
                "category": var["category"],
                "priority_level": var["priority_level"],
                "is_hard_constraint": var["is_hard_constraint"],
                "frequency": var["frequency"],
                "coefficient": var["coefficient_score"],
                "color": color,
                # Clamp node size to [5, 30], scaled with frequency.
                "size": max(5, min(30, var["frequency"] / 2)),
            }
        )
    # Drop edges that reference variables missing from the metadata.
    for edge in centrality.get("edges", []):
        source_id = edge["source"]
        target_id = edge["target"]
        if source_id in node_map and target_id in node_map:
            edges.append(
                {
                    "source": node_map[source_id],
                    "target": node_map[target_id],
                    "weight": edge["weight"],
                }
            )
    return {"nodes": nodes, "edges": edges}

def get_priority_color(priority_level: int) -> str:
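    """Map a priority level (1-4) to a hex color; gray for anything else."""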
    colors = {1: "#ef4444", 2: "#f59e0b", 3: "#10b981", 4: "#3b82f6"}
    return colors.get(priority_level, "#6b7280")

def prepare_charts_data(statistics: dict, sections_meta: list) -> dict:
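    """Assemble the chart payloads (distributions and overview stats)
    consumed by the web interface from the precomputed statistics."""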
    charts = {
        "priority_distribution": {
            "labels": ["Priority 1", "Priority 2", "Priority 3", "Priority 4"],
            "data": [
                statistics["priority_distribution"].get("priority_1", 0),
                statistics["priority_distribution"].get("priority_2", 0),
                statistics["priority_distribution"].get("priority_3", 0),
                statistics["priority_distribution"].get("priority_4", 0),
            ],
        },
        "constraint_distribution": {
            "labels": ["Hard Constraints", "Soft Constraints"],
            "data": [
                statistics["constraint_distribution"].get("hard", 0),
                statistics["constraint_distribution"].get("soft", 0),
            ],
        },
        "variable_categories": {
            "labels": list(statistics.get("variable_categories", {}).keys()),
            "data": list(statistics.get("variable_categories", {}).values()),
        },
        "sentence_length_distribution": {
            "min": statistics["sentence_length_stats"]["min"],
            "max": statistics["sentence_length_stats"]["max"],
            "mean": statistics["sentence_length_stats"]["mean"],
            "median": statistics["sentence_length_stats"]["median"],
        },
        "overview_stats": {
            "total_variables": statistics["total_variables"],
            "total_sentences": statistics["sentences"],
            "total_tokens": statistics["total_tokens"],
            "unique_tokens": statistics["unique_tokens"],
            "avg_sentence_length": statistics["avg_sentence_length"],
        },
    }
    return charts


if __name__ == "__main__":
    main()