236 lines
8.1 KiB
Python
236 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
import sys
|
|
import json
|
|
import os
|
|
import numpy as np
|
|
from pathlib import Path
|
|
|
|
from data_processor import DocumentProcessor
|
|
from db_manager import DatabaseManager
|
|
from quantitative import QuantitativeAnalyzer
|
|
from semantic_analyzer import SemanticAnalyzer
|
|
from metadata_builder import MetadataBuilder
|
|
|
|
|
|
def _dump_json(path, payload):
    """Write *payload* to *path* as pretty-printed (indent=2) JSON."""
    with open(path, "w") as fh:
        json.dump(payload, fh, indent=2)


def main():
    """Run the end-to-end constitution analysis pipeline.

    Steps:
      1. Parse the markdown document into sections, sentences,
         variables, and constraints.
      2. Build and populate the SQLite database.
      3. Run quantitative analysis (TF-IDF, centrality, statistics)
         and export the results as JSON.
      4. Generate semantic embeddings — best-effort: any failure is
         reported and the pipeline continues without them.
      5. Build and export variable/section metadata.
      6. Prepare graph/chart JSON for the web interface.

    Exits with status 1 if the constitution document is missing.
    """
    print("Starting Claude's Constitution Analysis...")

    base_dir = Path(__file__).parent.parent.parent
    constitution_path = base_dir / "claudes-constitution.md"

    if not constitution_path.exists():
        print(f"Error: Constitution file not found at {constitution_path}")
        sys.exit(1)

    analysis_dir = base_dir / "constitution_analysis"
    data_dir = analysis_dir / "data"
    # parents=True: also create constitution_analysis/ on a fresh checkout.
    # The previous exist_ok=True alone raised FileNotFoundError when the
    # parent directory did not exist yet.
    data_dir.mkdir(parents=True, exist_ok=True)

    db_path = data_dir / "constitution.db"

    # --- Step 1: parse the source document ---
    print("\n1. Parsing document...")
    processor = DocumentProcessor(str(constitution_path))
    sections = processor.parse()
    sentences = processor.extract_sentences(sections)
    variables = processor.extract_variables()
    constraints = processor.classify_constraints()

    print(f" - Found {len(sections)} sections")
    print(f" - Found {len(sentences)} sentences")
    print(f" - Found {len(variables)} variables")
    print(f" - Found {len(constraints)} constraints")

    # --- Step 2: persist parsed data ---
    print("\n2. Building database...")
    db = DatabaseManager(str(db_path))
    db.create_tables()
    db.populate(sections, sentences, variables, constraints)
    print(" - Database created and populated")

    # --- Step 3: quantitative analysis ---
    print("\n3. Performing quantitative analysis...")
    quant_analyzer = QuantitativeAnalyzer(db)
    tfidf_scores = quant_analyzer.compute_tfidf()
    centrality = quant_analyzer.compute_centrality()
    statistics = quant_analyzer.generate_statistics()
    section_stats = quant_analyzer.compute_section_statistics()

    _dump_json(data_dir / "tfidf_scores.json", tfidf_scores)
    _dump_json(data_dir / "centrality.json", centrality)
    _dump_json(data_dir / "statistics.json", statistics)
    _dump_json(data_dir / "section_stats.json", section_stats)

    print(" - Quantitative analysis complete")

    # --- Step 4: semantic embeddings (optional / best-effort) ---
    print("\n4. Generating semantic embeddings...")
    print(" (This may take several minutes...)")
    semantic_analyzer = SemanticAnalyzer(db)

    try:
        semantic_analyzer.generate_all_embeddings()
        semantic_analyzer.compute_similarities()

        clusters = semantic_analyzer.cluster_variables()
        _dump_json(data_dir / "clusters.json", clusters)

        embeddings_meta = semantic_analyzer.get_embeddings_metadata()
        _dump_json(data_dir / "embeddings_meta.json", embeddings_meta)

        print(" - Semantic embeddings generated")
    except Exception as e:
        # Embeddings are optional; warn and carry on with the rest.
        print(f" - Warning: Could not generate embeddings. Error: {e}")
        print(" - Continuing without embeddings...")

    # --- Step 5: metadata export ---
    print("\n5. Building metadata...")
    metadata_builder = MetadataBuilder(db, quant_analyzer, semantic_analyzer)
    variables_meta = metadata_builder.build_variable_metadata()
    sections_meta = metadata_builder.build_section_metadata()

    # Metadata may contain numpy scalars/arrays; coerce before dumping.
    _dump_json(data_dir / "variables.json", convert_to_json_serializable(variables_meta))
    _dump_json(data_dir / "sections.json", convert_to_json_serializable(sections_meta))

    print(" - Metadata built and exported")

    # --- Step 6: web-facing data ---
    print("\n6. Preparing web data...")
    graph_data = build_graph_data(variables_meta, centrality)
    _dump_json(data_dir / "graph.json", graph_data)

    charts_data = prepare_charts_data(statistics, sections_meta)
    _dump_json(data_dir / "charts.json", charts_data)

    print(" - Web data prepared")

    db.close()

    # --- Summary report ---
    print("\n" + "=" * 50)
    print("✓ Analysis complete!")
    print("=" * 50)
    print(f"\nData files created in {data_dir}:")
    for json_file in data_dir.glob("*.json"):
        size_kb = json_file.stat().st_size / 1024
        print(f" - {json_file.name} ({size_kb:.1f} KB)")
    print(f" - constitution.db ({(db_path.stat().st_size / (1024 * 1024)):.1f} MB)")
    print(f"\nNext steps:")
    print(f" 1. Generate HTML interface:")
    print(f" python analysis/generate_html.py")
    print(f" 2. Open in browser:")
    print(f" firefox web/index.html")
|
def convert_to_json_serializable(obj):
    """Recursively coerce numpy values to JSON-serializable built-ins.

    Handles numpy integer/float/bool scalars, any other numpy scalar
    (via ``.item()``), ndarrays (to nested lists), and recurses into
    dicts, lists, and tuples. Anything else is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        # np.bool_ carries a dtype but matched neither integer nor floating
        # in the old code, so it fell through and crashed json.dump later.
        return bool(obj)
    if isinstance(obj, np.generic):
        # Any remaining numpy scalar: unwrap to its Python equivalent.
        return obj.item()
    if isinstance(obj, dict):
        return {k: convert_to_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples were previously left untouched, so numpy scalars nested
        # inside them escaped conversion; JSON renders both as arrays.
        return [convert_to_json_serializable(item) for item in obj]
    return obj
|
def build_graph_data(variables_meta: list, centrality: dict) -> dict:
    """Assemble the node/edge graph consumed by the web visualization.

    Each variable becomes a node (id prefixed with ``var_``) with display
    attributes; centrality edges are kept only when both endpoints map to
    a known node. Returns ``{"nodes": [...], "edges": [...]}``.
    """
    # Priority level -> hex color; unknown/missing levels get neutral gray.
    priority_palette = {1: "#ef4444", 2: "#f59e0b", 3: "#10b981", 4: "#3b82f6"}

    nodes = []
    node_map = {}
    for var in variables_meta:
        var_id = var["id"]
        graph_id = f"var_{var_id}"
        node_map[var_id] = graph_id
        nodes.append(
            {
                "id": graph_id,
                "name": var["name"],
                "category": var["category"],
                "priority_level": var["priority_level"],
                "is_hard_constraint": var["is_hard_constraint"],
                "frequency": var["frequency"],
                "coefficient": var["coefficient_score"],
                "color": priority_palette.get(var["priority_level"], "#6b7280"),
                # Node size scales with frequency, clamped to [5, 30].
                "size": max(5, min(30, var["frequency"] / 2)),
            }
        )

    # Keep only edges whose endpoints both exist in the node map.
    edges = [
        {
            "source": node_map[edge["source"]],
            "target": node_map[edge["target"]],
            "weight": edge["weight"],
        }
        for edge in centrality.get("edges", [])
        if edge["source"] in node_map and edge["target"] in node_map
    ]

    return {"nodes": nodes, "edges": edges}
|
def get_priority_color(priority_level: int) -> str:
    """Map a priority level (1-4) to its hex display color.

    Any other value — including ``None`` — falls back to neutral gray.
    """
    palette = {
        1: "#ef4444",
        2: "#f59e0b",
        3: "#10b981",
        4: "#3b82f6",
    }
    return palette.get(priority_level, "#6b7280")
|
def prepare_charts_data(statistics: dict, sections_meta: list) -> dict:
    """Shape analysis statistics into chart-ready label/data series.

    NOTE(review): ``sections_meta`` is currently unused by the body; it is
    kept in the signature for interface stability with callers.
    """
    priorities = statistics["priority_distribution"]
    constraints = statistics["constraint_distribution"]
    categories = statistics.get("variable_categories", {})
    lengths = statistics["sentence_length_stats"]

    return {
        "priority_distribution": {
            "labels": [f"Priority {level}" for level in range(1, 5)],
            "data": [priorities.get(f"priority_{level}", 0) for level in range(1, 5)],
        },
        "constraint_distribution": {
            "labels": ["Hard Constraints", "Soft Constraints"],
            "data": [constraints.get("hard", 0), constraints.get("soft", 0)],
        },
        "variable_categories": {
            "labels": list(categories.keys()),
            "data": list(categories.values()),
        },
        "sentence_length_distribution": {
            key: lengths[key] for key in ("min", "max", "mean", "median")
        },
        "overview_stats": {
            "total_variables": statistics["total_variables"],
            "total_sentences": statistics["sentences"],
            "total_tokens": statistics["total_tokens"],
            "unique_tokens": statistics["unique_tokens"],
            "avg_sentence_length": statistics["avg_sentence_length"],
        },
    }
|
# Script entry point: run the full analysis pipeline.
if __name__ == "__main__":
    main()
|