# DocumentProcessor: parses a markdown-style policy document into
# sections, sentences, core values, and hard/soft constraints.

import re
import json
from typing import List, Dict, Tuple, Optional
from pathlib import Path
class DocumentProcessor:
    """Parse a markdown-style document into sections, sentences, and extracted variables."""

    def __init__(self, document_path: str):
        """Load the document at *document_path* into memory.

        Args:
            document_path: Filesystem path to the document to parse.

        Raises:
            OSError: If the file cannot be read.
        """
        self.document_path = document_path
        # Fix: read with an explicit UTF-8 encoding; the previous bare
        # read_text() used the platform locale default, which breaks on
        # non-UTF-8 setups (e.g. Windows cp1252).
        self.content = Path(document_path).read_text(encoding="utf-8")
        # Keep a line-indexed view for line-number bookkeeping in parse().
        self.lines = self.content.split("\n")
def parse(self) -> List[Dict]:
    """Split the document into heading-delimited sections.

    Returns:
        A list of section dicts with ``id``, ``parent_id``, ``path``,
        ``hierarchy_level``, ``content`` and ``line_start``/``line_end``.

    Fix: the heading stack is now unwound by level. The original pushed
    every heading and never popped, so a ``##`` heading that followed a
    ``###`` heading was parented to the deeper subsection instead of the
    enclosing ``#`` document, and ``path`` inherited the wrong prefix.
    """
    sections: List[Dict] = []
    section_stack: List[Dict] = []
    for line_num, line in enumerate(self.lines, 1):
        stripped = line.strip()
        if stripped.startswith("#"):
            section_type, title, level = self._parse_heading(stripped)
            # Close every open section at the same or deeper level; its
            # text ends on the line just before this heading.
            while section_stack and section_stack[-1]["hierarchy_level"] >= level:
                section_stack.pop()["line_end"] = line_num - 1
            parent = section_stack[-1] if section_stack else None
            new_section = {
                "section_type": section_type,
                "title": title,
                "content": "",
                "line_start": line_num,
                "line_end": line_num,
                "hierarchy_level": level,
                "parent_id": parent["id"] if parent else None,
                "path": f"{parent['path']}/{title}" if parent else title,
            }
            new_section["id"] = len(sections) + 1
            sections.append(new_section)
            section_stack.append(new_section)
        elif stripped and section_stack:
            # Non-blank body line: attach to the innermost open section.
            # NOTE: blank lines are deliberately dropped from content.
            section_stack[-1]["content"] += line + "\n"
            section_stack[-1]["line_end"] = line_num
    # Any sections still open run to the end of the document (the
    # original only extended the innermost one, leaving ancestors short).
    for open_section in section_stack:
        open_section["line_end"] = len(self.lines)
    return sections
def _parse_heading(self, line: str) -> Tuple[str, str, int]:
if line.startswith("### "):
return "subsection", line[4:].strip(), 3
elif line.startswith("## "):
return "section", line[3:].strip(), 2
elif line.startswith("# "):
return "document", line[2:].strip(), 1
return "paragraph", line.strip(), 0
def extract_sentences(self, sections: List[Dict]) -> List[Dict]:
    """Split each section's content into per-sentence records.

    Args:
        sections: Section dicts as produced by ``parse()``.

    Returns:
        Dicts with ``section_id``, ``text``, ``sentence_number`` (1-based
        within the section) and a best-effort ``line_number``.

    Fix: content begins on the line *after* the heading, so enumeration
    now starts at ``line_start + 1``; the original assigned the heading's
    own line number to the first content line.

    NOTE(review): parse() drops blank lines from ``content``, so
    ``line_number`` can still drift for sections that contain blank
    lines — confirm against parse() if an exact mapping matters.
    """
    sentences: List[Dict] = []
    for section in sections:
        content = section.get("content", "")
        sentence_number = 0
        # First content line sits directly below the heading line.
        start_line = section.get("line_start", 1) + 1
        for line_num, line in enumerate(content.split("\n"), start_line):
            text = line.strip()
            if not text:
                continue
            for sent in self._segment_sentences(text):
                sentence_number += 1
                sentences.append(
                    {
                        "section_id": section["id"],
                        "text": sent,
                        "sentence_number": sentence_number,
                        "line_number": line_num,
                    }
                )
    return sentences
def _segment_sentences(self, text: str) -> List[str]:
sentence_endings = r"(?<=[.!?])\s+(?=[A-Z])"
sentences = re.split(sentence_endings, text)
return [s.strip() for s in sentences if s.strip()]
def extract_variables(self) -> List[Dict]:
    """Collect core values, constraints, and behavioral factors from the document.

    Returns:
        Variable dicts for every core value mentioned at least once,
        followed by extracted constraints and factors.
    """
    lowered = self.content.lower()
    core_values = (
        "broadly safe",
        "broadly ethical",
        "anthropic guidelines",
        "genuinely helpful",
        "honest",
        "respectful",
    )
    # One record per core value that actually appears in the text.
    variables: List[Dict] = [
        {
            "name": value,
            "category": "core_value",
            "priority_level": self._get_priority_level(value),
            "is_hard_constraint": False,
            "principal_assignment": "all",
            "frequency": lowered.count(value),
            "description": self._extract_definition(value),
        }
        for value in core_values
        if lowered.count(value) > 0
    ]
    variables += self._extract_constraints()
    variables += self._extract_factors()
    return variables
def _get_priority_level(self, value: str) -> Optional[int]:
priorities = {
"broadly safe": 1,
"broadly ethical": 2,
"anthropic guidelines": 3,
"genuinely helpful": 4,
}
return priorities.get(value.lower())
def _extract_definition(self, value: str) -> str:
pattern = rf"{re.escape(value)}[:\s]+([^.!?]*[.!?])"
match = re.search(pattern, self.content, re.IGNORECASE)
return match.group(1).strip() if match else ""
def _extract_constraints(self) -> List[Dict]:
constraints = []
patterns = [
(r"never\s+(?:to\s+)?([^,.!?]+)", "hard_constraint"),
(r"do\s+not\s+([^,.!?]+)", "soft_constraint"),
(r"avoid\s+([^,.!?]+)", "soft_constraint"),
(r"refrain\s+from\s+([^,.!?]+)", "soft_constraint"),
]
for pattern, constraint_type in patterns:
matches = re.finditer(pattern, self.content, re.IGNORECASE)
for match in matches:
constraint_text = match.group(1).strip()
if len(constraint_text) > 5:
constraints.append(
{
"name": constraint_text[:50],
"category": constraint_type,
"priority_level": None,
"is_hard_constraint": constraint_type == "hard_constraint",
"principal_assignment": "anthropic",
"frequency": 1,
"description": match.group(0),
}
)
return constraints
def _extract_factors(self) -> List[Dict]:
factors = [
"safety",
"ethics",
"helpfulness",
"honesty",
"transparency",
"respect",
"fairness",
"beneficence",
"non-maleficence",
"autonomy",
"justice",
"responsibility",
"accountability",
]
extracted = []
content_lower = self.content.lower()
for factor in factors:
count = content_lower.count(factor)
if count >= 3:
extracted.append(
{
"name": factor,
"category": "factor",
"priority_level": None,
"is_hard_constraint": False,
"principal_assignment": "all",
"frequency": count,
"description": f"Behavioral factor related to {factor}",
}
)
return extracted
def classify_constraints(self) -> List[Dict]:
    """Classify document sentences as hard or soft constraints.

    Returns:
        One dict per constraint-bearing sentence with ``type``,
        ``content``, ``section_id`` and ``sentence_id``.

    Fix: each sentence now yields at most one record. Previously a
    sentence matching both a hard cue ("never") and a soft cue
    ("should") was appended twice — once per scan loop; hard cues now
    take precedence and the soft scan is skipped on a hard match.
    Patterns are also compiled once, outside the sentence loop.
    """
    hard_cues = [
        re.compile(p)
        for p in (
            r"never\b",
            r"under no circumstances",
            r"absolutely",
            r"unconditionally",
        )
    ]
    soft_cues = [
        re.compile(p)
        for p in (
            r"prefer(?:ably|ed)",
            r"should(?:\s+not)?",
            r"ideally",
            r"generally",
        )
    ]
    constraints: List[Dict] = []
    for sent in self.extract_sentences(self.parse()):
        text = sent["text"].lower()
        # Hard wins; a sentence is never both hard and soft.
        if any(cue.search(text) for cue in hard_cues):
            kind = "hard_constraint"
        elif any(cue.search(text) for cue in soft_cues):
            kind = "soft_constraint"
        else:
            continue
        constraints.append(
            {
                "type": kind,
                "content": sent["text"],
                "section_id": sent["section_id"],
                "sentence_id": sent["sentence_number"],
            }
        )
    return constraints