# Extraction metadata (was duplicated viewer residue): 253 lines, 8.4 KiB, Python
import re
|
|
import json
|
|
from typing import List, Dict, Tuple, Optional
|
|
from pathlib import Path
|
|
|
|
|
|
class DocumentProcessor:
    """Parse a markdown-style document into sections, sentences, and
    extracted "variables" (core values, constraints, behavioral factors)."""

    def __init__(self, document_path: str):
        """Load the document at *document_path* eagerly into memory.

        Stores the raw text in ``self.content`` and the newline-split lines
        in ``self.lines``.
        NOTE(review): ``read_text()`` uses the platform default encoding —
        confirm inputs are UTF-8 or pass an explicit encoding upstream.
        """
        self.document_path = document_path
        self.content = Path(document_path).read_text()
        self.lines = self.content.split("\n")
def parse(self) -> List[Dict]:
|
|
sections = []
|
|
section_stack = []
|
|
current_section = None
|
|
|
|
for line_num, line in enumerate(self.lines, 1):
|
|
stripped = line.strip()
|
|
|
|
if stripped.startswith("#"):
|
|
section_type, title, level = self._parse_heading(stripped)
|
|
|
|
new_section = {
|
|
"section_type": section_type,
|
|
"title": title,
|
|
"content": "",
|
|
"line_start": line_num,
|
|
"line_end": line_num,
|
|
"hierarchy_level": level,
|
|
"parent_id": None,
|
|
"path": title,
|
|
}
|
|
|
|
if section_stack:
|
|
current_section = section_stack[-1]
|
|
new_section["parent_id"] = current_section.get("id")
|
|
new_section["path"] = f"{current_section['path']}/{title}"
|
|
current_section["line_end"] = line_num - 1
|
|
|
|
new_section["id"] = len(sections) + 1
|
|
sections.append(new_section)
|
|
section_stack.append(new_section)
|
|
|
|
elif stripped and section_stack:
|
|
section_stack[-1]["content"] += line + "\n"
|
|
section_stack[-1]["line_end"] = line_num
|
|
|
|
if section_stack:
|
|
section_stack[-1]["line_end"] = len(self.lines)
|
|
|
|
return sections
|
|
|
|
def _parse_heading(self, line: str) -> Tuple[str, str, int]:
|
|
if line.startswith("### "):
|
|
return "subsection", line[4:].strip(), 3
|
|
elif line.startswith("## "):
|
|
return "section", line[3:].strip(), 2
|
|
elif line.startswith("# "):
|
|
return "document", line[2:].strip(), 1
|
|
return "paragraph", line.strip(), 0
|
|
|
|
def extract_sentences(self, sections: List[Dict]) -> List[Dict]:
|
|
sentences = []
|
|
|
|
for section in sections:
|
|
content = section.get("content", "")
|
|
lines = content.split("\n")
|
|
|
|
sentence_number = 0
|
|
for line_num, line in enumerate(lines, section.get("line_start", 1)):
|
|
if line.strip():
|
|
sents = self._segment_sentences(line.strip())
|
|
for sent in sents:
|
|
sentence_number += 1
|
|
sentences.append(
|
|
{
|
|
"section_id": section["id"],
|
|
"text": sent,
|
|
"sentence_number": sentence_number,
|
|
"line_number": line_num,
|
|
}
|
|
)
|
|
|
|
return sentences
|
|
|
|
def _segment_sentences(self, text: str) -> List[str]:
|
|
sentence_endings = r"(?<=[.!?])\s+(?=[A-Z])"
|
|
sentences = re.split(sentence_endings, text)
|
|
return [s.strip() for s in sentences if s.strip()]
|
|
|
|
def extract_variables(self) -> List[Dict]:
|
|
variables = []
|
|
content = self.content.lower()
|
|
|
|
core_values = [
|
|
"broadly safe",
|
|
"broadly ethical",
|
|
"anthropic guidelines",
|
|
"genuinely helpful",
|
|
"honest",
|
|
"respectful",
|
|
]
|
|
|
|
for value in core_values:
|
|
count = content.count(value)
|
|
if count > 0:
|
|
variables.append(
|
|
{
|
|
"name": value,
|
|
"category": "core_value",
|
|
"priority_level": self._get_priority_level(value),
|
|
"is_hard_constraint": False,
|
|
"principal_assignment": "all",
|
|
"frequency": count,
|
|
"description": self._extract_definition(value),
|
|
}
|
|
)
|
|
|
|
constraints = self._extract_constraints()
|
|
variables.extend(constraints)
|
|
|
|
factors = self._extract_factors()
|
|
variables.extend(factors)
|
|
|
|
return variables
|
|
|
|
def _get_priority_level(self, value: str) -> Optional[int]:
|
|
priorities = {
|
|
"broadly safe": 1,
|
|
"broadly ethical": 2,
|
|
"anthropic guidelines": 3,
|
|
"genuinely helpful": 4,
|
|
}
|
|
return priorities.get(value.lower())
|
|
|
|
def _extract_definition(self, value: str) -> str:
|
|
pattern = rf"{re.escape(value)}[:\s]+([^.!?]*[.!?])"
|
|
match = re.search(pattern, self.content, re.IGNORECASE)
|
|
return match.group(1).strip() if match else ""
|
|
|
|
def _extract_constraints(self) -> List[Dict]:
|
|
constraints = []
|
|
patterns = [
|
|
(r"never\s+(?:to\s+)?([^,.!?]+)", "hard_constraint"),
|
|
(r"do\s+not\s+([^,.!?]+)", "soft_constraint"),
|
|
(r"avoid\s+([^,.!?]+)", "soft_constraint"),
|
|
(r"refrain\s+from\s+([^,.!?]+)", "soft_constraint"),
|
|
]
|
|
|
|
for pattern, constraint_type in patterns:
|
|
matches = re.finditer(pattern, self.content, re.IGNORECASE)
|
|
for match in matches:
|
|
constraint_text = match.group(1).strip()
|
|
if len(constraint_text) > 5:
|
|
constraints.append(
|
|
{
|
|
"name": constraint_text[:50],
|
|
"category": constraint_type,
|
|
"priority_level": None,
|
|
"is_hard_constraint": constraint_type == "hard_constraint",
|
|
"principal_assignment": "anthropic",
|
|
"frequency": 1,
|
|
"description": match.group(0),
|
|
}
|
|
)
|
|
|
|
return constraints
|
|
|
|
def _extract_factors(self) -> List[Dict]:
|
|
factors = [
|
|
"safety",
|
|
"ethics",
|
|
"helpfulness",
|
|
"honesty",
|
|
"transparency",
|
|
"respect",
|
|
"fairness",
|
|
"beneficence",
|
|
"non-maleficence",
|
|
"autonomy",
|
|
"justice",
|
|
"responsibility",
|
|
"accountability",
|
|
]
|
|
|
|
extracted = []
|
|
content_lower = self.content.lower()
|
|
|
|
for factor in factors:
|
|
count = content_lower.count(factor)
|
|
if count >= 3:
|
|
extracted.append(
|
|
{
|
|
"name": factor,
|
|
"category": "factor",
|
|
"priority_level": None,
|
|
"is_hard_constraint": False,
|
|
"principal_assignment": "all",
|
|
"frequency": count,
|
|
"description": f"Behavioral factor related to {factor}",
|
|
}
|
|
)
|
|
|
|
return extracted
|
|
|
|
def classify_constraints(self) -> List[Dict]:
|
|
hard_patterns = [
|
|
r"never\b",
|
|
r"under no circumstances",
|
|
r"absolutely",
|
|
r"unconditionally",
|
|
]
|
|
|
|
soft_patterns = [
|
|
r"prefer(?:ably|ed)",
|
|
r"should(?:\s+not)?",
|
|
r"ideally",
|
|
r"generally",
|
|
]
|
|
|
|
constraints = []
|
|
sentences = self.extract_sentences(self.parse())
|
|
|
|
for sent in sentences:
|
|
text = sent["text"].lower()
|
|
|
|
for pattern in hard_patterns:
|
|
if re.search(pattern, text):
|
|
constraints.append(
|
|
{
|
|
"type": "hard_constraint",
|
|
"content": sent["text"],
|
|
"section_id": sent["section_id"],
|
|
"sentence_id": sent["sentence_number"],
|
|
}
|
|
)
|
|
break
|
|
|
|
for pattern in soft_patterns:
|
|
if re.search(pattern, text):
|
|
constraints.append(
|
|
{
|
|
"type": "soft_constraint",
|
|
"content": sent["text"],
|
|
"section_id": sent["section_id"],
|
|
"sentence_id": sent["sentence_number"],
|
|
}
|
|
)
|
|
break
|
|
|
|
return constraints
|