import re
import json
from typing import List, Dict, Tuple, Optional
from pathlib import Path


class DocumentProcessor:
    """Parse a markdown-style document into heading-delimited sections,
    segment sentences, and extract policy-style variables/constraints via
    regex heuristics."""

    def __init__(self, document_path: str):
        self.document_path = document_path
        # Whole document as one string, and a parallel list of lines
        # (1-based positions are produced by enumerate(..., 1) downstream).
        self.content = Path(document_path).read_text()
        self.lines = self.content.split("\n")

    def parse(self) -> List[Dict]:
        """Split the document into sections at '#'/'##'/'###' headings.

        Returns a list of section dicts: id, section_type, title, content,
        line_start/line_end, hierarchy_level, parent_id, path.

        BUG FIX: the original never popped the section stack, so a sibling
        or shallower heading was recorded as a *child* of the immediately
        preceding heading (wrong parent_id and path), and ancestor sections
        never had line_end closed.  We now pop every open section whose
        hierarchy_level is >= the new heading's level before nesting.
        """
        sections: List[Dict] = []
        section_stack: List[Dict] = []  # currently-open sections, outermost first

        for line_num, line in enumerate(self.lines, 1):
            stripped = line.strip()
            if stripped.startswith("#"):
                section_type, title, level = self._parse_heading(stripped)
                # Close every section the new heading cannot nest under.
                while section_stack and section_stack[-1]["hierarchy_level"] >= level:
                    closed = section_stack.pop()
                    closed["line_end"] = max(closed["line_end"], line_num - 1)

                parent = section_stack[-1] if section_stack else None
                new_section = {
                    "section_type": section_type,
                    "title": title,
                    "content": "",
                    "line_start": line_num,
                    "line_end": line_num,
                    "hierarchy_level": level,
                    "parent_id": parent.get("id") if parent else None,
                    "path": f"{parent['path']}/{title}" if parent else title,
                }
                new_section["id"] = len(sections) + 1  # sequential, 1-based
                sections.append(new_section)
                section_stack.append(new_section)
            elif stripped and section_stack:
                # Non-blank body line: attach to the innermost open section.
                # NOTE: blank lines are deliberately not kept in content.
                section_stack[-1]["content"] += line + "\n"
                section_stack[-1]["line_end"] = line_num

        # Sections still open at EOF (including ancestors) run to the last line.
        for open_section in section_stack:
            open_section["line_end"] = len(self.lines)
        return sections

    def _parse_heading(self, line: str) -> Tuple[str, str, int]:
        """Map '# '/'## '/'### ' prefixes to (section_type, title, level).

        Anything else that reached here (e.g. '####' or '#no-space') falls
        through to ('paragraph', text, 0), preserving the original classifier.
        """
        if line.startswith("### "):
            return "subsection", line[4:].strip(), 3
        if line.startswith("## "):
            return "section", line[3:].strip(), 2
        if line.startswith("# "):
            return "document", line[2:].strip(), 1
        return "paragraph", line.strip(), 0

    def extract_sentences(self, sections: List[Dict]) -> List[Dict]:
        """Segment each section's content into per-sentence dicts
        (section_id, text, sentence_number within the section, line_number).

        BUG FIX: content begins on the line *after* the heading, but the
        original enumerated from line_start (the heading line itself), so
        every line_number was off by one.  NOTE(review): blank lines are
        not preserved in section content (see parse), so line_number is
        still approximate for sections containing interior blank lines.
        """
        sentences: List[Dict] = []
        for section in sections:
            content_lines = section.get("content", "").split("\n")
            sentence_number = 0
            first_line = section.get("line_start", 1) + 1  # heading occupies line_start
            for line_num, raw in enumerate(content_lines, first_line):
                text = raw.strip()
                if not text:
                    continue
                for sent in self._segment_sentences(text):
                    sentence_number += 1
                    sentences.append(
                        {
                            "section_id": section["id"],
                            "text": sent,
                            "sentence_number": sentence_number,
                            "line_number": line_num,
                        }
                    )
        return sentences

    def _segment_sentences(self, text: str) -> List[str]:
        """Naive sentence splitter: break after ./!/? when followed by
        whitespace and an uppercase letter; drop empty fragments."""
        parts = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
        return [p.strip() for p in parts if p.strip()]

    def extract_variables(self) -> List[Dict]:
        """Collect core-value mentions, prohibition constraints, and
        behavioral factors from the document as variable dicts."""
        variables: List[Dict] = []
        content = self.content.lower()
        core_values = [
            "broadly safe",
            "broadly ethical",
            "anthropic guidelines",
            "genuinely helpful",
            "honest",
            "respectful",
        ]
        for value in core_values:
            # Case-insensitive substring frequency; only record values present.
            count = content.count(value)
            if count > 0:
                variables.append(
                    {
                        "name": value,
                        "category": "core_value",
                        "priority_level": self._get_priority_level(value),
                        "is_hard_constraint": False,
                        "principal_assignment": "all",
                        "frequency": count,
                        "description": self._extract_definition(value),
                    }
                )
        variables.extend(self._extract_constraints())
        variables.extend(self._extract_factors())
        return variables

    def _get_priority_level(self, value: str) -> Optional[int]:
        """Fixed priority ranking for the four ordered core values;
        None for values without a defined priority."""
        priorities = {
            "broadly safe": 1,
            "broadly ethical": 2,
            "anthropic guidelines": 3,
            "genuinely helpful": 4,
        }
        return priorities.get(value.lower())

    def _extract_definition(self, value: str) -> str:
        """Return the first sentence-like span following `value` (after a
        colon or whitespace) in the document, or '' if none is found."""
        pattern = rf"{re.escape(value)}[:\s]+([^.!?]*[.!?])"
        match = re.search(pattern, self.content, re.IGNORECASE)
        return match.group(1).strip() if match else ""

    def _extract_constraints(self) -> List[Dict]:
        """Find prohibition phrases ('never ...', 'do not ...', 'avoid ...',
        'refrain from ...') and wrap each match as a constraint variable.
        Only 'never ...' is treated as a hard constraint."""
        constraints: List[Dict] = []
        patterns = [
            (r"never\s+(?:to\s+)?([^,.!?]+)", "hard_constraint"),
            (r"do\s+not\s+([^,.!?]+)", "soft_constraint"),
            (r"avoid\s+([^,.!?]+)", "soft_constraint"),
            (r"refrain\s+from\s+([^,.!?]+)", "soft_constraint"),
        ]
        for pattern, constraint_type in patterns:
            for match in re.finditer(pattern, self.content, re.IGNORECASE):
                constraint_text = match.group(1).strip()
                if len(constraint_text) > 5:  # skip trivial fragments
                    constraints.append(
                        {
                            "name": constraint_text[:50],
                            "category": constraint_type,
                            "priority_level": None,
                            "is_hard_constraint": constraint_type == "hard_constraint",
                            "principal_assignment": "anthropic",
                            "frequency": 1,
                            "description": match.group(0),
                        }
                    )
        return constraints

    def _extract_factors(self) -> List[Dict]:
        """Report behavioral-factor keywords occurring at least 3 times.

        Uses case-insensitive *substring* counting, so e.g. 'respect' also
        counts occurrences inside 'respectful' (original behavior kept).
        """
        factors = [
            "safety",
            "ethics",
            "helpfulness",
            "honesty",
            "transparency",
            "respect",
            "fairness",
            "beneficence",
            "non-maleficence",
            "autonomy",
            "justice",
            "responsibility",
            "accountability",
        ]
        extracted: List[Dict] = []
        content_lower = self.content.lower()
        for factor in factors:
            count = content_lower.count(factor)
            if count >= 3:
                extracted.append(
                    {
                        "name": factor,
                        "category": "factor",
                        "priority_level": None,
                        "is_hard_constraint": False,
                        "principal_assignment": "all",
                        "frequency": count,
                        "description": f"Behavioral factor related to {factor}",
                    }
                )
        return extracted

    def classify_constraints(self) -> List[Dict]:
        """Label each extracted sentence as a hard or soft constraint based
        on modal-language cues; sentences matching neither are skipped.

        BUG FIX: the original appended a sentence twice when it matched
        both a hard and a soft pattern (e.g. 'should never ...'); hard cues
        now take precedence and each sentence is classified at most once.
        """
        hard_patterns = [
            r"never\b",
            r"under no circumstances",
            r"absolutely",
            r"unconditionally",
        ]
        soft_patterns = [
            r"prefer(?:ably|ed)",
            r"should(?:\s+not)?",
            r"ideally",
            r"generally",
        ]
        constraints: List[Dict] = []
        for sent in self.extract_sentences(self.parse()):
            text = sent["text"].lower()
            if any(re.search(p, text) for p in hard_patterns):
                ctype = "hard_constraint"
            elif any(re.search(p, text) for p in soft_patterns):
                ctype = "soft_constraint"
            else:
                continue
            constraints.append(
                {
                    "type": ctype,
                    "content": sent["text"],
                    "section_id": sent["section_id"],
                    "sentence_id": sent["sentence_number"],
                }
            )
        return constraints