# DocumentProcessor: parses a markdown-style policy document into
# sections, sentences, core values, and hard/soft constraints.

import re
import json
from typing import List, Dict, Tuple, Optional
from pathlib import Path
class DocumentProcessor:
    """Parse a markdown-style document into sections, sentences, and extracted variables."""

    def __init__(self, document_path: str):
        """Load the document at *document_path* into memory.

        Args:
            document_path: Filesystem path to the document to parse.

        Raises:
            OSError: If the file cannot be read.
        """
        self.document_path = document_path
        # Fix: read with an explicit UTF-8 encoding; the previous bare
        # read_text() used the platform locale default, which breaks on
        # non-UTF-8 setups (e.g. Windows cp1252).
        self.content = Path(document_path).read_text(encoding="utf-8")
        # Keep a line-indexed view for line-number bookkeeping in parse().
        self.lines = self.content.split("\n")
def parse(self) -> List[Dict]:
    """Split the document into heading-delimited sections.

    Returns:
        A list of section dicts with ``id``, ``parent_id``, ``path``,
        ``hierarchy_level``, ``content`` and ``line_start``/``line_end``.

    Fix: the heading stack is now unwound by level. The original pushed
    every heading and never popped, so a ``##`` heading that followed a
    ``###`` heading was parented to the deeper subsection instead of the
    enclosing ``#`` document, and ``path`` inherited the wrong prefix.
    """
    sections: List[Dict] = []
    section_stack: List[Dict] = []
    for line_num, line in enumerate(self.lines, 1):
        stripped = line.strip()
        if stripped.startswith("#"):
            section_type, title, level = self._parse_heading(stripped)
            # Close every open section at the same or deeper level; its
            # text ends on the line just before this heading.
            while section_stack and section_stack[-1]["hierarchy_level"] >= level:
                section_stack.pop()["line_end"] = line_num - 1
            parent = section_stack[-1] if section_stack else None
            new_section = {
                "section_type": section_type,
                "title": title,
                "content": "",
                "line_start": line_num,
                "line_end": line_num,
                "hierarchy_level": level,
                "parent_id": parent["id"] if parent else None,
                "path": f"{parent['path']}/{title}" if parent else title,
            }
            new_section["id"] = len(sections) + 1
            sections.append(new_section)
            section_stack.append(new_section)
        elif stripped and section_stack:
            # Non-blank body line: attach to the innermost open section.
            # NOTE: blank lines are deliberately dropped from content.
            section_stack[-1]["content"] += line + "\n"
            section_stack[-1]["line_end"] = line_num
    # Any sections still open run to the end of the document (the
    # original only extended the innermost one, leaving ancestors short).
    for open_section in section_stack:
        open_section["line_end"] = len(self.lines)
    return sections
def _parse_heading(self, line: str) -> Tuple[str, str, int]:
if line.startswith("### "):
return "subsection", line[4:].strip(), 3
elif line.startswith("## "):
return "section", line[3:].strip(), 2
elif line.startswith("# "):
return "document", line[2:].strip(), 1
return "paragraph", line.strip(), 0
def extract_sentences(self, sections: List[Dict]) -> List[Dict]:
    """Split each section's content into per-sentence records.

    Args:
        sections: Section dicts as produced by ``parse()``.

    Returns:
        Dicts with ``section_id``, ``text``, ``sentence_number`` (1-based
        within the section) and a best-effort ``line_number``.

    Fix: content begins on the line *after* the heading, so enumeration
    now starts at ``line_start + 1``; the original assigned the heading's
    own line number to the first content line.

    NOTE(review): parse() drops blank lines from ``content``, so
    ``line_number`` can still drift for sections that contain blank
    lines — confirm against parse() if an exact mapping matters.
    """
    sentences: List[Dict] = []
    for section in sections:
        content = section.get("content", "")
        sentence_number = 0
        # First content line sits directly below the heading line.
        start_line = section.get("line_start", 1) + 1
        for line_num, line in enumerate(content.split("\n"), start_line):
            text = line.strip()
            if not text:
                continue
            for sent in self._segment_sentences(text):
                sentence_number += 1
                sentences.append(
                    {
                        "section_id": section["id"],
                        "text": sent,
                        "sentence_number": sentence_number,
                        "line_number": line_num,
                    }
                )
    return sentences
def _segment_sentences(self, text: str) -> List[str]:
sentence_endings = r"(?<=[.!?])\s+(?=[A-Z])"
sentences = re.split(sentence_endings, text)
return [s.strip() for s in sentences if s.strip()]
def extract_variables(self) -> List[Dict]:
    """Collect core values, constraints, and behavioral factors from the document.

    Returns:
        Variable dicts for every core value mentioned at least once,
        followed by extracted constraints and factors.
    """
    lowered = self.content.lower()
    core_values = (
        "broadly safe",
        "broadly ethical",
        "anthropic guidelines",
        "genuinely helpful",
        "honest",
        "respectful",
    )
    # One record per core value that actually appears in the text.
    variables: List[Dict] = [
        {
            "name": value,
            "category": "core_value",
            "priority_level": self._get_priority_level(value),
            "is_hard_constraint": False,
            "principal_assignment": "all",
            "frequency": lowered.count(value),
            "description": self._extract_definition(value),
        }
        for value in core_values
        if lowered.count(value) > 0
    ]
    variables += self._extract_constraints()
    variables += self._extract_factors()
    return variables
def _get_priority_level(self, value: str) -> Optional[int]:
priorities = {
"broadly safe": 1,
"broadly ethical": 2,
"anthropic guidelines": 3,
"genuinely helpful": 4,
}
return priorities.get(value.lower())
def _extract_definition(self, value: str) -> str:
pattern = rf"{re.escape(value)}[:\s]+([^.!?]*[.!?])"
match = re.search(pattern, self.content, re.IGNORECASE)
return match.group(1).strip() if match else ""
def _extract_constraints(self) -> List[Dict]:
constraints = []
patterns = [
(r"never\s+(?:to\s+)?([^,.!?]+)", "hard_constraint"),
(r"do\s+not\s+([^,.!?]+)", "soft_constraint"),
(r"avoid\s+([^,.!?]+)", "soft_constraint"),
(r"refrain\s+from\s+([^,.!?]+)", "soft_constraint"),
]
for pattern, constraint_type in patterns:
matches = re.finditer(pattern, self.content, re.IGNORECASE)
for match in matches:
constraint_text = match.group(1).strip()
if len(constraint_text) > 5:
constraints.append(
{
"name": constraint_text[:50],
"category": constraint_type,
"priority_level": None,
"is_hard_constraint": constraint_type == "hard_constraint",
"principal_assignment": "anthropic",
"frequency": 1,
"description": match.group(0),
}
)
return constraints
def _extract_factors(self) -> List[Dict]:
factors = [
"safety",
"ethics",
"helpfulness",
"honesty",
"transparency",
"respect",
"fairness",
"beneficence",
"non-maleficence",
"autonomy",
"justice",
"responsibility",
"accountability",
]
extracted = []
content_lower = self.content.lower()
for factor in factors:
count = content_lower.count(factor)
if count >= 3:
extracted.append(
{
"name": factor,
"category": "factor",
"priority_level": None,
"is_hard_constraint": False,
"principal_assignment": "all",
"frequency": count,
"description": f"Behavioral factor related to {factor}",
}
)
return extracted
def classify_constraints(self) -> List[Dict]:
    """Classify document sentences as hard or soft constraints.

    Returns:
        One dict per constraint-bearing sentence with ``type``,
        ``content``, ``section_id`` and ``sentence_id``.

    Fix: each sentence now yields at most one record. Previously a
    sentence matching both a hard cue ("never") and a soft cue
    ("should") was appended twice — once per scan loop; hard cues now
    take precedence and the soft scan is skipped on a hard match.
    Patterns are also compiled once, outside the sentence loop.
    """
    hard_cues = [
        re.compile(p)
        for p in (
            r"never\b",
            r"under no circumstances",
            r"absolutely",
            r"unconditionally",
        )
    ]
    soft_cues = [
        re.compile(p)
        for p in (
            r"prefer(?:ably|ed)",
            r"should(?:\s+not)?",
            r"ideally",
            r"generally",
        )
    ]
    constraints: List[Dict] = []
    for sent in self.extract_sentences(self.parse()):
        text = sent["text"].lower()
        # Hard wins; a sentence is never both hard and soft.
        if any(cue.search(text) for cue in hard_cues):
            kind = "hard_constraint"
        elif any(cue.search(text) for cue in soft_cues):
            kind = "soft_constraint"
        else:
            continue
        constraints.append(
            {
                "type": kind,
                "content": sent["text"],
                "section_id": sent["section_id"],
                "sentence_id": sent["sentence_number"],
            }
        )
    return constraints