# chrysopedia/backend/pipeline/citation_utils.py

"""Citation extraction and validation utilities for synthesized technique pages.

Used by stage 5 synthesis and the test harness to verify that [N] citation
markers in body sections reference valid source moments.
"""

from __future__ import annotations

import re

from pipeline.schemas import BodySection

# Citation markers are square-bracketed integers: a single index such as [3],
# or a comma-separated group such as [1,4] / [1, 4, 7] (whitespace is allowed
# after each comma). Group 1 captures everything between the brackets.
_CITATION_RE = re.compile(r"\[(\d+(?:,\s*\d+)*)\]")


def extract_citations(text: str) -> list[int]:
    """Collect the citation indices referenced by bracket markers in *text*.

    Both single markers (``[N]``) and grouped markers (``[N,M,...]``) are
    recognised; an index that appears several times is reported once.

    Returns the distinct indices in ascending order (an empty list when
    *text* contains no markers).
    """
    # With a single capture group, findall yields the bracket contents directly.
    found = {
        int(token.strip())
        for marker_body in _CITATION_RE.findall(text)
        for token in marker_body.split(",")
    }
    ordered = list(found)
    ordered.sort()
    return ordered


def validate_citations(
    sections: list[BodySection],
    moment_count: int,
) -> dict:
    """Check the citation markers found in *sections* against the source moments.

    Indices are gathered from each section's ``content`` and from the
    ``content`` of every entry in its ``subsections``. A cited index is in
    range when it lies in ``0 .. moment_count - 1`` (moments are 0-based).

    Returns a dict with:
        valid (bool): True only when no cited index is out of range and no
            moment is left uncited.
        total_citations (int): Number of distinct in-range indices cited
            (out-of-range indices are not counted here).
        invalid_indices (list[int]): Sorted cited indices that are out of range.
        uncited_moments (list[int]): Sorted moment indices that are never cited.
        coverage_pct (float): Percentage of moments cited, rounded to one
            decimal place; 0.0 when *moment_count* is not positive.
    """
    cited: set[int] = set()
    for section in sections:
        cited.update(extract_citations(section.content))
        for subsection in section.subsections:
            cited.update(extract_citations(subsection.content))

    # A range supports constant-time integer membership tests, so it can stand
    # in for the set of valid moment indices directly.
    moments = range(moment_count)

    out_of_range = sorted(idx for idx in cited if idx not in moments)
    hits = {idx for idx in cited if idx in moments}
    # Iterating the range keeps the uncited list in ascending order.
    missed = [idx for idx in moments if idx not in hits]

    if moment_count > 0:
        pct = len(hits) / moment_count * 100.0
    else:
        # No moments to cover: report zero rather than dividing by zero.
        pct = 0.0

    return {
        "valid": not out_of_range and not missed,
        "total_citations": len(hits),
        "invalid_indices": out_of_range,
        "uncited_moments": missed,
        "coverage_pct": round(pct, 1),
    }