"""Citation extraction and validation utilities for synthesized technique pages. Used by stage 5 synthesis and the test harness to verify that [N] citation markers in body sections reference valid source moments. """ from __future__ import annotations import re from pipeline.schemas import BodySection # Matches [N] or [N,M] or [N,M,P] style citation markers where N,M,P are integers. _CITATION_RE = re.compile(r"\[(\d+(?:,\s*\d+)*)\]") def extract_citations(text: str) -> list[int]: """Extract all citation indices from ``[N]`` and ``[N,M,...]`` markers in *text*. Returns a sorted list of unique integer indices. """ indices: set[int] = set() for match in _CITATION_RE.finditer(text): for part in match.group(1).split(","): indices.add(int(part.strip())) return sorted(indices) def validate_citations( sections: list[BodySection], moment_count: int, ) -> dict: """Validate citation markers across all *sections* against *moment_count* source moments. Moments are expected to be referenced as 0-based indices ``[0]`` through ``[moment_count - 1]``. Returns a dict with: valid (bool): True when every cited index is in range and every moment is cited. total_citations (int): Count of unique cited indices. invalid_indices (list[int]): Cited indices that are out of range. uncited_moments (list[int]): In-range moment indices that are never cited. coverage_pct (float): Percentage of moments that are cited (0.0–100.0). """ all_indices: set[int] = set() for section in sections: all_indices.update(extract_citations(section.content)) for sub in section.subsections: all_indices.update(extract_citations(sub.content)) valid_range = set(range(moment_count)) invalid_indices = sorted(all_indices - valid_range) cited_in_range = all_indices & valid_range uncited_moments = sorted(valid_range - cited_in_range) coverage_pct = (len(cited_in_range) / moment_count * 100.0) if moment_count > 0 else 0.0 return { "valid": len(invalid_indices) == 0 and len(uncited_moments) == 0, "total_citations": len(cited_in_range), "invalid_indices": invalid_indices, "uncited_moments": uncited_moments, "coverage_pct": round(coverage_pct, 1), }