chrysopedia/backend/pipeline/citation_utils.py
jlightner 15dcab201a test: Added BodySection/BodySubSection schema models, changed Synthesiz…
- "backend/pipeline/schemas.py"
- "backend/pipeline/citation_utils.py"
- "backend/pipeline/test_citation_utils.py"

GSD-Task: S01/T01
2026-04-03 00:50:30 +00:00

64 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Citation extraction and validation utilities for synthesized technique pages.
Used by stage 5 synthesis and the test harness to verify that [N] citation
markers in body sections reference valid source moments.
"""
from __future__ import annotations
import re
from pipeline.schemas import BodySection
# A citation marker is one or more integers, comma-separated, wrapped in square
# brackets: [N], [N,M], [N, M, P]. Group 1 captures the text between the brackets.
_CITATION_RE = re.compile(r"\[(\d+(?:,\s*\d+)*)\]")


def extract_citations(text: str) -> list[int]:
    """Collect the citation indices referenced by bracket markers in *text*.

    Both single (``[N]``) and grouped (``[N,M,...]``) markers are recognised.

    Returns the distinct indices as a list of ints in ascending order; the
    list is empty when *text* contains no markers.
    """
    # findall() yields the captured group for each marker, e.g. "3, 2" for "[3, 2]".
    found = {
        int(token.strip())
        for group in _CITATION_RE.findall(text)
        for token in group.split(",")
    }
    return sorted(found)
def validate_citations(
    sections: list[BodySection],
    moment_count: int,
) -> dict:
    """Check the citation markers found in *sections* against the source moments.

    Valid citations are the 0-based indices ``[0]`` through
    ``[moment_count - 1]``. Markers are gathered from each section's
    ``content`` and from the ``content`` of each of its ``subsections``.

    Returns a dict with:
        valid (bool): True when no cited index is out of range and no moment
            is left uncited.
        total_citations (int): Number of distinct cited indices that are in range.
        invalid_indices (list[int]): Sorted cited indices that are out of range.
        uncited_moments (list[int]): Sorted in-range moment indices never cited.
        coverage_pct (float): Percentage of moments that are cited (0.0–100.0),
            rounded to one decimal place; 0.0 when *moment_count* is not positive.
    """

    def _texts():
        # Yield every piece of body text that may carry citation markers.
        for section in sections:
            yield section.content
            for sub in section.subsections:
                yield sub.content

    cited = {index for text in _texts() for index in extract_citations(text)}

    # Split the cited indices into those that point at a real moment and those that do not.
    in_range = {index for index in cited if 0 <= index < moment_count}
    out_of_range = sorted(cited - in_range)
    missing = [moment for moment in range(moment_count) if moment not in in_range]

    # Guard the division: with no moments there is nothing to cover.
    if moment_count > 0:
        coverage = len(in_range) / moment_count * 100.0
    else:
        coverage = 0.0

    return {
        "valid": not out_of_range and not missing,
        "total_citations": len(in_range),
        "invalid_indices": out_of_range,
        "uncited_moments": missing,
        "coverage_pct": round(coverage, 1),
    }