- "backend/pipeline/schemas.py" - "backend/pipeline/citation_utils.py" - "backend/pipeline/test_citation_utils.py" GSD-Task: S01/T01
64 lines
2.3 KiB
Python
64 lines
2.3 KiB
Python
"""Citation extraction and validation utilities for synthesized technique pages.
|
||
|
||
Used by stage 5 synthesis and the test harness to verify that [N] citation
|
||
markers in body sections reference valid source moments.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
|
||
from pipeline.schemas import BodySection
|
||
|
||
# A citation marker is a bracketed, comma-separated run of integers such as
# ``[N]``, ``[N,M]`` or ``[N, M, P]``. Whitespace is only tolerated directly
# after a comma; group 1 captures everything between the brackets.
_CITATION_RE = re.compile(r"\[(\d+(?:,\s*\d+)*)\]")


def extract_citations(text: str) -> list[int]:
    """Return the distinct citation indices found in *text*, in ascending order.

    Both single markers (``[N]``) and grouped markers (``[N,M,...]``) are
    recognised. An index that occurs more than once is reported only once,
    and text without any marker yields an empty list.
    """
    found: set[int] = set()
    # The pattern has exactly one capturing group, so ``findall`` yields the
    # text between the brackets of each marker (e.g. ``"1, 4"``).
    for marker_body in _CITATION_RE.findall(text):
        found.update(int(token.strip()) for token in marker_body.split(","))
    return sorted(found)
|
||
|
||
|
||
def validate_citations(
    sections: list[BodySection],
    moment_count: int,
) -> dict:
    """Check the citation markers in *sections* against *moment_count* source moments.

    Citations are gathered from each section's ``content`` and from the
    ``content`` of each of its direct subsections. Moments are addressed by
    0-based index, so the valid markers are ``[0]`` through
    ``[moment_count - 1]``.

    Returns a dict with the keys:
        valid (bool): True when no cited index is out of range and every
            moment is cited at least once.
        total_citations (int): Number of distinct in-range indices cited.
        invalid_indices (list[int]): Sorted cited indices outside the valid range.
        uncited_moments (list[int]): Sorted in-range indices that are never cited.
        coverage_pct (float): Share of moments cited, as a percentage rounded
            to one decimal place; 0.0 when *moment_count* is not positive.
    """
    # Pool every index cited anywhere in the page, section bodies first and
    # then their subsections.
    cited: set[int] = set()
    for section in sections:
        cited |= set(extract_citations(section.content))
        for child in section.subsections:
            cited |= set(extract_citations(child.content))

    in_range = set(range(moment_count))
    covered = cited.intersection(in_range)
    out_of_range = sorted(cited.difference(in_range))
    missing = sorted(in_range.difference(covered))

    # Guard the division: with no moments there is nothing to cover.
    if moment_count > 0:
        coverage = len(covered) / moment_count * 100.0
    else:
        coverage = 0.0

    return {
        "valid": not out_of_range and not missing,
        "total_citations": len(covered),
        "invalid_indices": out_of_range,
        "uncited_moments": missing,
        "coverage_pct": round(coverage, 1),
    }
|