chrysopedia/backend/pipeline/test_citation_utils.py
jlightner 15dcab201a test: Added BodySection/BodySubSection schema models, changed Synthesiz…
- "backend/pipeline/schemas.py"
- "backend/pipeline/citation_utils.py"
- "backend/pipeline/test_citation_utils.py"

GSD-Task: S01/T01
2026-04-03 00:50:30 +00:00

108 lines
4.4 KiB
Python

"""Unit tests for citation extraction and validation utilities."""
from __future__ import annotations
import pytest
from pipeline.citation_utils import extract_citations, validate_citations
from pipeline.schemas import BodySection, BodySubSection
# ── extract_citations ────────────────────────────────────────────────────────
class TestExtractCitations:
    """Checks for ``extract_citations`` on text containing ``[n]`` / ``[n,m]`` markers.

    Each test feeds a literal string and compares the returned list of
    integer indices against the expected value.
    """

    def test_single_markers(self):
        """Two separate single-index markers are both returned."""
        assert extract_citations("This uses reverb [0] and delay [2].") == [0, 2]

    def test_multi_marker(self):
        """A comma-separated marker yields each index it contains."""
        assert extract_citations("Combined approach [0,2] works well.") == [0, 2]

    def test_multi_marker_with_spaces(self):
        """Whitespace after the commas inside a marker is accepted."""
        assert extract_citations("See [1, 3, 5] for details.") == [1, 3, 5]

    def test_no_citations(self):
        """Text without any marker gives an empty list."""
        assert extract_citations("Plain text without citations.") == []

    def test_duplicate_indices_deduplicated(self):
        """An index repeated across markers appears only once in the result."""
        assert extract_citations("[1] and again [1] and [1,2]") == [1, 2]

    def test_returns_sorted(self):
        """Indices come back in ascending order, not in order of appearance."""
        assert extract_citations("[5] then [1] then [3]") == [1, 3, 5]

    def test_adjacent_markers(self):
        """Markers with no text between them are each recognised."""
        assert extract_citations("[0][1][2]") == [0, 1, 2]

    def test_does_not_match_non_numeric_brackets(self):
        """Bracketed non-numeric text is ignored; only ``[7]`` counts here."""
        assert extract_citations("[abc] and [N] but [7] works") == [7]
# ── validate_citations ──────────────────────────────────────────────────────
def _make_sections(texts: list[str], sub_texts: list[list[str]] | None = None) -> list[BodySection]:
    """Helper to build BodySection lists for testing.

    Args:
        texts: Content string for each top-level section. The section at
            position ``i`` is given the heading ``"Section {i}"``.
        sub_texts: Optional per-section lists of subsection content, matched
            to ``texts`` by position. A section whose position has no entry
            here (or any section when this is ``None`` or empty) gets an
            empty subsection list.

    Returns:
        One ``BodySection`` per entry of ``texts``, in the same order.
    """
    sections = []
    for i, text in enumerate(texts):
        # Start from "no subsections" for every section so that a shorter
        # ``sub_texts`` list simply leaves the remaining sections bare.
        subs = []
        if sub_texts and i < len(sub_texts):
            subs = [BodySubSection(heading=f"Sub {j}", content=t) for j, t in enumerate(sub_texts[i])]
        sections.append(BodySection(heading=f"Section {i}", content=text, subsections=subs))
    return sections
class TestValidateCitations:
    """Checks for ``validate_citations`` over lists of ``BodySection`` objects.

    Every test builds its sections with ``_make_sections``, calls
    ``validate_citations(sections, moment_count=...)`` and inspects keys of
    the returned mapping: ``valid``, ``total_citations``,
    ``invalid_indices``, ``uncited_moments`` and ``coverage_pct``.
    """

    def test_all_moments_cited(self):
        """Indices 0-2 are all cited across two sections: fully valid, 100% coverage."""
        sections = _make_sections(["Uses [0] and [1].", "Also [2]."])
        result = validate_citations(sections, moment_count=3)
        assert result["valid"] is True
        assert result["total_citations"] == 3
        assert result["invalid_indices"] == []
        assert result["uncited_moments"] == []
        assert result["coverage_pct"] == 100.0

    def test_out_of_range_index(self):
        """An index at or beyond ``moment_count`` is reported as invalid."""
        sections = _make_sections(["Reference [0] and [5]."])
        result = validate_citations(sections, moment_count=3)
        assert result["valid"] is False
        assert result["invalid_indices"] == [5]
        assert result["uncited_moments"] == [1, 2]

    def test_multi_citation_markers(self):
        """A ``[0,2]`` marker counts as two citations and leaves moment 1 uncited."""
        sections = _make_sections(["Combined [0,2] technique."])
        result = validate_citations(sections, moment_count=3)
        assert result["valid"] is False  # moment 1 uncited
        assert result["total_citations"] == 2
        assert result["uncited_moments"] == [1]
        # 2 of 3 moments cited; compared with a tolerance rather than exactly.
        assert result["coverage_pct"] == pytest.approx(66.7, abs=0.1)

    def test_no_citations_at_all(self):
        """Marker-free text leaves every moment uncited and coverage at 0.0."""
        sections = _make_sections(["Plain text with no markers."])
        result = validate_citations(sections, moment_count=2)
        assert result["valid"] is False
        assert result["total_citations"] == 0
        assert result["uncited_moments"] == [0, 1]
        assert result["coverage_pct"] == 0.0

    def test_empty_sections(self):
        """No sections and zero moments is valid, with coverage reported as 0.0."""
        result = validate_citations([], moment_count=0)
        assert result["valid"] is True
        assert result["total_citations"] == 0
        assert result["coverage_pct"] == 0.0

    def test_subsection_citations_counted(self):
        """Citations inside subsection content count alongside the section's own."""
        sections = _make_sections(
            ["Section text [0]."],
            sub_texts=[["Subsection cites [1] and [2]."]],
        )
        result = validate_citations(sections, moment_count=3)
        assert result["valid"] is True
        assert result["total_citations"] == 3

    def test_zero_moment_count_with_citations(self):
        """Citations exist but moment_count is 0 — all indices are out of range."""
        sections = _make_sections(["References [0] and [1]."])
        result = validate_citations(sections, moment_count=0)
        assert result["valid"] is False
        assert result["invalid_indices"] == [0, 1]
        assert result["coverage_pct"] == 0.0