"""Unit tests for citation extraction and validation utilities.""" from __future__ import annotations import pytest from pipeline.citation_utils import extract_citations, validate_citations from pipeline.schemas import BodySection, BodySubSection # ── extract_citations ──────────────────────────────────────────────────────── class TestExtractCitations: def test_single_markers(self): assert extract_citations("This uses reverb [0] and delay [2].") == [0, 2] def test_multi_marker(self): assert extract_citations("Combined approach [0,2] works well.") == [0, 2] def test_multi_marker_with_spaces(self): assert extract_citations("See [1, 3, 5] for details.") == [1, 3, 5] def test_no_citations(self): assert extract_citations("Plain text without citations.") == [] def test_duplicate_indices_deduplicated(self): assert extract_citations("[1] and again [1] and [1,2]") == [1, 2] def test_returns_sorted(self): assert extract_citations("[5] then [1] then [3]") == [1, 3, 5] def test_adjacent_markers(self): assert extract_citations("[0][1][2]") == [0, 1, 2] def test_does_not_match_non_numeric_brackets(self): assert extract_citations("[abc] and [N] but [7] works") == [7] # ── validate_citations ────────────────────────────────────────────────────── def _make_sections(texts: list[str], sub_texts: list[list[str]] | None = None) -> list[BodySection]: """Helper to build BodySection lists for testing.""" sections = [] for i, text in enumerate(texts): subs = [] if sub_texts and i < len(sub_texts): subs = [BodySubSection(heading=f"Sub {j}", content=t) for j, t in enumerate(sub_texts[i])] sections.append(BodySection(heading=f"Section {i}", content=text, subsections=subs)) return sections class TestValidateCitations: def test_all_moments_cited(self): sections = _make_sections(["Uses [0] and [1].", "Also [2]."]) result = validate_citations(sections, moment_count=3) assert result["valid"] is True assert result["total_citations"] == 3 assert result["invalid_indices"] == [] assert result["uncited_moments"] == [] assert result["coverage_pct"] == 100.0 def test_out_of_range_index(self): sections = _make_sections(["Reference [0] and [5]."]) result = validate_citations(sections, moment_count=3) assert result["valid"] is False assert result["invalid_indices"] == [5] assert result["uncited_moments"] == [1, 2] def test_multi_citation_markers(self): sections = _make_sections(["Combined [0,2] technique."]) result = validate_citations(sections, moment_count=3) assert result["valid"] is False # moment 1 uncited assert result["total_citations"] == 2 assert result["uncited_moments"] == [1] assert result["coverage_pct"] == pytest.approx(66.7, abs=0.1) def test_no_citations_at_all(self): sections = _make_sections(["Plain text with no markers."]) result = validate_citations(sections, moment_count=2) assert result["valid"] is False assert result["total_citations"] == 0 assert result["uncited_moments"] == [0, 1] assert result["coverage_pct"] == 0.0 def test_empty_sections(self): result = validate_citations([], moment_count=0) assert result["valid"] is True assert result["total_citations"] == 0 assert result["coverage_pct"] == 0.0 def test_subsection_citations_counted(self): sections = _make_sections( ["Section text [0]."], sub_texts=[["Subsection cites [1] and [2]."]], ) result = validate_citations(sections, moment_count=3) assert result["valid"] is True assert result["total_citations"] == 3 def test_zero_moment_count_with_citations(self): """Citations exist but moment_count is 0 — all indices are out of range.""" sections = _make_sections(["References [0] and [1]."]) result = validate_citations(sections, moment_count=0) assert result["valid"] is False assert result["invalid_indices"] == [0, 1] assert result["coverage_pct"] == 0.0