- "backend/pipeline/schemas.py" - "backend/pipeline/citation_utils.py" - "backend/pipeline/test_citation_utils.py" GSD-Task: S01/T01
108 lines
4.4 KiB
Python
108 lines
4.4 KiB
Python
"""Unit tests for citation extraction and validation utilities."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from pipeline.citation_utils import extract_citations, validate_citations
|
|
from pipeline.schemas import BodySection, BodySubSection
|
|
|
|
|
|
# ── extract_citations ────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestExtractCitations:
    """Checks on the index list ``extract_citations`` returns for a string."""

    def test_single_markers(self):
        """Two separate single-index markers yield both indices."""
        found = extract_citations("This uses reverb [0] and delay [2].")
        assert found == [0, 2]

    def test_multi_marker(self):
        """A comma-separated marker contributes each of its indices."""
        found = extract_citations("Combined approach [0,2] works well.")
        assert found == [0, 2]

    def test_multi_marker_with_spaces(self):
        """Spaces after the commas inside a marker are accepted."""
        found = extract_citations("See [1, 3, 5] for details.")
        assert found == [1, 3, 5]

    def test_no_citations(self):
        """Text without any marker gives an empty list."""
        found = extract_citations("Plain text without citations.")
        assert found == []

    def test_duplicate_indices_deduplicated(self):
        """An index repeated across markers is reported only once."""
        found = extract_citations("[1] and again [1] and [1,2]")
        assert found == [1, 2]

    def test_returns_sorted(self):
        """Indices come back in ascending order, not order of appearance."""
        found = extract_citations("[5] then [1] then [3]")
        assert found == [1, 3, 5]

    def test_adjacent_markers(self):
        """Markers with no text between them are each recognised."""
        found = extract_citations("[0][1][2]")
        assert found == [0, 1, 2]

    def test_does_not_match_non_numeric_brackets(self):
        """Bracketed non-numeric text is ignored; numeric markers still match."""
        found = extract_citations("[abc] and [N] but [7] works")
        assert found == [7]
|
|
|
|
|
|
# ── validate_citations ──────────────────────────────────────────────────────
|
|
|
|
|
|
def _make_sections(texts: list[str], sub_texts: list[list[str]] | None = None) -> list[BodySection]:
    """Build a list of ``BodySection`` objects for the validation tests.

    Each entry of *texts* becomes one section headed ``"Section <i>"``.
    When *sub_texts* has an entry at the same position, each string in that
    entry becomes a ``BodySubSection`` headed ``"Sub <j>"``; positions that
    *sub_texts* does not cover get an empty subsection list.
    """
    # Treat a missing (or empty) sub_texts as "no subsections anywhere".
    nested = sub_texts or []
    built = []
    for idx, body in enumerate(texts):
        sub_bodies = nested[idx] if idx < len(nested) else []
        children = [
            BodySubSection(heading=f"Sub {k}", content=sub_body)
            for k, sub_body in enumerate(sub_bodies)
        ]
        built.append(
            BodySection(heading=f"Section {idx}", content=body, subsections=children)
        )
    return built
|
|
|
|
|
|
class TestValidateCitations:
    """Checks on the report dict ``validate_citations`` returns for sections."""

    def test_all_moments_cited(self):
        """Every moment index cited across two sections gives a valid report."""
        body = _make_sections(["Uses [0] and [1].", "Also [2]."])
        report = validate_citations(body, moment_count=3)

        assert report["valid"] is True
        assert report["total_citations"] == 3
        assert report["invalid_indices"] == []
        assert report["uncited_moments"] == []
        assert report["coverage_pct"] == 100.0

    def test_out_of_range_index(self):
        """An index at or beyond moment_count is listed as invalid."""
        body = _make_sections(["Reference [0] and [5]."])
        report = validate_citations(body, moment_count=3)

        assert report["valid"] is False
        assert report["invalid_indices"] == [5]
        assert report["uncited_moments"] == [1, 2]

    def test_multi_citation_markers(self):
        """A combined marker counts each index; the skipped moment is uncited."""
        body = _make_sections(["Combined [0,2] technique."])
        report = validate_citations(body, moment_count=3)

        # Moment 1 is never cited, so the report cannot be valid.
        assert report["valid"] is False
        assert report["total_citations"] == 2
        assert report["uncited_moments"] == [1]
        assert report["coverage_pct"] == pytest.approx(66.7, abs=0.1)

    def test_no_citations_at_all(self):
        """Marker-free text leaves every moment uncited with zero coverage."""
        body = _make_sections(["Plain text with no markers."])
        report = validate_citations(body, moment_count=2)

        assert report["valid"] is False
        assert report["total_citations"] == 0
        assert report["uncited_moments"] == [0, 1]
        assert report["coverage_pct"] == 0.0

    def test_empty_sections(self):
        """No sections and no moments is valid, with zero citations and coverage."""
        report = validate_citations([], moment_count=0)

        assert report["valid"] is True
        assert report["total_citations"] == 0
        assert report["coverage_pct"] == 0.0

    def test_subsection_citations_counted(self):
        """Citations inside subsection content count toward the totals."""
        top_level = ["Section text [0]."]
        nested = [["Subsection cites [1] and [2]."]]
        body = _make_sections(top_level, sub_texts=nested)
        report = validate_citations(body, moment_count=3)

        assert report["valid"] is True
        assert report["total_citations"] == 3

    def test_zero_moment_count_with_citations(self):
        """With moment_count of 0, every cited index is out of range."""
        body = _make_sections(["References [0] and [1]."])
        report = validate_citations(body, moment_count=0)

        assert report["valid"] is False
        assert report["invalid_indices"] == [0, 1]
        assert report["coverage_pct"] == 0.0
|