# chrysopedia/backend/pipeline/test_citation_utils.py

"""Unit tests for citation extraction and validation utilities."""

from __future__ import annotations

import pytest

from pipeline.citation_utils import extract_citations, validate_citations
from pipeline.schemas import BodySection, BodySubSection


# ── extract_citations ────────────────────────────────────────────────────────


class TestExtractCitations:
    """Checks the list of indices ``extract_citations`` returns for bracket markers."""

    def test_single_markers(self):
        """Two separate single-index markers yield both indices."""
        text = "This uses reverb [0] and delay [2]."
        assert extract_citations(text) == [0, 2]

    def test_multi_marker(self):
        """A comma-separated marker yields every index it lists."""
        text = "Combined approach [0,2] works well."
        assert extract_citations(text) == [0, 2]

    def test_multi_marker_with_spaces(self):
        """Spaces after the commas inside a marker are accepted."""
        text = "See [1, 3, 5] for details."
        assert extract_citations(text) == [1, 3, 5]

    def test_no_citations(self):
        """Text without any marker produces an empty list."""
        text = "Plain text without citations."
        assert extract_citations(text) == []

    def test_duplicate_indices_deduplicated(self):
        """An index repeated across markers is reported only once."""
        text = "[1] and again [1] and [1,2]"
        assert extract_citations(text) == [1, 2]

    def test_returns_sorted(self):
        """Indices come back in ascending order, not in order of appearance."""
        text = "[5] then [1] then [3]"
        assert extract_citations(text) == [1, 3, 5]

    def test_adjacent_markers(self):
        """Markers with nothing between them are each recognised."""
        text = "[0][1][2]"
        assert extract_citations(text) == [0, 1, 2]

    def test_does_not_match_non_numeric_brackets(self):
        """Bracketed non-numeric text is ignored; only ``[7]`` counts."""
        text = "[abc] and [N] but [7] works"
        assert extract_citations(text) == [7]


# ── validate_citations ──────────────────────────────────────────────────────


def _make_sections(texts: list[str], sub_texts: list[list[str]] | None = None) -> list[BodySection]:
    """Helper to build BodySection lists for testing."""
    sections = []
    for i, text in enumerate(texts):
        subs = []
        if sub_texts and i < len(sub_texts):
            subs = [BodySubSection(heading=f"Sub {j}", content=t) for j, t in enumerate(sub_texts[i])]
        sections.append(BodySection(heading=f"Section {i}", content=text, subsections=subs))
    return sections


class TestValidateCitations:
    """Checks the report dict ``validate_citations`` returns for section lists."""

    def test_all_moments_cited(self):
        """Every moment index cited across two sections gives a fully valid report."""
        body = _make_sections(["Uses [0] and [1].", "Also [2]."])
        report = validate_citations(body, moment_count=3)
        assert report["valid"] is True
        assert report["total_citations"] == 3
        assert report["invalid_indices"] == []
        assert report["uncited_moments"] == []
        assert report["coverage_pct"] == 100.0

    def test_out_of_range_index(self):
        """An index at or beyond ``moment_count`` is listed as invalid."""
        body = _make_sections(["Reference [0] and [5]."])
        report = validate_citations(body, moment_count=3)
        assert report["valid"] is False
        assert report["invalid_indices"] == [5]
        assert report["uncited_moments"] == [1, 2]

    def test_multi_citation_markers(self):
        """A comma-separated marker counts once per index it lists."""
        body = _make_sections(["Combined [0,2] technique."])
        report = validate_citations(body, moment_count=3)
        assert report["valid"] is False  # moment 1 uncited
        assert report["total_citations"] == 2
        assert report["uncited_moments"] == [1]
        assert report["coverage_pct"] == pytest.approx(66.7, abs=0.1)

    def test_no_citations_at_all(self):
        """Marker-free text leaves every moment uncited with zero coverage."""
        body = _make_sections(["Plain text with no markers."])
        report = validate_citations(body, moment_count=2)
        assert report["valid"] is False
        assert report["total_citations"] == 0
        assert report["uncited_moments"] == [0, 1]
        assert report["coverage_pct"] == 0.0

    def test_empty_sections(self):
        """No sections and no moments is reported as valid with zero coverage."""
        report = validate_citations([], moment_count=0)
        assert report["valid"] is True
        assert report["total_citations"] == 0
        assert report["coverage_pct"] == 0.0

    def test_subsection_citations_counted(self):
        """Markers inside subsection content count alongside the section's own."""
        body = _make_sections(
            ["Section text [0]."],
            sub_texts=[["Subsection cites [1] and [2]."]],
        )
        report = validate_citations(body, moment_count=3)
        assert report["valid"] is True
        assert report["total_citations"] == 3

    def test_zero_moment_count_with_citations(self):
        """With ``moment_count=0`` every cited index is out of range."""
        body = _make_sections(["References [0] and [1]."])
        report = validate_citations(body, moment_count=0)
        assert report["valid"] is False
        assert report["invalid_indices"] == [0, 1]
        assert report["coverage_pct"] == 0.0