"""Tests for test_harness compatibility with v2 body_sections format. Validates that word-counting and citation integration work correctly with the list[BodySection] structure (v2) instead of the old dict format. """ from __future__ import annotations import pytest from pipeline.citation_utils import validate_citations from pipeline.schemas import BodySection, BodySubSection, SynthesizedPage, SynthesisResult # ── Helpers ────────────────────────────────────────────────────────────────── def _make_page( body_sections: list[BodySection], moment_indices: list[int] | None = None, title: str = "Test Page", slug: str = "test-page", ) -> SynthesizedPage: return SynthesizedPage( title=title, slug=slug, topic_category="Testing", summary="A test page.", body_sections=body_sections, moment_indices=moment_indices or [], ) def _count_words_v2(sections: list[BodySection]) -> int: """Replicate the word-counting logic from the updated test_harness.""" return sum( len(s.content.split()) + sum(len(sub.content.split()) for sub in s.subsections) for s in sections ) def _count_words_metadata(pages_dicts: list[dict]) -> int: """Replicate the metadata total_words logic (operates on dicts after model_dump).""" return sum( sum( len(s.get("content", "").split()) + sum(len(sub.get("content", "").split()) for sub in s.get("subsections", [])) for s in p.get("body_sections", []) ) for p in pages_dicts ) # ── Word counting tests ───────────────────────────────────────────────────── class TestWordCounting: def test_flat_sections_no_subsections(self): sections = [ BodySection(heading="Intro", content="one two three"), BodySection(heading="Details", content="four five"), ] assert _count_words_v2(sections) == 5 def test_sections_with_subsections(self): sections = [ BodySection( heading="Main", content="alpha beta", # 2 words subsections=[ BodySubSection(heading="Sub A", content="gamma delta epsilon"), # 3 words BodySubSection(heading="Sub B", content="zeta"), # 1 word ], ), ] assert _count_words_v2(sections) == 6 def test_empty_sections_list(self): assert _count_words_v2([]) == 0 def test_section_with_empty_content(self): sections = [ BodySection(heading="Empty", content=""), ] # "".split() returns [], len([]) == 0 assert _count_words_v2(sections) == 0 def test_metadata_word_count_matches(self): """Metadata total_words (from model_dump dicts) matches Pydantic object counting.""" sections = [ BodySection( heading="H2", content="one two three", subsections=[ BodySubSection(heading="H3", content="four five six seven"), ], ), BodySection(heading="Another", content="eight nine"), ] page = _make_page(sections, moment_indices=[0, 1]) pages_dicts = [page.model_dump()] assert _count_words_v2(sections) == 9 assert _count_words_metadata(pages_dicts) == 9 # ── Section/subsection counting ───────────────────────────────────────────── class TestSectionCounting: def test_section_and_subsection_counts(self): sections = [ BodySection(heading="A", content="text", subsections=[ BodySubSection(heading="A.1", content="sub text"), ]), BodySection(heading="B", content="more text"), BodySection(heading="C", content="even more", subsections=[ BodySubSection(heading="C.1", content="sub1"), BodySubSection(heading="C.2", content="sub2"), ]), ] section_count = len(sections) subsection_count = sum(len(s.subsections) for s in sections) assert section_count == 3 assert subsection_count == 3 # ── Citation integration ───────────────────────────────────────────────────── class TestCitationIntegration: def test_full_coverage(self): sections = [ BodySection(heading="Intro", content="First point [0]. Second point [1]."), BodySection(heading="Details", content="More on [0] and [2]."), ] result = validate_citations(sections, moment_count=3) assert result["valid"] is True assert result["coverage_pct"] == 100.0 assert result["invalid_indices"] == [] assert result["uncited_moments"] == [] def test_partial_coverage(self): sections = [ BodySection(heading="Intro", content="Only cites [0]."), ] result = validate_citations(sections, moment_count=3) assert result["valid"] is False assert result["coverage_pct"] == pytest.approx(33.3, abs=0.1) assert result["uncited_moments"] == [1, 2] def test_invalid_index(self): sections = [ BodySection(heading="Bad", content="Cites [0] and [99]."), ] result = validate_citations(sections, moment_count=2) assert result["invalid_indices"] == [99] def test_citations_in_subsections(self): sections = [ BodySection( heading="Main", content="See [0].", subsections=[ BodySubSection(heading="Sub", content="Also [1] and [2]."), ], ), ] result = validate_citations(sections, moment_count=3) assert result["valid"] is True assert result["total_citations"] == 3 def test_multi_citation_markers(self): sections = [ BodySection(heading="X", content="Both sources agree [0,1]."), ] result = validate_citations(sections, moment_count=2) assert result["valid"] is True assert result["total_citations"] == 2 def test_no_sections(self): result = validate_citations([], moment_count=0) assert result["valid"] is True assert result["coverage_pct"] == 0.0 # ── End-to-end: SynthesisResult with v2 body_sections ─────────────────────── class TestSynthesisResultV2: def test_round_trip_model_dump(self): """SynthesisResult with v2 body_sections round-trips through model_dump/validate.""" sections = [ BodySection( heading="Overview", content="This technique [0] is fundamental.", subsections=[ BodySubSection(heading="Key Concept", content="Detail [1]."), ], ), ] page = _make_page(sections, moment_indices=[0, 1]) result = SynthesisResult(pages=[page]) dumped = result.model_dump() restored = SynthesisResult.model_validate(dumped) assert len(restored.pages) == 1 restored_page = restored.pages[0] assert len(restored_page.body_sections) == 1 assert restored_page.body_sections[0].heading == "Overview" assert len(restored_page.body_sections[0].subsections) == 1 assert restored_page.body_sections_format == "v2"