chrysopedia/backend/pipeline/test_harness_v2_format.py
jlightner 44197f550c test: Updated test_harness.py word-count/section-count logic for list[B…
- "backend/pipeline/test_harness.py"
- "backend/pipeline/test_harness_v2_format.py"

GSD-Task: S01/T03
2026-04-03 00:54:27 +00:00

213 lines
7.7 KiB
Python

"""Tests for test_harness compatibility with v2 body_sections format.
Validates that word-counting and citation integration work correctly
with the list[BodySection] structure (v2) instead of the old dict format.
"""
from __future__ import annotations
import pytest
from pipeline.citation_utils import validate_citations
from pipeline.schemas import BodySection, BodySubSection, SynthesizedPage, SynthesisResult
# ── Helpers ──────────────────────────────────────────────────────────────────
def _make_page(
    body_sections: list[BodySection],
    moment_indices: list[int] | None = None,
    title: str = "Test Page",
    slug: str = "test-page",
) -> SynthesizedPage:
    """Build a ``SynthesizedPage`` fixture wrapping the given body sections.

    The topic category and summary are fixed placeholder strings. When
    ``moment_indices`` is ``None`` (or empty), an empty list is passed to
    the model instead.
    """
    indices = moment_indices if moment_indices else []
    page_fields = {
        "title": title,
        "slug": slug,
        "topic_category": "Testing",
        "summary": "A test page.",
        "body_sections": body_sections,
        "moment_indices": indices,
    }
    return SynthesizedPage(**page_fields)
def _count_words_v2(sections: list[BodySection]) -> int:
    """Replicate the word-counting logic from the updated test_harness.

    Adds the whitespace-separated word count of every section's ``content``
    to the word counts of each of its subsections' ``content``.
    """
    word_total = 0
    for section in sections:
        word_total += len(section.content.split())
        for child in section.subsections:
            word_total += len(child.content.split())
    return word_total
def _count_words_metadata(pages_dicts: list[dict]) -> int:
    """Replicate the metadata total_words logic (operates on dicts after model_dump).

    Walks each page's ``body_sections`` and tallies the words in every
    section's ``content`` plus the ``content`` of its ``subsections``.
    Missing keys are treated as empty (no words / no children).
    """
    word_total = 0
    for page in pages_dicts:
        for section in page.get("body_sections", []):
            word_total += len(section.get("content", "").split())
            for child in section.get("subsections", []):
                word_total += len(child.get("content", "").split())
    return word_total
# ── Word counting tests ─────────────────────────────────────────────────────
class TestWordCounting:
    """Word totals computed over v2 ``BodySection`` lists."""

    def test_flat_sections_no_subsections(self):
        """Sections without subsections contribute only their own words."""
        intro = BodySection(heading="Intro", content="one two three")
        details = BodySection(heading="Details", content="four five")
        # 3 words + 2 words
        assert _count_words_v2([intro, details]) == 5

    def test_sections_with_subsections(self):
        """Subsection words are added on top of the parent section's words."""
        children = [
            BodySubSection(heading="Sub A", content="gamma delta epsilon"),
            BodySubSection(heading="Sub B", content="zeta"),
        ]
        parent = BodySection(
            heading="Main",
            content="alpha beta",
            subsections=children,
        )
        # 2 (parent) + 3 (Sub A) + 1 (Sub B)
        assert _count_words_v2([parent]) == 6

    def test_empty_sections_list(self):
        """No sections means no words."""
        no_sections = []
        assert _count_words_v2(no_sections) == 0

    def test_section_with_empty_content(self):
        """An empty content string splits into zero words."""
        blank = BodySection(heading="Empty", content="")
        assert _count_words_v2([blank]) == 0

    def test_metadata_word_count_matches(self):
        """Metadata total_words (from model_dump dicts) matches Pydantic object counting."""
        first = BodySection(
            heading="H2",
            content="one two three",
            subsections=[
                BodySubSection(heading="H3", content="four five six seven"),
            ],
        )
        second = BodySection(heading="Another", content="eight nine")
        sections = [first, second]

        dumped_pages = [_make_page(sections, moment_indices=[0, 1]).model_dump()]

        # Both the object-based and the dict-based counters must agree on 9.
        assert _count_words_v2(sections) == 9
        assert _count_words_metadata(dumped_pages) == 9
# ── Section/subsection counting ─────────────────────────────────────────────
class TestSectionCounting:
    """Section and subsection tallies over a v2 ``BodySection`` list."""

    def test_section_and_subsection_counts(self):
        """Three sections holding 1, 0 and 2 subsections respectively."""
        section_a = BodySection(
            heading="A",
            content="text",
            subsections=[BodySubSection(heading="A.1", content="sub text")],
        )
        section_b = BodySection(heading="B", content="more text")
        section_c = BodySection(
            heading="C",
            content="even more",
            subsections=[
                BodySubSection(heading="C.1", content="sub1"),
                BodySubSection(heading="C.2", content="sub2"),
            ],
        )
        sections = [section_a, section_b, section_c]

        subsection_count = 0
        for section in sections:
            subsection_count += len(section.subsections)

        assert len(sections) == 3
        assert subsection_count == 3
# ── Citation integration ─────────────────────────────────────────────────────
class TestCitationIntegration:
    """Checks on the report ``validate_citations`` returns for v2 sections."""

    def test_full_coverage(self):
        """Every moment index 0..2 is cited at least once."""
        intro = BodySection(heading="Intro", content="First point [0]. Second point [1].")
        details = BodySection(heading="Details", content="More on [0] and [2].")
        report = validate_citations([intro, details], moment_count=3)
        assert report["valid"] is True
        assert report["coverage_pct"] == 100.0
        assert report["invalid_indices"] == []
        assert report["uncited_moments"] == []

    def test_partial_coverage(self):
        """Only one of three moments is cited."""
        only_first = BodySection(heading="Intro", content="Only cites [0].")
        report = validate_citations([only_first], moment_count=3)
        assert report["valid"] is False
        assert report["coverage_pct"] == pytest.approx(33.3, abs=0.1)
        assert report["uncited_moments"] == [1, 2]

    def test_invalid_index(self):
        """An index beyond the moment count is reported as invalid."""
        bad = BodySection(heading="Bad", content="Cites [0] and [99].")
        report = validate_citations([bad], moment_count=2)
        assert report["invalid_indices"] == [99]

    def test_citations_in_subsections(self):
        """Citations inside subsections are counted alongside the parent's."""
        child = BodySubSection(heading="Sub", content="Also [1] and [2].")
        parent = BodySection(
            heading="Main",
            content="See [0].",
            subsections=[child],
        )
        report = validate_citations([parent], moment_count=3)
        assert report["valid"] is True
        assert report["total_citations"] == 3

    def test_multi_citation_markers(self):
        """A single ``[0,1]`` marker counts as two citations."""
        combined = BodySection(heading="X", content="Both sources agree [0,1].")
        report = validate_citations([combined], moment_count=2)
        assert report["valid"] is True
        assert report["total_citations"] == 2

    def test_no_sections(self):
        """No sections and no moments: valid, with 0.0 coverage."""
        report = validate_citations([], moment_count=0)
        assert report["valid"] is True
        assert report["coverage_pct"] == 0.0
# ── End-to-end: SynthesisResult with v2 body_sections ───────────────────────
class TestSynthesisResultV2:
    """End-to-end serialization checks for ``SynthesisResult`` with v2 sections."""

    def test_round_trip_model_dump(self):
        """SynthesisResult with v2 body_sections round-trips through model_dump/validate."""
        key_concept = BodySubSection(heading="Key Concept", content="Detail [1].")
        overview = BodySection(
            heading="Overview",
            content="This technique [0] is fundamental.",
            subsections=[key_concept],
        )
        original = SynthesisResult(
            pages=[_make_page([overview], moment_indices=[0, 1])],
        )

        # Dump to plain data, then rebuild the model from that data.
        restored = SynthesisResult.model_validate(original.model_dump())

        assert len(restored.pages) == 1
        restored_page = restored.pages[0]
        restored_sections = restored_page.body_sections
        assert len(restored_sections) == 1
        assert restored_sections[0].heading == "Overview"
        assert len(restored_sections[0].subsections) == 1
        assert restored_page.body_sections_format == "v2"