chrysopedia/backend/pipeline/test_section_embedding.py
jlightner 57b8705e26 feat: Added per-section embedding to stage 6 for v2 technique pages wit…
- "backend/schemas.py"
- "backend/pipeline/stages.py"
- "backend/pipeline/qdrant_client.py"
- "backend/search_service.py"
- "backend/pipeline/test_section_embedding.py"

GSD-Task: S07/T01
2026-04-03 02:12:56 +00:00

328 lines
13 KiB
Python

"""Unit tests for per-section embedding in stage 6.
Tests _slugify_heading, section embed text construction, delete-before-upsert
ordering, v1 page skipping, upsert payload correctness, and deterministic UUIDs.
"""
from __future__ import annotations
import uuid
from unittest.mock import MagicMock, call, patch
import pytest
# ── slugify tests ────────────────────────────────────────────────────────────
from pipeline.stages import _slugify_heading
class TestSlugifyHeading:
    """Pin the anchor slugs that ``_slugify_heading`` produces for headings.

    Per the original author's note, these expectations are meant to match the
    frontend ``TableOfContents.tsx`` slugify (that file is not visible here).
    """

    def test_simple_heading(self):
        """Plain words are lower-cased and hyphen-joined."""
        slug = _slugify_heading("Grain Position Control")
        assert slug == "grain-position-control"

    def test_ampersand_and_special_chars(self):
        """A run of non-alphanumeric characters becomes one hyphen."""
        slug = _slugify_heading("LFO Routing & Modulation")
        assert slug == "lfo-routing-modulation"

    def test_leading_trailing_special(self):
        """Separators at either end of the heading leave no stray hyphens."""
        slug = _slugify_heading(" —Hello World! ")
        assert slug == "hello-world"

    def test_numbers_preserved(self):
        """Digits survive slugification."""
        slug = _slugify_heading("Step 1: Setup")
        assert slug == "step-1-setup"

    def test_empty_string(self):
        """An empty heading yields an empty slug."""
        slug = _slugify_heading("")
        assert slug == ""

    def test_only_special_chars(self):
        """A heading with no alphanumerics yields an empty slug."""
        slug = _slugify_heading("!@#$%")
        assert slug == ""

    def test_unicode_stripped(self):
        """Non-ASCII letters are expected to be dropped, not transliterated."""
        slug = _slugify_heading("Café Sounds")
        assert slug == "caf-sounds"

    def test_multiple_hyphens_collapse(self):
        """Repeated hyphens and spaces collapse into single hyphens."""
        slug = _slugify_heading("A -- B --- C")
        assert slug == "a-b-c"
# ── Deterministic UUID tests ─────────────────────────────────────────────────
# Namespace for the uuid5-derived point IDs these tests expect.
# test_upsert_builds_correct_payloads asserts that the ID QdrantManager assigns
# equals uuid5(<this namespace>, "ts:<page_id>:<section_anchor>").
_QDRANT_NAMESPACE = uuid.UUID("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
class TestDeterministicUUIDs:
    """Check that uuid5 point IDs depend only on the page/section key."""

    def test_same_input_same_uuid(self):
        """Hashing the same key twice yields the same ID string."""
        key = "ts:page-abc:grain-position-control"
        first, second = (
            str(uuid.uuid5(_QDRANT_NAMESPACE, key)) for _ in range(2)
        )
        assert first == second

    def test_different_section_different_uuid(self):
        """Two sections of one page must not share an ID."""
        distinct_ids = {
            str(uuid.uuid5(_QDRANT_NAMESPACE, f"ts:page-abc:{anchor}"))
            for anchor in ("section-a", "section-b")
        }
        # A set of size 2 means the two IDs differ.
        assert len(distinct_ids) == 2
# ── QdrantManager section methods ────────────────────────────────────────────
class TestQdrantManagerSections:
    """Exercise QdrantManager's section upsert/delete against a mocked client."""

    def _make_manager(self):
        """Build a QdrantManager whose Qdrant client is a mock.

        Returns:
            A ``(manager, mock_client)`` tuple; ``mock_client`` is the object
            installed as ``manager._client``.
        """
        fake_settings = MagicMock(
            qdrant_url="http://localhost:6333",
            qdrant_collection="test_collection",
            embedding_dimensions=768,
        )
        with patch("pipeline.qdrant_client.QdrantClient") as client_cls:
            from pipeline.qdrant_client import QdrantManager

            fake_client = client_cls.return_value
            manager = QdrantManager(fake_settings)
            # Pin the client explicitly so the tests do not depend on how the
            # constructor stores it.
            manager._client = fake_client
            return manager, fake_client

    def test_upsert_builds_correct_payloads(self):
        """One section in → one point out, with the expected payload and ID."""
        manager, client = self._make_manager()
        section = {
            "page_id": "p1",
            "creator_id": "c1",
            "creator_name": "Keota",
            "title": "Granular Synthesis",
            "slug": "granular-synthesis",
            "section_heading": "Grain Position Control",
            "section_anchor": "grain-position-control",
            "topic_category": "Sound Design",
            "topic_tags": ["granular", "synthesis"],
            "summary": "Control the grain position parameter.",
        }

        manager.upsert_technique_sections([section], [[0.1] * 768])

        assert client.upsert.called
        _, upsert_kwargs = client.upsert.call_args
        points = upsert_kwargs["points"]
        assert len(points) == 1

        point = points[0]
        expected_fields = (
            ("type", "technique_section"),
            ("page_id", "p1"),
            ("section_heading", "Grain Position Control"),
            ("section_anchor", "grain-position-control"),
            ("slug", "granular-synthesis"),
        )
        for field, value in expected_fields:
            assert point.payload[field] == value

        # The point ID must be the deterministic uuid5 of page + anchor.
        key = "ts:p1:grain-position-control"
        assert point.id == str(uuid.uuid5(_QDRANT_NAMESPACE, key))

    def test_upsert_count_mismatch_skips(self):
        """A section/vector count mismatch must not reach the client."""
        manager, client = self._make_manager()
        one_section = [{"page_id": "p1"}]
        two_vectors = [[0.1], [0.2]]
        manager.upsert_technique_sections(one_section, two_vectors)
        assert not client.upsert.called

    def test_upsert_empty_list_skips(self):
        """Empty input must not reach the client."""
        manager, client = self._make_manager()
        manager.upsert_technique_sections([], [])
        assert not client.upsert.called

    def test_summary_truncated_to_200_chars(self):
        """A 500-character summary is stored as 200 characters."""
        manager, client = self._make_manager()
        section = {
            "page_id": "p1",
            "section_heading": "H",
            "section_anchor": "h",
            "summary": "x" * 500,
        }

        manager.upsert_technique_sections([section], [[0.1] * 768])

        _, upsert_kwargs = client.upsert.call_args
        stored_summary = upsert_kwargs["points"][0].payload["summary"]
        assert len(stored_summary) == 200

    def test_delete_sections_by_page_id(self):
        """The delete filter carries exactly a page_id and a type condition."""
        manager, client = self._make_manager()
        manager.delete_sections_by_page_id("p1")

        assert client.delete.called
        _, delete_kwargs = client.delete.call_args
        conditions = delete_kwargs["points_selector"].must
        assert len(conditions) == 2
        assert {cond.key for cond in conditions} == {"page_id", "type"}

    def test_delete_sections_logs_on_failure(self):
        """A client error during delete is swallowed, not raised."""
        manager, client = self._make_manager()
        client.delete.side_effect = Exception("connection refused")
        # Passing means no exception escaped; log output is not inspected.
        manager.delete_sections_by_page_id("p1")
# ── Stage 6 section embedding logic ─────────────────────────────────────────
class TestStage6SectionEmbedding:
    """Test the section embedding logic of stage6_embed_and_index.

    These tests do not invoke stage 6 itself; they mirror its section
    embedding steps locally with mocked page, embedding client, and Qdrant
    manager objects, and verify:
    - v2 pages are selected for section points
    - v1 pages are skipped
    - delete is called before upsert
    - embed text includes creator/page/section context
    - sections with empty headings are skipped
    - subsection content is included in embed text

    NOTE(review): because the logic is mirrored rather than imported, it must
    be kept in sync with pipeline.stages by hand.
    """

    def _make_page(self, page_id="p1", creator_id="c1", format_="v2",
                   body_sections=None, title="Granular Synthesis",
                   slug="granular-synthesis"):
        """Create a mock TechniquePage-like object.

        Args:
            page_id: Value for ``page.id``.
            creator_id: Value for ``page.creator_id``.
            format_: Value for ``page.body_sections_format``.
            body_sections: Value for ``page.body_sections`` (may be None).
            title: Value for ``page.title``.
            slug: Value for ``page.slug``.

        Returns:
            A MagicMock carrying the attributes above plus fixed
            ``topic_category``, ``topic_tags`` and ``summary`` values.
        """
        page = MagicMock()
        page.id = page_id
        page.creator_id = creator_id
        page.body_sections_format = format_
        page.body_sections = body_sections
        page.title = title
        page.slug = slug
        page.topic_category = "Sound Design"
        page.topic_tags = ["granular"]
        page.summary = "Page summary"
        return page

    @staticmethod
    def _build_embed_text(creator_name, page_title, section):
        """Assemble the text embedded for one section.

        Intended to reproduce the embed text construction from stage 6
        (verify against pipeline.stages when that code changes).

        Args:
            creator_name: Creator display name placed first in the text.
            page_title: Title of the page the section belongs to.
            section: Section dict with a ``heading`` key and optional
                ``content`` and ``subsections`` keys.

        Returns:
            ``"<creator> <title> — <heading>: <content> <subsections>"`` with
            surrounding whitespace stripped.
        """
        heading = section["heading"]
        section_content = section.get("content", "")
        subsection_parts = []
        for sub in section.get("subsections", []):
            if not isinstance(sub, dict):
                continue
            sub_heading = sub.get("heading", "")
            sub_content = sub.get("content", "")
            if sub_heading:
                subsection_parts.append(f"{sub_heading}: {sub_content}")
            elif sub_content:
                # Heading-less subsections contribute their content alone.
                subsection_parts.append(sub_content)
        return (
            f"{creator_name} {page_title} — {heading}: "
            f"{section_content} {' '.join(subsection_parts)}"
        ).strip()

    def test_v1_page_produces_zero_sections(self):
        """Pages with body_sections_format != 'v2' should be skipped."""
        page = self._make_page(format_="v1", body_sections=[
            {"heading": "Section A", "content": "Content A"},
        ])
        v2_pages = [
            p for p in [page]
            if getattr(p, "body_sections_format", "v1") == "v2"
        ]
        assert len(v2_pages) == 0

    def test_v2_page_none_body_sections(self):
        """Page with body_sections=None → skipped (not a list)."""
        page = self._make_page(format_="v2", body_sections=None)
        v2_pages = [
            p for p in [page]
            if getattr(p, "body_sections_format", "v1") == "v2"
        ]
        assert len(v2_pages) == 1
        # body_sections is None → not a list → skipped in the loop
        assert not isinstance(page.body_sections, list)

    def test_section_empty_heading_skipped(self):
        """Sections with empty heading should be skipped."""
        page = self._make_page(body_sections=[
            {"heading": "", "content": "Orphan content"},
            {"heading": "Valid", "content": "Real content"},
        ])
        sections_with_heading = [
            s for s in page.body_sections
            if isinstance(s, dict) and s.get("heading", "").strip()
        ]
        assert len(sections_with_heading) == 1
        assert sections_with_heading[0]["heading"] == "Valid"

    def test_subsection_content_included_in_embed_text(self):
        """Section with subsections should include subsection content."""
        section = {
            "heading": "Grain Position Control",
            "content": "Main content",
            "subsections": [
                {"heading": "Fine Tuning", "content": "Fine tune the position."},
                {"heading": "Automation", "content": "Automate grain pos."},
            ],
        }
        embed_text = self._build_embed_text(
            "Keota", "Granular Synthesis", section
        )
        assert "Fine Tuning: Fine tune the position." in embed_text
        assert "Automation: Automate grain pos." in embed_text
        assert "Keota Granular Synthesis" in embed_text
        # Title and heading must be separated, not fused together.
        assert "Granular Synthesis — Grain Position Control: " in embed_text

    def test_subsection_no_direct_content(self):
        """Section with subsections but no direct content still embeds subsection text."""
        section = {
            "heading": "Advanced Techniques",
            "content": "",
            "subsections": [
                {"heading": "Sub A", "content": "Content A"},
            ],
        }
        embed_text = self._build_embed_text("Creator", "Page", section)
        assert "Sub A: Content A" in embed_text

    def test_delete_called_before_upsert_ordering(self):
        """Verify delete_sections_by_page_id is called before upsert_technique_sections."""
        call_order = []
        mock_qdrant = MagicMock()
        mock_qdrant.delete_sections_by_page_id.side_effect = (
            lambda pid: call_order.append(("delete", pid))
        )
        mock_qdrant.upsert_technique_sections.side_effect = (
            lambda s, v: call_order.append(("upsert", len(s)))
        )
        mock_embed = MagicMock()
        mock_embed.embed.return_value = [[0.1] * 768]  # One vector
        page = self._make_page(body_sections=[
            {"heading": "Section A", "content": "Content A"},
        ])
        creator_map = {str(page.creator_id): "TestCreator"}
        v2_pages = [page]

        # Simulate the section embedding block
        for p in v2_pages:
            body_sections = p.body_sections
            if not isinstance(body_sections, list):
                continue
            creator_name = creator_map.get(str(p.creator_id), "")
            # Stale section points are removed before new ones are written.
            mock_qdrant.delete_sections_by_page_id(str(p.id))
            section_texts = []
            section_dicts = []
            for section in body_sections:
                if not isinstance(section, dict):
                    continue
                heading = section.get("heading", "")
                if not heading or not heading.strip():
                    continue
                section_anchor = _slugify_heading(heading)
                section_texts.append(f"{creator_name} {p.title} — {heading}")
                section_dicts.append(
                    {"page_id": str(p.id), "section_anchor": section_anchor}
                )
            if section_texts:
                vectors = mock_embed.embed(section_texts)
                if vectors:
                    mock_qdrant.upsert_technique_sections(section_dicts, vectors)

        # Exactly one delete for the page, followed by one single-section upsert.
        assert call_order == [("delete", "p1"), ("upsert", 1)]