"""Unit tests for per-section embedding in stage 6.

Tests _slugify_heading, section embed text construction, delete-before-upsert
ordering, v1 page skipping, upsert payload correctness, and deterministic UUIDs.
"""

from __future__ import annotations

import uuid
from unittest.mock import MagicMock, call, patch

import pytest

from pipeline.stages import _slugify_heading


# ── slugify tests ────────────────────────────────────────────────────────────


class TestSlugifyHeading:
    """Verify _slugify_heading matches frontend TableOfContents.tsx slugify."""

    def test_simple_heading(self):
        assert _slugify_heading("Grain Position Control") == "grain-position-control"

    def test_ampersand_and_special_chars(self):
        # Consecutive non-alphanumeric chars collapse to a single hyphen
        assert _slugify_heading("LFO Routing & Modulation") == "lfo-routing-modulation"

    def test_leading_trailing_special(self):
        assert _slugify_heading(" —Hello World! ") == "hello-world"

    def test_numbers_preserved(self):
        assert _slugify_heading("Step 1: Setup") == "step-1-setup"

    def test_empty_string(self):
        assert _slugify_heading("") == ""

    def test_only_special_chars(self):
        assert _slugify_heading("!@#$%") == ""

    def test_unicode_stripped(self):
        # Non-ASCII letters are dropped rather than transliterated.
        assert _slugify_heading("Café Sounds") == "caf-sounds"

    def test_multiple_hyphens_collapse(self):
        assert _slugify_heading("A -- B --- C") == "a-b-c"


# ── Deterministic UUID tests ─────────────────────────────────────────────────

# Namespace used to derive section point IDs; test_upsert_builds_correct_payloads
# checks that the manager produces IDs derived from this same value.
_QDRANT_NAMESPACE = uuid.UUID("a1b2c3d4-e5f6-7890-abcd-ef1234567890")


class TestDeterministicUUIDs:
    """Verify same page+section always produces the same point ID."""

    def test_same_input_same_uuid(self):
        id1 = str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:page-abc:grain-position-control"))
        id2 = str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:page-abc:grain-position-control"))
        assert id1 == id2

    def test_different_section_different_uuid(self):
        id1 = str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:page-abc:section-a"))
        id2 = str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:page-abc:section-b"))
        assert id1 != id2


# ── QdrantManager section methods ────────────────────────────────────────────


class TestQdrantManagerSections:
    """Test upsert_technique_sections and delete_sections_by_page_id."""

    def _make_manager(self):
        """Create a QdrantManager with a mocked client.

        Returns:
            A ``(manager, mock_client)`` tuple; ``manager._client`` is the
            returned mock, so calls can be inspected after the patch exits.
        """
        with patch("pipeline.qdrant_client.QdrantClient") as MockClient:
            mock_client = MockClient.return_value
            # Imported inside the patch so construction sees the mocked client.
            from pipeline.qdrant_client import QdrantManager

            settings = MagicMock()
            settings.qdrant_url = "http://localhost:6333"
            settings.qdrant_collection = "test_collection"
            settings.embedding_dimensions = 768
            mgr = QdrantManager(settings)
            mgr._client = mock_client
            return mgr, mock_client

    def test_upsert_builds_correct_payloads(self):
        mgr, mock_client = self._make_manager()
        sections = [
            {
                "page_id": "p1",
                "creator_id": "c1",
                "creator_name": "Keota",
                "title": "Granular Synthesis",
                "slug": "granular-synthesis",
                "section_heading": "Grain Position Control",
                "section_anchor": "grain-position-control",
                "topic_category": "Sound Design",
                "topic_tags": ["granular", "synthesis"],
                "summary": "Control the grain position parameter.",
            },
        ]
        vectors = [[0.1] * 768]

        mgr.upsert_technique_sections(sections, vectors)

        # Verify upsert was called
        assert mock_client.upsert.called
        points = mock_client.upsert.call_args[1]["points"]
        assert len(points) == 1

        payload = points[0].payload
        assert payload["type"] == "technique_section"
        assert payload["page_id"] == "p1"
        assert payload["section_heading"] == "Grain Position Control"
        assert payload["section_anchor"] == "grain-position-control"
        assert payload["slug"] == "granular-synthesis"

        # Verify deterministic UUID
        expected_id = str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:p1:grain-position-control"))
        assert points[0].id == expected_id

    def test_upsert_count_mismatch_skips(self):
        mgr, mock_client = self._make_manager()
        mgr.upsert_technique_sections([{"page_id": "p1"}], [[0.1], [0.2]])
        assert not mock_client.upsert.called

    def test_upsert_empty_list_skips(self):
        mgr, mock_client = self._make_manager()
        mgr.upsert_technique_sections([], [])
        assert not mock_client.upsert.called

    def test_summary_truncated_to_200_chars(self):
        mgr, mock_client = self._make_manager()
        long_summary = "x" * 500
        sections = [
            {
                "page_id": "p1",
                "section_heading": "H",
                "section_anchor": "h",
                "summary": long_summary,
            }
        ]
        vectors = [[0.1] * 768]

        mgr.upsert_technique_sections(sections, vectors)

        payload = mock_client.upsert.call_args[1]["points"][0].payload
        assert len(payload["summary"]) == 200

    def test_delete_sections_by_page_id(self):
        mgr, mock_client = self._make_manager()
        mgr.delete_sections_by_page_id("p1")

        assert mock_client.delete.called
        filter_arg = mock_client.delete.call_args[1]["points_selector"]
        # Verify filter has both page_id and type conditions
        must_conditions = filter_arg.must
        assert len(must_conditions) == 2
        keys = {c.key for c in must_conditions}
        assert keys == {"page_id", "type"}

    def test_delete_sections_logs_on_failure(self):
        mgr, mock_client = self._make_manager()
        mock_client.delete.side_effect = Exception("connection refused")

        # Should not raise
        mgr.delete_sections_by_page_id("p1")

        # Guard against a vacuous pass: the failing client call must actually
        # have been attempted (the mock records the call before raising).
        assert mock_client.delete.called


# ── Stage 6 section embedding logic ─────────────────────────────────────────


def _build_section_embed_text(creator_name: str, page_title: str, section: dict) -> str:
    """Build the embed text for one section, as these tests reproduce it.

    The result is ``"<creator> <title> — <heading>: <content> <subsections>"``
    with surrounding whitespace stripped. Each dict subsection contributes
    ``"<heading>: <content>"`` when it has a heading, or just its content
    otherwise; non-dict subsections and fully empty ones are ignored.

    Args:
        creator_name: Creator display name used as the text prefix.
        page_title: Title of the page that owns the section.
        section: Section dict with a required ``"heading"`` key and optional
            ``"content"`` and ``"subsections"`` keys.

    Returns:
        The assembled embed text.
    """
    heading = section["heading"]
    section_content = section.get("content", "")

    subsection_parts = []
    for sub in section.get("subsections", []):
        if not isinstance(sub, dict):
            continue
        sub_heading = sub.get("heading", "")
        sub_content = sub.get("content", "")
        if sub_heading:
            subsection_parts.append(f"{sub_heading}: {sub_content}")
        elif sub_content:
            subsection_parts.append(sub_content)

    return (
        f"{creator_name} {page_title} — {heading}: "
        f"{section_content} {' '.join(subsection_parts)}"
    ).strip()


class TestStage6SectionEmbedding:
    """Test the section embedding block within stage6_embed_and_index.

    Uses mocked DB, embedding client, and QdrantManager to verify:
    - v2 pages produce section points
    - v1 pages are skipped
    - delete is called before upsert
    - embed text includes creator/page/section context
    - sections with empty headings are skipped
    - subsection content is included in embed text
    """

    def _make_page(
        self,
        page_id="p1",
        creator_id="c1",
        format_="v2",
        body_sections=None,
        title="Granular Synthesis",
        slug="granular-synthesis",
    ):
        """Create a mock TechniquePage-like object."""
        page = MagicMock()
        page.id = page_id
        page.creator_id = creator_id
        page.body_sections_format = format_
        page.body_sections = body_sections
        page.title = title
        page.slug = slug
        page.topic_category = "Sound Design"
        page.topic_tags = ["granular"]
        page.summary = "Page summary"
        return page

    def test_v1_page_produces_zero_sections(self):
        """Pages with body_sections_format != 'v2' should be skipped."""
        page = self._make_page(
            format_="v1",
            body_sections=[
                {"heading": "Section A", "content": "Content A"},
            ],
        )
        v2_pages = [
            p for p in [page] if getattr(p, "body_sections_format", "v1") == "v2"
        ]
        assert len(v2_pages) == 0

    def test_v2_page_none_body_sections(self):
        """Page with body_sections=None → skipped (not a list)."""
        page = self._make_page(format_="v2", body_sections=None)
        v2_pages = [
            p for p in [page] if getattr(p, "body_sections_format", "v1") == "v2"
        ]
        assert len(v2_pages) == 1
        # body_sections is None → not a list → skipped in the loop
        assert not isinstance(page.body_sections, list)

    def test_section_empty_heading_skipped(self):
        """Sections with empty heading should be skipped."""
        page = self._make_page(
            body_sections=[
                {"heading": "", "content": "Orphan content"},
                {"heading": "Valid", "content": "Real content"},
            ]
        )
        sections_with_heading = [
            s
            for s in page.body_sections
            if isinstance(s, dict) and s.get("heading", "").strip()
        ]
        assert len(sections_with_heading) == 1
        assert sections_with_heading[0]["heading"] == "Valid"

    def test_subsection_content_included_in_embed_text(self):
        """Section with subsections should include subsection content."""
        section = {
            "heading": "Grain Position Control",
            "content": "Main content",
            "subsections": [
                {"heading": "Fine Tuning", "content": "Fine tune the position."},
                {"heading": "Automation", "content": "Automate grain pos."},
            ],
        }

        # Reproduce the embed text construction from stage 6
        embed_text = _build_section_embed_text("Keota", "Granular Synthesis", section)

        assert "Fine Tuning: Fine tune the position." in embed_text
        assert "Automation: Automate grain pos." in embed_text
        assert "Keota Granular Synthesis" in embed_text

    def test_subsection_no_direct_content(self):
        """Section with subsections but no direct content still embeds subsection text."""
        section = {
            "heading": "Advanced Techniques",
            "content": "",
            "subsections": [
                {"heading": "Sub A", "content": "Content A"},
            ],
        }

        embed_text = _build_section_embed_text("Creator", "Page", section)

        assert "Sub A: Content A" in embed_text

    def test_delete_called_before_upsert_ordering(self):
        """Verify delete_sections_by_page_id is called before upsert_technique_sections.

        This test re-enacts the section embedding loop against mocks rather
        than calling stage 6 itself, and records the order of the two calls.
        """
        call_order = []

        mock_qdrant = MagicMock()
        mock_qdrant.delete_sections_by_page_id.side_effect = (
            lambda pid: call_order.append(("delete", pid))
        )
        mock_qdrant.upsert_technique_sections.side_effect = (
            lambda s, v: call_order.append(("upsert", len(s)))
        )

        mock_embed = MagicMock()
        mock_embed.embed.return_value = [[0.1] * 768]  # One vector

        page = self._make_page(
            body_sections=[
                {"heading": "Section A", "content": "Content A"},
            ]
        )
        creator_map = {str(page.creator_id): "TestCreator"}
        v2_pages = [page]
        page_id_str = str(page.id)

        # Simulate the section embedding block
        for p in v2_pages:
            body_sections = p.body_sections
            if not isinstance(body_sections, list):
                continue

            creator_name = creator_map.get(str(p.creator_id), "")
            mock_qdrant.delete_sections_by_page_id(str(p.id))

            section_texts = []
            section_dicts = []
            for section in body_sections:
                if not isinstance(section, dict):
                    continue
                heading = section.get("heading", "")
                if not heading or not heading.strip():
                    continue
                section_anchor = _slugify_heading(heading)
                section_texts.append(f"{creator_name} {p.title} — {heading}")
                section_dicts.append(
                    {"page_id": str(p.id), "section_anchor": section_anchor}
                )

            if section_texts:
                vectors = mock_embed.embed(section_texts)
                if vectors:
                    mock_qdrant.upsert_technique_sections(section_dicts, vectors)

        # Compare the whole sequence: a missing call fails with a readable diff
        # instead of an IndexError, and the recorded arguments are checked too.
        assert call_order == [("delete", page_id_str), ("upsert", 1)]