- "backend/schemas.py" - "backend/pipeline/stages.py" - "backend/pipeline/qdrant_client.py" - "backend/search_service.py" - "backend/pipeline/test_section_embedding.py" GSD-Task: S07/T01
328 lines
13 KiB
Python
328 lines
13 KiB
Python
"""Unit tests for per-section embedding in stage 6.

Tests _slugify_heading, section embed text construction, delete-before-upsert
ordering, v1 page skipping, upsert payload correctness, and deterministic UUIDs.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from unittest.mock import MagicMock, call, patch
|
|
|
|
import pytest
|
|
|
|
# ── slugify tests ────────────────────────────────────────────────────────────
|
|
|
|
from pipeline.stages import _slugify_heading
|
|
|
|
|
|
class TestSlugifyHeading:
    """Check that _slugify_heading agrees with the frontend TableOfContents.tsx slugify."""

    def test_simple_heading(self):
        """Plain words are lower-cased and joined with hyphens."""
        slug = _slugify_heading("Grain Position Control")
        assert slug == "grain-position-control"

    def test_ampersand_and_special_chars(self):
        """A run of consecutive non-alphanumeric characters yields one hyphen."""
        slug = _slugify_heading("LFO Routing & Modulation")
        assert slug == "lfo-routing-modulation"

    def test_leading_trailing_special(self):
        """Separators at the start or end of the heading leave no hyphen behind."""
        slug = _slugify_heading(" —Hello World! ")
        assert slug == "hello-world"

    def test_numbers_preserved(self):
        """Digits survive slugification."""
        slug = _slugify_heading("Step 1: Setup")
        assert slug == "step-1-setup"

    def test_empty_string(self):
        """An empty heading maps to an empty slug."""
        slug = _slugify_heading("")
        assert slug == ""

    def test_only_special_chars(self):
        """A heading with no alphanumerics maps to an empty slug."""
        slug = _slugify_heading("!@#$%")
        assert slug == ""

    def test_unicode_stripped(self):
        """The accented character is dropped, not transliterated."""
        slug = _slugify_heading("Café Sounds")
        assert slug == "caf-sounds"

    def test_multiple_hyphens_collapse(self):
        """Repeated hyphens and spaces between words collapse to one hyphen."""
        slug = _slugify_heading("A -- B --- C")
        assert slug == "a-b-c"
|
|
|
|
|
|
# ── Deterministic UUID tests ─────────────────────────────────────────────────
|
|
|
|
# Namespace UUID used as the uuid5 seed when these tests compute expected point IDs.
# NOTE(review): presumably duplicates the namespace used inside
# pipeline.qdrant_client — confirm the two values are kept in sync.
_QDRANT_NAMESPACE = uuid.UUID("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
|
|
|
|
|
|
class TestDeterministicUUIDs:
    """Check that a page+section key always maps to the same uuid5 point ID."""

    def test_same_input_same_uuid(self):
        """Hashing the same key twice yields identical IDs."""
        first, second = (
            str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:page-abc:grain-position-control")),
            str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:page-abc:grain-position-control")),
        )
        assert first == second

    def test_different_section_different_uuid(self):
        """Keys that differ only in the section anchor yield distinct IDs."""
        for_section_a = str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:page-abc:section-a"))
        for_section_b = str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:page-abc:section-b"))
        assert for_section_a != for_section_b
|
|
|
|
|
|
# ── QdrantManager section methods ────────────────────────────────────────────
|
|
|
|
|
|
class TestQdrantManagerSections:
    """Cover QdrantManager.upsert_technique_sections and delete_sections_by_page_id."""

    def _make_manager(self):
        """Return ``(manager, client_mock)`` with the Qdrant client class patched out."""
        fake_settings = MagicMock(
            qdrant_url="http://localhost:6333",
            qdrant_collection="test_collection",
            embedding_dimensions=768,
        )
        with patch("pipeline.qdrant_client.QdrantClient") as client_cls:
            # Imported while the patch is active so construction sees the mock class.
            from pipeline.qdrant_client import QdrantManager

            manager = QdrantManager(fake_settings)
            client_mock = client_cls.return_value
            # Pin the private client handle to the mock the tests inspect.
            manager._client = client_mock
            return manager, client_mock

    def test_upsert_builds_correct_payloads(self):
        """One section + one vector produces one point with the expected payload and ID."""
        manager, client_mock = self._make_manager()
        section = {
            "page_id": "p1",
            "creator_id": "c1",
            "creator_name": "Keota",
            "title": "Granular Synthesis",
            "slug": "granular-synthesis",
            "section_heading": "Grain Position Control",
            "section_anchor": "grain-position-control",
            "topic_category": "Sound Design",
            "topic_tags": ["granular", "synthesis"],
            "summary": "Control the grain position parameter.",
        }

        manager.upsert_technique_sections([section], [[0.1] * 768])

        # The client must have been asked to upsert exactly one point.
        client_mock.upsert.assert_called()
        _, upsert_kwargs = client_mock.upsert.call_args
        points = upsert_kwargs["points"]
        assert len(points) == 1

        payload = points[0].payload
        expected_fields = {
            "type": "technique_section",
            "page_id": "p1",
            "section_heading": "Grain Position Control",
            "section_anchor": "grain-position-control",
            "slug": "granular-synthesis",
        }
        for field, value in expected_fields.items():
            assert payload[field] == value

        # The point ID is the uuid5 of "ts:<page_id>:<section_anchor>".
        expected_id = str(uuid.uuid5(_QDRANT_NAMESPACE, "ts:p1:grain-position-control"))
        assert points[0].id == expected_id

    def test_upsert_count_mismatch_skips(self):
        """One section paired with two vectors must not reach the client."""
        manager, client_mock = self._make_manager()
        manager.upsert_technique_sections([{"page_id": "p1"}], [[0.1], [0.2]])
        client_mock.upsert.assert_not_called()

    def test_upsert_empty_list_skips(self):
        """Empty inputs must not reach the client."""
        manager, client_mock = self._make_manager()
        manager.upsert_technique_sections([], [])
        client_mock.upsert.assert_not_called()

    def test_summary_truncated_to_200_chars(self):
        """A 500-character summary is stored as 200 characters in the payload."""
        manager, client_mock = self._make_manager()
        long_summary = "x" * 500
        section = {
            "page_id": "p1",
            "section_heading": "H",
            "section_anchor": "h",
            "summary": long_summary,
        }
        manager.upsert_technique_sections([section], [[0.1] * 768])
        _, upsert_kwargs = client_mock.upsert.call_args
        stored_summary = upsert_kwargs["points"][0].payload["summary"]
        assert len(stored_summary) == 200

    def test_delete_sections_by_page_id(self):
        """The delete filter carries exactly a page_id and a type condition."""
        manager, client_mock = self._make_manager()
        manager.delete_sections_by_page_id("p1")
        client_mock.delete.assert_called()
        _, delete_kwargs = client_mock.delete.call_args
        conditions = delete_kwargs["points_selector"].must
        assert len(conditions) == 2
        assert {cond.key for cond in conditions} == {"page_id", "type"}

    def test_delete_sections_logs_on_failure(self):
        """A client failure during delete must not propagate to the caller.

        Only the absence of an exception is checked here; log output is not
        asserted.
        """
        manager, client_mock = self._make_manager()
        client_mock.delete.side_effect = Exception("connection refused")
        manager.delete_sections_by_page_id("p1")
|
|
|
|
|
|
# ── Stage 6 section embedding logic ─────────────────────────────────────────
|
|
|
|
class TestStage6SectionEmbedding:
    """Test the section embedding rules used within stage6_embed_and_index.

    The tests mirror the stage-6 logic inline, using mock page, embedding
    client, and QdrantManager objects, rather than invoking the stage itself.
    NOTE(review): because the logic is mirrored, these checks can drift from
    the production code — confirm they are kept in sync with stage 6.

    Verified rules:
    - v2 pages are selected for section points
    - v1 pages are skipped
    - delete is called before upsert
    - embed text includes creator/page/section context
    - sections with empty headings are skipped
    - subsection content is included in embed text
    """

    def _make_page(self, page_id="p1", creator_id="c1", format_="v2",
                   body_sections=None, title="Granular Synthesis",
                   slug="granular-synthesis"):
        """Create a mock TechniquePage-like object.

        Args:
            page_id: Value stored on ``page.id``.
            creator_id: Value stored on ``page.creator_id``.
            format_: Value stored on ``page.body_sections_format``.
            body_sections: Value stored on ``page.body_sections`` (may be None).
            title: Value stored on ``page.title``.
            slug: Value stored on ``page.slug``.

        Returns:
            A MagicMock carrying the attributes above plus fixed
            ``topic_category``, ``topic_tags`` and ``summary`` values.
        """
        page = MagicMock()
        page.id = page_id
        page.creator_id = creator_id
        page.body_sections_format = format_
        page.body_sections = body_sections
        page.title = title
        page.slug = slug
        page.topic_category = "Sound Design"
        page.topic_tags = ["granular"]
        page.summary = "Page summary"
        return page

    @staticmethod
    def _build_embed_text(creator_name, page_title, section):
        """Build the embed text for one section the way these tests mirror stage 6.

        Args:
            creator_name: Creator display name placed at the start of the text.
            page_title: Page title placed after the creator name.
            section: Section dict with ``heading`` and optional ``content`` and
                ``subsections`` (a list; non-dict entries are ignored).

        Returns:
            ``"<creator> <title> — <heading>: <content> <subsections>"`` with
            surrounding whitespace stripped. Each subsection contributes
            ``"<heading>: <content>"`` when it has a heading, otherwise just
            its content when that is non-empty.
        """
        heading = section["heading"]
        section_content = section.get("content", "")
        subsection_parts = []
        for sub in section.get("subsections", []):
            if not isinstance(sub, dict):
                continue
            sub_heading = sub.get("heading", "")
            sub_content = sub.get("content", "")
            if sub_heading:
                subsection_parts.append(f"{sub_heading}: {sub_content}")
            elif sub_content:
                subsection_parts.append(sub_content)
        return (
            f"{creator_name} {page_title} — {heading}: "
            f"{section_content} {' '.join(subsection_parts)}"
        ).strip()

    def test_v1_page_produces_zero_sections(self):
        """Pages with body_sections_format != 'v2' should be skipped."""
        page = self._make_page(format_="v1", body_sections=[
            {"heading": "Section A", "content": "Content A"},
        ])
        v2_pages = [p for p in [page] if getattr(p, "body_sections_format", "v1") == "v2"]
        assert len(v2_pages) == 0

    def test_v2_page_none_body_sections(self):
        """Page with body_sections=None → skipped (not a list)."""
        page = self._make_page(format_="v2", body_sections=None)
        v2_pages = [p for p in [page] if getattr(p, "body_sections_format", "v1") == "v2"]
        assert len(v2_pages) == 1
        # body_sections is None → not a list → skipped in the loop
        assert not isinstance(page.body_sections, list)

    def test_section_empty_heading_skipped(self):
        """Sections with empty heading should be skipped."""
        page = self._make_page(body_sections=[
            {"heading": "", "content": "Orphan content"},
            {"heading": "Valid", "content": "Real content"},
        ])
        sections_with_heading = [
            s for s in page.body_sections
            if isinstance(s, dict) and s.get("heading", "").strip()
        ]
        assert len(sections_with_heading) == 1
        assert sections_with_heading[0]["heading"] == "Valid"

    def test_subsection_content_included_in_embed_text(self):
        """Section with subsections should include subsection content."""
        section = {
            "heading": "Grain Position Control",
            "content": "Main content",
            "subsections": [
                {"heading": "Fine Tuning", "content": "Fine tune the position."},
                {"heading": "Automation", "content": "Automate grain pos."},
            ],
        }

        embed_text = self._build_embed_text("Keota", "Granular Synthesis", section)

        assert "Fine Tuning: Fine tune the position." in embed_text
        assert "Automation: Automate grain pos." in embed_text
        assert "Keota Granular Synthesis" in embed_text

    def test_subsection_no_direct_content(self):
        """Section with subsections but no direct content still embeds subsection text."""
        section = {
            "heading": "Advanced Techniques",
            "content": "",
            "subsections": [
                {"heading": "Sub A", "content": "Content A"},
            ],
        }

        embed_text = self._build_embed_text("Creator", "Page", section)

        assert "Sub A: Content A" in embed_text

    def test_delete_called_before_upsert_ordering(self):
        """Verify delete_sections_by_page_id is called before upsert_technique_sections."""
        call_order = []
        mock_qdrant = MagicMock()
        mock_qdrant.delete_sections_by_page_id.side_effect = (
            lambda pid: call_order.append(("delete", pid))
        )
        mock_qdrant.upsert_technique_sections.side_effect = (
            lambda s, v: call_order.append(("upsert", len(s)))
        )

        mock_embed = MagicMock()
        mock_embed.embed.return_value = [[0.1] * 768]  # One vector

        page = self._make_page(body_sections=[
            {"heading": "Section A", "content": "Content A"},
        ])

        creator_map = {str(page.creator_id): "TestCreator"}
        v2_pages = [page]

        # Simulate the section embedding block
        for p in v2_pages:
            body_sections = p.body_sections
            if not isinstance(body_sections, list):
                continue
            creator_name = creator_map.get(str(p.creator_id), "")
            mock_qdrant.delete_sections_by_page_id(str(p.id))

            section_texts = []
            section_dicts = []
            for section in body_sections:
                if not isinstance(section, dict):
                    continue
                heading = section.get("heading", "")
                if not heading or not heading.strip():
                    continue
                section_anchor = _slugify_heading(heading)
                section_texts.append(f"{creator_name} {p.title} — {heading}")
                section_dicts.append({"page_id": str(p.id), "section_anchor": section_anchor})

            if section_texts:
                vectors = mock_embed.embed(section_texts)
                if vectors:
                    mock_qdrant.upsert_technique_sections(section_dicts, vectors)

        # Compare the whole recorded sequence: a missing upsert then fails with a
        # readable diff instead of an IndexError, and the page id / section count
        # passed to each call are checked too.
        assert call_order == [("delete", str(page.id)), ("upsert", 1)]
|