"""Integration tests for the /api/v1/search endpoint. Tests run against a real PostgreSQL test database via httpx.AsyncClient. SearchService is mocked at the router dependency level so we can test endpoint behavior without requiring external embedding API or Qdrant. """ from __future__ import annotations import uuid from unittest.mock import AsyncMock, MagicMock, patch import pytest import pytest_asyncio from httpx import AsyncClient from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker from models import ( ContentType, Creator, KeyMoment, KeyMomentContentType, ProcessingStatus, SourceVideo, TechniquePage, ) from search_service import SearchService SEARCH_URL = "/api/v1/search" # ── Seed helpers ───────────────────────────────────────────────────────────── async def _seed_search_data(db_engine) -> dict: """Seed 2 creators, 3 technique pages, and 5 key moments for search tests. Returns a dict with creator/technique IDs and metadata for assertions. """ session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: # Creators creator1 = Creator( name="Mr. Bill", slug="mr-bill", genres=["Bass music", "Glitch"], folder_name="MrBill", ) creator2 = Creator( name="KOAN Sound", slug="koan-sound", genres=["Drum & bass", "Neuro"], folder_name="KOANSound", ) session.add_all([creator1, creator2]) await session.flush() # Videos (needed for key moments FK) video1 = SourceVideo( creator_id=creator1.id, filename="bass-design-101.mp4", file_path="MrBill/bass-design-101.mp4", duration_seconds=600, content_type=ContentType.tutorial, processing_status=ProcessingStatus.complete, ) video2 = SourceVideo( creator_id=creator2.id, filename="reese-bass-deep-dive.mp4", file_path="KOANSound/reese-bass-deep-dive.mp4", duration_seconds=900, content_type=ContentType.tutorial, processing_status=ProcessingStatus.complete, ) session.add_all([video1, video2]) await session.flush() # Technique pages tp1 = TechniquePage( creator_id=creator1.id, title="Reese Bass Design", slug="reese-bass-design", topic_category="Sound design", topic_tags=["bass", "textures"], summary="How to create a classic reese bass", ) tp2 = TechniquePage( creator_id=creator2.id, title="Granular Pad Textures", slug="granular-pad-textures", topic_category="Synthesis", topic_tags=["granular", "pads"], summary="Creating pad textures with granular synthesis", ) tp3 = TechniquePage( creator_id=creator1.id, title="FM Bass Layering", slug="fm-bass-layering", topic_category="Synthesis", topic_tags=["fm", "bass"], summary="FM synthesis techniques for bass layering", ) session.add_all([tp1, tp2, tp3]) await session.flush() # Key moments km1 = KeyMoment( source_video_id=video1.id, technique_page_id=tp1.id, title="Setting up the Reese oscillator", summary="Initial oscillator setup for reese bass", start_time=10.0, end_time=60.0, content_type=KeyMomentContentType.technique, ) km2 = KeyMoment( source_video_id=video1.id, technique_page_id=tp1.id, title="Adding distortion to the Reese", summary="Distortion processing chain for reese bass", start_time=60.0, end_time=120.0, content_type=KeyMomentContentType.technique, ) km3 = KeyMoment( source_video_id=video2.id, technique_page_id=tp2.id, title="Granular engine settings", summary="Dialing in granular engine parameters", start_time=20.0, end_time=80.0, content_type=KeyMomentContentType.settings, ) km4 = KeyMoment( source_video_id=video1.id, technique_page_id=tp3.id, title="FM ratio selection", summary="Choosing FM ratios for bass tones", start_time=5.0, end_time=45.0, content_type=KeyMomentContentType.technique, ) km5 = KeyMoment( source_video_id=video2.id, title="Outro and credits", summary="End of the video", start_time=800.0, end_time=900.0, content_type=KeyMomentContentType.workflow, ) session.add_all([km1, km2, km3, km4, km5]) await session.commit() return { "creator1_id": str(creator1.id), "creator1_name": creator1.name, "creator1_slug": creator1.slug, "creator2_id": str(creator2.id), "creator2_name": creator2.name, "tp1_slug": tp1.slug, "tp1_title": tp1.title, "tp2_slug": tp2.slug, "tp3_slug": tp3.slug, } # ── Tests ──────────────────────────────────────────────────────────────────── @pytest.mark.asyncio async def test_search_happy_path_with_mocked_service(client, db_engine): """Search endpoint returns mocked results with correct response shape.""" seed = await _seed_search_data(db_engine) # Mock the SearchService.search method to return canned results mock_result = { "items": [ { "type": "technique_page", "title": "Reese Bass Design", "slug": "reese-bass-design", "summary": "How to create a classic reese bass", "topic_category": "Sound design", "topic_tags": ["bass", "textures"], "creator_name": "Mr. Bill", "creator_slug": "mr-bill", "score": 0.95, } ], "total": 1, "query": "reese bass", "fallback_used": False, } with patch("routers.search.SearchService") as MockSvc: instance = MockSvc.return_value instance.search = AsyncMock(return_value=mock_result) resp = await client.get(SEARCH_URL, params={"q": "reese bass"}) assert resp.status_code == 200 data = resp.json() assert data["query"] == "reese bass" assert data["total"] == 1 assert data["fallback_used"] is False assert len(data["items"]) == 1 item = data["items"][0] assert item["title"] == "Reese Bass Design" assert item["slug"] == "reese-bass-design" assert "score" in item @pytest.mark.asyncio async def test_search_empty_query_returns_empty(client, db_engine): """Empty search query returns empty results without hitting SearchService.""" await _seed_search_data(db_engine) # With empty query, the search service returns empty results directly mock_result = { "items": [], "total": 0, "query": "", "fallback_used": False, } with patch("routers.search.SearchService") as MockSvc: instance = MockSvc.return_value instance.search = AsyncMock(return_value=mock_result) resp = await client.get(SEARCH_URL, params={"q": ""}) assert resp.status_code == 200 data = resp.json() assert data["items"] == [] assert data["total"] == 0 assert data["query"] == "" assert data["fallback_used"] is False @pytest.mark.asyncio async def test_search_keyword_fallback(client, db_engine): """When embedding fails, search uses keyword fallback and sets fallback_used=true.""" seed = await _seed_search_data(db_engine) mock_result = { "items": [ { "type": "technique_page", "title": "Reese Bass Design", "slug": "reese-bass-design", "summary": "How to create a classic reese bass", "topic_category": "Sound design", "topic_tags": ["bass", "textures"], "creator_name": "", "creator_slug": "", "score": 0.0, } ], "total": 1, "query": "reese", "fallback_used": True, } with patch("routers.search.SearchService") as MockSvc: instance = MockSvc.return_value instance.search = AsyncMock(return_value=mock_result) resp = await client.get(SEARCH_URL, params={"q": "reese"}) assert resp.status_code == 200 data = resp.json() assert data["fallback_used"] is True assert data["total"] >= 1 assert data["items"][0]["title"] == "Reese Bass Design" @pytest.mark.asyncio async def test_search_scope_filter(client, db_engine): """Search with scope=topics returns only technique_page type results.""" await _seed_search_data(db_engine) mock_result = { "items": [ { "type": "technique_page", "title": "FM Bass Layering", "slug": "fm-bass-layering", "summary": "FM synthesis techniques for bass layering", "topic_category": "Synthesis", "topic_tags": ["fm", "bass"], "creator_name": "Mr. Bill", "creator_slug": "mr-bill", "score": 0.88, } ], "total": 1, "query": "bass", "fallback_used": False, } with patch("routers.search.SearchService") as MockSvc: instance = MockSvc.return_value instance.search = AsyncMock(return_value=mock_result) resp = await client.get(SEARCH_URL, params={"q": "bass", "scope": "topics"}) assert resp.status_code == 200 data = resp.json() # All items should be technique_page type when scope=topics for item in data["items"]: assert item["type"] == "technique_page" # Verify the service was called with scope=topics call_kwargs = instance.search.call_args assert call_kwargs.kwargs.get("scope") == "topics" or call_kwargs[1].get("scope") == "topics" @pytest.mark.asyncio async def test_search_no_matching_results(client, db_engine): """Search with no matching results returns empty items list.""" await _seed_search_data(db_engine) mock_result = { "items": [], "total": 0, "query": "zzzznonexistent", "fallback_used": True, } with patch("routers.search.SearchService") as MockSvc: instance = MockSvc.return_value instance.search = AsyncMock(return_value=mock_result) resp = await client.get(SEARCH_URL, params={"q": "zzzznonexistent"}) assert resp.status_code == 200 data = resp.json() assert data["items"] == [] assert data["total"] == 0 # ── SearchService.keyword_search integration tests ────────────────────────── @pytest.mark.asyncio async def test_keyword_search_technique_page_has_technique_page_slug(db_engine): """Keyword search for technique pages includes technique_page_slug matching its own slug.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) kw_result = await svc.keyword_search("Reese Bass", "topics", 10, session) results = kw_result["items"] assert len(results) >= 1 tp_result = next(r for r in results if r["type"] == "technique_page") assert tp_result["technique_page_slug"] == "reese-bass-design" assert tp_result["slug"] == "reese-bass-design" # technique_page_slug == slug for technique pages (they ARE the parent) assert tp_result["technique_page_slug"] == tp_result["slug"] @pytest.mark.asyncio async def test_keyword_search_key_moment_has_parent_technique_page_slug(db_engine): """Keyword search for key moments returns the parent technique page slug.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) kw_result = await svc.keyword_search("Reese", "all", 20, session) results = kw_result["items"] km_results = [r for r in results if r["type"] == "key_moment"] assert len(km_results) >= 1 for km in km_results: assert "technique_page_slug" in km # Both Reese-related key moments belong to tp1 (reese-bass-design) assert km["technique_page_slug"] == "reese-bass-design" @pytest.mark.asyncio async def test_keyword_search_key_moment_without_technique_page(db_engine): """Key moments without a technique_page_id get empty technique_page_slug.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) kw_result = await svc.keyword_search("Outro", "all", 20, session) results = kw_result["items"] km_results = [r for r in results if r["type"] == "key_moment"] assert len(km_results) == 1 assert km_results[0]["technique_page_slug"] == "" # ── Multi-token AND keyword search tests ───────────────────────────────────── @pytest.mark.asyncio async def test_keyword_search_multi_token_and_logic(db_engine): """Multi-token query requires all tokens to match across fields.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) # "Reese Bass" — both tokens appear in tp1 title "Reese Bass Design" kw_result = await svc.keyword_search("Reese Bass", "topics", 10, session) items = kw_result["items"] assert len(items) >= 1 assert all("reese" in r["title"].lower() or "bass" in r["title"].lower() for r in items if r["type"] == "technique_page") # "Granular bass" — 'granular' is in tp2, 'bass' is NOT in tp2 title/summary # but tp2 summary says "granular synthesis" not "bass" — no AND match expected kw_result2 = await svc.keyword_search("Granular bass", "topics", 10, session) items2 = kw_result2["items"] # Should NOT contain tp2 since "bass" doesn't appear in tp2's fields tp2_results = [r for r in items2 if r["slug"] == "granular-pad-textures"] assert len(tp2_results) == 0 @pytest.mark.asyncio async def test_keyword_search_cross_field_token_matching(db_engine): """Tokens can match across different fields (e.g., one in title, one in creator name).""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) # "Bill Reese" — "Bill" matches Creator.name "Mr. Bill", "Reese" matches title kw_result = await svc.keyword_search("Bill Reese", "topics", 10, session) items = kw_result["items"] assert len(items) >= 1 # tp1 "Reese Bass Design" by "Mr. Bill" should match slugs = [r["slug"] for r in items] assert "reese-bass-design" in slugs @pytest.mark.asyncio async def test_keyword_search_partial_matches_on_zero_and(db_engine): """When AND yields no results, partial_matches returns rows scored by token coverage.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) # "xyznonexistent Reese" — no row matches both, but "Reese" matches several kw_result = await svc.keyword_search("xyznonexistent Reese", "all", 20, session) assert kw_result["items"] == [] assert len(kw_result["partial_matches"]) >= 1 # Partial matches should have scores between 0 and 1 for pm in kw_result["partial_matches"]: assert 0 < pm["score"] <= 1.0 @pytest.mark.asyncio async def test_keyword_search_single_token_no_partial(db_engine): """Single-token search that fails returns no partial_matches (only multi-token triggers partial).""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) kw_result = await svc.keyword_search("xyznonexistent", "all", 20, session) assert kw_result["items"] == [] assert kw_result["partial_matches"] == [] @pytest.mark.asyncio async def test_keyword_search_topic_tags_matching(db_engine): """Tokens that appear in topic_tags array are matched via array_to_string.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) # "textures" is a topic_tag on tp1, "Bill" is the creator kw_result = await svc.keyword_search("textures Bill", "topics", 10, session) items = kw_result["items"] assert len(items) >= 1 slugs = [r["slug"] for r in items] assert "reese-bass-design" in slugs @pytest.mark.asyncio async def test_keyword_search_creator_genres_matching(db_engine): """Creator search matches against genres array via array_to_string.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) # "Glitch" is a genre on creator1 "Mr. Bill" kw_result = await svc.keyword_search("Bill Glitch", "creators", 10, session) items = kw_result["items"] assert len(items) >= 1 assert any(r["title"] == "Mr. Bill" for r in items) # ── Suggestions endpoint tests ─────────────────────────────────────────────── SUGGESTIONS_URL = "/api/v1/search/suggestions" @pytest.mark.asyncio async def test_suggestions_returns_correct_shape(client, db_engine): """Suggestions endpoint returns items with text and type fields.""" await _seed_search_data(db_engine) resp = await client.get(SUGGESTIONS_URL) assert resp.status_code == 200 data = resp.json() assert "suggestions" in data assert isinstance(data["suggestions"], list) assert len(data["suggestions"]) > 0 for item in data["suggestions"]: assert "text" in item assert "type" in item assert item["type"] in ("topic", "technique", "creator") assert len(item["text"]) > 0 @pytest.mark.asyncio async def test_suggestions_includes_all_types(client, db_engine): """Suggestions should include technique, topic, and creator types.""" await _seed_search_data(db_engine) resp = await client.get(SUGGESTIONS_URL) assert resp.status_code == 200 data = resp.json() types_present = {item["type"] for item in data["suggestions"]} assert "technique" in types_present, "Expected technique suggestions" assert "topic" in types_present, "Expected topic suggestions" assert "creator" in types_present, "Expected creator suggestions" @pytest.mark.asyncio async def test_suggestions_no_duplicates(client, db_engine): """Suggestions should not contain duplicate texts (case-insensitive).""" await _seed_search_data(db_engine) resp = await client.get(SUGGESTIONS_URL) assert resp.status_code == 200 data = resp.json() texts_lower = [item["text"].lower() for item in data["suggestions"]] assert len(texts_lower) == len(set(texts_lower)), "Duplicate suggestions found" @pytest.mark.asyncio async def test_suggestions_empty_db(client, db_engine): """Suggestions endpoint returns empty list on empty database.""" resp = await client.get(SUGGESTIONS_URL) assert resp.status_code == 200 data = resp.json() assert data["suggestions"] == [] @pytest.mark.asyncio async def test_suggestions_respects_view_count_ordering(client, db_engine): """Higher view_count technique pages should appear first among techniques.""" session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: creator = Creator( name="Test Creator", slug="test-creator", genres=["Electronic"], folder_name="TestCreator", view_count=10, ) session.add(creator) await session.flush() tp_low = TechniquePage( creator_id=creator.id, title="Low Views Page", slug="low-views-page", topic_category="Sound design", topic_tags=["bass"], view_count=5, ) tp_high = TechniquePage( creator_id=creator.id, title="High Views Page", slug="high-views-page", topic_category="Synthesis", topic_tags=["pads"], view_count=100, ) session.add_all([tp_low, tp_high]) await session.commit() resp = await client.get(SUGGESTIONS_URL) assert resp.status_code == 200 data = resp.json() technique_items = [ item for item in data["suggestions"] if item["type"] == "technique" ] assert len(technique_items) >= 2 # High Views Page should come before Low Views Page titles = [item["text"] for item in technique_items] assert titles.index("High Views Page") < titles.index("Low Views Page") # ── Match context tests ────────────────────────────────────────────────────── @pytest.mark.asyncio async def test_keyword_search_match_context_creator(db_engine): """Match context includes creator name when query matches creator.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) kw_result = await svc.keyword_search("Bill", "topics", 10, session) items = kw_result["items"] assert len(items) >= 1 # At least one result should have match_context mentioning the creator contexts = [r["match_context"] for r in items] assert any("Creator: Mr. Bill" in c for c in contexts), f"Expected creator context, got: {contexts}" @pytest.mark.asyncio async def test_keyword_search_match_context_tag(db_engine): """Match context includes tag name when query matches a topic tag.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) kw_result = await svc.keyword_search("granular", "topics", 10, session) items = kw_result["items"] assert len(items) >= 1 contexts = [r["match_context"] for r in items] assert any("Tag: granular" in c for c in contexts), f"Expected tag context, got: {contexts}" @pytest.mark.asyncio async def test_keyword_search_match_context_multi_token(db_engine): """Multi-token match context shows multiple match reasons.""" seed = await _seed_search_data(db_engine) session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: from config import Settings svc = SearchService(settings=Settings()) # "Bill bass" — "Bill" matches creator, "bass" matches tag/title kw_result = await svc.keyword_search("Bill bass", "topics", 10, session) items = kw_result["items"] assert len(items) >= 1 # The match_context should contain both creator and another field contexts = [r["match_context"] for r in items] assert any("Creator: Mr. Bill" in c for c in contexts)