"""Integration tests for the LLM extraction pipeline.
Tests run against a real PostgreSQL test database with mocked LLM and Qdrant
clients. Pipeline stages are sync (Celery tasks), so tests call stage
functions directly with sync SQLAlchemy sessions.
Tests (a)–(f) call pipeline stages directly. Tests (g)–(i) use the async
HTTP client. Test (j) verifies LLM fallback logic.
"""
from __future__ import annotations
import json
import os
import pathlib
import uuid
from unittest.mock import MagicMock, patch, PropertyMock
import openai
import pytest
from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.pool import NullPool
from models import (
Creator,
KeyMoment,
KeyMomentContentType,
ProcessingStatus,
SourceVideo,
TechniquePage,
TranscriptSegment,
)
from pipeline.schemas import (
ClassificationResult,
ExtractionResult,
SegmentationResult,
SynthesisResult,
)
from tests.fixtures.mock_llm_responses import (
STAGE2_SEGMENTATION_RESPONSE,
STAGE3_EXTRACTION_RESPONSE,
STAGE4_CLASSIFICATION_RESPONSE,
STAGE5_SYNTHESIS_RESPONSE,
make_mock_embeddings,
)
# ── Test database URL ────────────────────────────────────────────────────────
# Sync-driver variant of the test database URL. Pipeline stages are Celery
# tasks using synchronous SQLAlchemy sessions, so the asyncpg driver in
# TEST_DATABASE_URL is swapped for psycopg2 here (tests swap it back with
# .replace("psycopg2", "asyncpg") where an async URL is needed).
TEST_DATABASE_URL_SYNC = os.getenv(
    "TEST_DATABASE_URL",
    "postgresql+asyncpg://chrysopedia:changeme@localhost:5433/chrysopedia_test",
).replace("postgresql+asyncpg://", "postgresql+psycopg2://")
# ── Helpers ──────────────────────────────────────────────────────────────────
def _make_mock_openai_response(content: str):
"""Build a mock OpenAI ChatCompletion response object."""
mock_message = MagicMock()
mock_message.content = content
mock_choice = MagicMock()
mock_choice.message = mock_message
mock_response = MagicMock()
mock_response.choices = [mock_choice]
return mock_response
def _make_mock_embedding_response(vectors: list[list[float]]):
"""Build a mock OpenAI Embedding response object."""
mock_items = []
for i, vec in enumerate(vectors):
item = MagicMock()
item.embedding = vec
item.index = i
mock_items.append(item)
mock_response = MagicMock()
mock_response.data = mock_items
return mock_response
def _patch_pipeline_engine(sync_engine):
    """Return unstarted patchers pointing pipeline.stages at the test engine.

    Two patches are produced: one for the module-level engine, one for the
    session factory bound to it. Callers must start and stop both.
    """
    session_factory = sessionmaker(bind=sync_engine)
    engine_patch = patch("pipeline.stages._engine", sync_engine)
    session_patch = patch("pipeline.stages._SessionLocal", session_factory)
    return [engine_patch, session_patch]
def _patch_llm_completions(side_effect_fn):
"""Patch openai.OpenAI so all instances share a mocked chat.completions.create."""
mock_client = MagicMock()
mock_client.chat.completions.create.side_effect = side_effect_fn
return patch("openai.OpenAI", return_value=mock_client)
def _create_canonical_tags_file(tmp_path: pathlib.Path) -> pathlib.Path:
"""Write a minimal canonical_tags.yaml for stage4 to load."""
config_dir = tmp_path / "config"
config_dir.mkdir(exist_ok=True)
tags_path = config_dir / "canonical_tags.yaml"
tags_path.write_text(
"categories:\n"
" - name: Mixing\n"
" description: Balancing and processing elements\n"
" sub_topics: [eq, compression, gain staging, bus processing]\n"
" - name: Sound design\n"
" description: Creating sounds\n"
" sub_topics: [bass, drums]\n"
)
return tags_path
# ── (a) Stage 2: Segmentation ───────────────────────────────────────────────
def test_stage2_segmentation_updates_topic_labels(
    db_engine, sync_engine, pre_ingested_video, tmp_path
):
    """Stage 2 should update topic_label on each TranscriptSegment."""
    video_id = pre_ingested_video["video_id"]
    # Stage 2 loads its system prompt from disk, so provide a minimal one.
    prompts_dir = tmp_path / "prompts"
    prompts_dir.mkdir()
    (prompts_dir / "stage2_segmentation.txt").write_text("You are a segmentation assistant.")

    def llm_side_effect(**kwargs):
        # Every LLM call returns the canned segmentation response.
        return _make_mock_openai_response(STAGE2_SEGMENTATION_RESPONSE)

    patches = _patch_pipeline_engine(sync_engine)
    for p in patches:
        p.start()
    try:
        with _patch_llm_completions(llm_side_effect), \
                patch("pipeline.stages.get_settings") as mock_settings:
            s = MagicMock()
            s.prompts_path = str(prompts_dir)
            s.llm_api_url = "http://mock:11434/v1"
            s.llm_api_key = "sk-test"
            s.llm_model = "test-model"
            s.llm_fallback_url = "http://mock:11434/v1"
            s.llm_fallback_model = "test-model"
            s.database_url = TEST_DATABASE_URL_SYNC.replace("psycopg2", "asyncpg")
            mock_settings.return_value = s
            # Import and call stage directly (not via Celery)
            from pipeline.stages import stage2_segmentation
            result = stage2_segmentation(video_id)
            assert result == video_id
    finally:
        # Fix: stop the engine/session patches even when the stage or an
        # assertion raises, so patched state cannot leak into other tests.
        for p in patches:
            p.stop()
    # Verify: check topic_label on segments
    factory = sessionmaker(bind=sync_engine)
    session = factory()
    try:
        segments = (
            session.execute(
                select(TranscriptSegment)
                .where(TranscriptSegment.source_video_id == video_id)
                .order_by(TranscriptSegment.segment_index)
            )
            .scalars()
            .all()
        )
        # Segments 0,1 → "Introduction"; segments 2,3,4 → "Gain Staging Technique"
        expected = ["Introduction"] * 2 + ["Gain Staging Technique"] * 3
        assert [seg.topic_label for seg in segments[:5]] == expected
    finally:
        session.close()
# ── (b) Stage 3: Extraction ─────────────────────────────────────────────────
def test_stage3_extraction_creates_key_moments(
    db_engine, sync_engine, pre_ingested_video, tmp_path
):
    """Stages 2+3 should create KeyMoment rows and set processing_status=extracted."""
    video_id = pre_ingested_video["video_id"]
    prompts_dir = tmp_path / "prompts"
    prompts_dir.mkdir()
    (prompts_dir / "stage2_segmentation.txt").write_text("Segment assistant.")
    (prompts_dir / "stage3_extraction.txt").write_text("Extraction assistant.")
    # Scripted responses in call order: one segmentation, then one
    # extraction per topic group; the last response repeats if the
    # pipeline makes extra calls.
    call_count = {"n": 0}
    responses = [STAGE2_SEGMENTATION_RESPONSE, STAGE3_EXTRACTION_RESPONSE, STAGE3_EXTRACTION_RESPONSE]

    def llm_side_effect(**kwargs):
        idx = min(call_count["n"], len(responses) - 1)
        resp = responses[idx]
        call_count["n"] += 1
        return _make_mock_openai_response(resp)

    patches = _patch_pipeline_engine(sync_engine)
    for p in patches:
        p.start()
    try:
        with _patch_llm_completions(llm_side_effect), \
                patch("pipeline.stages.get_settings") as mock_settings:
            s = MagicMock()
            s.prompts_path = str(prompts_dir)
            s.llm_api_url = "http://mock:11434/v1"
            s.llm_api_key = "sk-test"
            s.llm_model = "test-model"
            s.llm_fallback_url = "http://mock:11434/v1"
            s.llm_fallback_model = "test-model"
            s.database_url = TEST_DATABASE_URL_SYNC.replace("psycopg2", "asyncpg")
            mock_settings.return_value = s
            from pipeline.stages import stage2_segmentation, stage3_extraction
            stage2_segmentation(video_id)
            stage3_extraction(video_id)
    finally:
        # Fix: stop the engine/session patches even if a stage raised, so
        # patched state cannot leak into later tests.
        for p in patches:
            p.stop()
    # Verify key moments created
    factory = sessionmaker(bind=sync_engine)
    session = factory()
    try:
        moments = (
            session.execute(
                select(KeyMoment)
                .where(KeyMoment.source_video_id == video_id)
                .order_by(KeyMoment.start_time)
            )
            .scalars()
            .all()
        )
        # Two topic groups → extraction called twice → up to 4 moments
        # (2 per group from the mock response)
        assert len(moments) >= 2
        assert moments[0].title == "Setting Levels for Gain Staging"
        assert moments[0].content_type == KeyMomentContentType.technique
        # Verify processing_status
        video = session.execute(
            select(SourceVideo).where(SourceVideo.id == video_id)
        ).scalar_one()
        assert video.processing_status == ProcessingStatus.extracted
    finally:
        session.close()
# ── (c) Stage 4: Classification ─────────────────────────────────────────────
def test_stage4_classification_assigns_tags(
    db_engine, sync_engine, pre_ingested_video, tmp_path
):
    """Stages 2+3+4 should store classification data in Redis."""
    video_id = pre_ingested_video["video_id"]
    prompts_dir = tmp_path / "prompts"
    prompts_dir.mkdir()
    (prompts_dir / "stage2_segmentation.txt").write_text("Segment assistant.")
    (prompts_dir / "stage3_extraction.txt").write_text("Extraction assistant.")
    (prompts_dir / "stage4_classification.txt").write_text("Classification assistant.")
    _create_canonical_tags_file(tmp_path)
    call_count = {"n": 0}
    responses = [
        STAGE2_SEGMENTATION_RESPONSE,
        STAGE3_EXTRACTION_RESPONSE,
        STAGE3_EXTRACTION_RESPONSE,
        STAGE4_CLASSIFICATION_RESPONSE,
    ]

    def llm_side_effect(**kwargs):
        idx = min(call_count["n"], len(responses) - 1)
        resp = responses[idx]
        call_count["n"] += 1
        return _make_mock_openai_response(resp)

    # Capture what stage4 would have written to Redis.
    stored_cls_data = {}

    def mock_store_classification(vid, data):
        stored_cls_data[vid] = data

    patches = _patch_pipeline_engine(sync_engine)
    for p in patches:
        p.start()
    try:
        with _patch_llm_completions(llm_side_effect), \
                patch("pipeline.stages.get_settings") as mock_settings, \
                patch("pipeline.stages._load_canonical_tags") as mock_tags, \
                patch("pipeline.stages._store_classification_data", side_effect=mock_store_classification):
            s = MagicMock()
            s.prompts_path = str(prompts_dir)
            s.llm_api_url = "http://mock:11434/v1"
            s.llm_api_key = "sk-test"
            s.llm_model = "test-model"
            s.llm_fallback_url = "http://mock:11434/v1"
            s.llm_fallback_model = "test-model"
            s.database_url = TEST_DATABASE_URL_SYNC.replace("psycopg2", "asyncpg")
            s.review_mode = True
            mock_settings.return_value = s
            mock_tags.return_value = {
                "categories": [
                    {"name": "Mixing", "description": "Balancing", "sub_topics": ["gain staging", "eq"]},
                ]
            }
            from pipeline.stages import stage2_segmentation, stage3_extraction, stage4_classification
            stage2_segmentation(video_id)
            stage3_extraction(video_id)
            stage4_classification(video_id)
    finally:
        # Fix: stop the engine/session patches even if a stage raised, so
        # patched state cannot leak into later tests.
        for p in patches:
            p.stop()
    # Verify classification data was stored
    assert video_id in stored_cls_data
    cls_data = stored_cls_data[video_id]
    assert len(cls_data) >= 1
    assert cls_data[0]["topic_category"] == "Mixing"
    assert "gain staging" in cls_data[0]["topic_tags"]
# ── (d) Stage 5: Synthesis ──────────────────────────────────────────────────
def test_stage5_synthesis_creates_technique_pages(
    db_engine, sync_engine, pre_ingested_video, tmp_path
):
    """Full pipeline stages 2-5 should create TechniquePage rows linked to KeyMoments."""
    video_id = pre_ingested_video["video_id"]
    prompts_dir = tmp_path / "prompts"
    prompts_dir.mkdir()
    (prompts_dir / "stage2_segmentation.txt").write_text("Segment assistant.")
    (prompts_dir / "stage3_extraction.txt").write_text("Extraction assistant.")
    (prompts_dir / "stage4_classification.txt").write_text("Classification assistant.")
    (prompts_dir / "stage5_synthesis.txt").write_text("Synthesis assistant.")
    call_count = {"n": 0}
    responses = [
        STAGE2_SEGMENTATION_RESPONSE,
        STAGE3_EXTRACTION_RESPONSE,
        STAGE3_EXTRACTION_RESPONSE,
        STAGE4_CLASSIFICATION_RESPONSE,
        STAGE5_SYNTHESIS_RESPONSE,
    ]

    def llm_side_effect(**kwargs):
        # Serve scripted responses in order, repeating the last one.
        idx = min(call_count["n"], len(responses) - 1)
        resp = responses[idx]
        call_count["n"] += 1
        return _make_mock_openai_response(resp)

    patches = _patch_pipeline_engine(sync_engine)
    for p in patches:
        p.start()
    try:
        with _patch_llm_completions(llm_side_effect), \
                patch("pipeline.stages.get_settings") as mock_settings, \
                patch("pipeline.stages._load_canonical_tags") as mock_tags, \
                patch("pipeline.stages._store_classification_data"), \
                patch("pipeline.stages._load_classification_data") as mock_load_cls:
            s = MagicMock()
            s.prompts_path = str(prompts_dir)
            s.llm_api_url = "http://mock:11434/v1"
            s.llm_api_key = "sk-test"
            s.llm_model = "test-model"
            s.llm_fallback_url = "http://mock:11434/v1"
            s.llm_fallback_model = "test-model"
            s.database_url = TEST_DATABASE_URL_SYNC.replace("psycopg2", "asyncpg")
            s.review_mode = True
            mock_settings.return_value = s
            mock_tags.return_value = {
                "categories": [
                    {"name": "Mixing", "description": "Balancing", "sub_topics": ["gain staging"]},
                ]
            }
            from pipeline.stages import (
                stage2_segmentation,
                stage3_extraction,
                stage4_classification,
                stage5_synthesis,
            )
            stage2_segmentation(video_id)
            stage3_extraction(video_id)
            stage4_classification(video_id)
            # Stage 5 reads classification data (normally from Redis); feed
            # it entries keyed by the real moment IDs stage 3 just created.
            factory = sessionmaker(bind=sync_engine)
            sess = factory()
            try:
                real_moments = (
                    sess.execute(
                        select(KeyMoment).where(KeyMoment.source_video_id == video_id)
                    )
                    .scalars()
                    .all()
                )
                real_cls = [
                    {"moment_id": str(m.id), "topic_category": "Mixing", "topic_tags": ["gain staging"]}
                    for m in real_moments
                ]
            finally:
                sess.close()
            mock_load_cls.return_value = real_cls
            stage5_synthesis(video_id)
    finally:
        # Fix: stop the engine/session patches even if a stage raised, so
        # patched state cannot leak into later tests.
        for p in patches:
            p.stop()
    # Verify TechniquePages created
    factory = sessionmaker(bind=sync_engine)
    session = factory()
    try:
        pages = session.execute(select(TechniquePage)).scalars().all()
        assert len(pages) >= 1
        page = pages[0]
        assert page.title == "Gain Staging in Mixing"
        assert page.body_sections is not None
        assert "Overview" in page.body_sections
        assert page.signal_chains is not None
        assert len(page.signal_chains) >= 1
        assert page.summary is not None
        # Verify KeyMoments are linked to the TechniquePage
        moments = (
            session.execute(
                select(KeyMoment).where(KeyMoment.technique_page_id == page.id)
            )
            .scalars()
            .all()
        )
        assert len(moments) >= 1
        # Verify processing_status updated
        video = session.execute(
            select(SourceVideo).where(SourceVideo.id == video_id)
        ).scalar_one()
        assert video.processing_status == ProcessingStatus.reviewed
    finally:
        session.close()
# ── (e) Stage 6: Embed & Index ──────────────────────────────────────────────
def test_stage6_embeds_and_upserts_to_qdrant(
    db_engine, sync_engine, pre_ingested_video, tmp_path
):
    """Full pipeline through stage 6 should call EmbeddingClient and QdrantManager."""
    video_id = pre_ingested_video["video_id"]
    prompts_dir = tmp_path / "prompts"
    prompts_dir.mkdir()
    (prompts_dir / "stage2_segmentation.txt").write_text("Segment assistant.")
    (prompts_dir / "stage3_extraction.txt").write_text("Extraction assistant.")
    (prompts_dir / "stage4_classification.txt").write_text("Classification assistant.")
    (prompts_dir / "stage5_synthesis.txt").write_text("Synthesis assistant.")
    call_count = {"n": 0}
    responses = [
        STAGE2_SEGMENTATION_RESPONSE,
        STAGE3_EXTRACTION_RESPONSE,
        STAGE3_EXTRACTION_RESPONSE,
        STAGE4_CLASSIFICATION_RESPONSE,
        STAGE5_SYNTHESIS_RESPONSE,
    ]

    def llm_side_effect(**kwargs):
        # Serve scripted responses in order, repeating the last one.
        idx = min(call_count["n"], len(responses) - 1)
        resp = responses[idx]
        call_count["n"] += 1
        return _make_mock_openai_response(resp)

    mock_embed_client = MagicMock()
    mock_embed_client.embed.side_effect = lambda texts: make_mock_embeddings(len(texts))
    mock_qdrant_mgr = MagicMock()
    patches = _patch_pipeline_engine(sync_engine)
    for p in patches:
        p.start()
    try:
        with _patch_llm_completions(llm_side_effect), \
                patch("pipeline.stages.get_settings") as mock_settings, \
                patch("pipeline.stages._load_canonical_tags") as mock_tags, \
                patch("pipeline.stages._store_classification_data"), \
                patch("pipeline.stages._load_classification_data") as mock_load_cls, \
                patch("pipeline.stages.EmbeddingClient", return_value=mock_embed_client), \
                patch("pipeline.stages.QdrantManager", return_value=mock_qdrant_mgr):
            s = MagicMock()
            s.prompts_path = str(prompts_dir)
            s.llm_api_url = "http://mock:11434/v1"
            s.llm_api_key = "sk-test"
            s.llm_model = "test-model"
            s.llm_fallback_url = "http://mock:11434/v1"
            s.llm_fallback_model = "test-model"
            s.database_url = TEST_DATABASE_URL_SYNC.replace("psycopg2", "asyncpg")
            s.review_mode = True
            s.embedding_api_url = "http://mock:11434/v1"
            s.embedding_model = "test-embed"
            s.embedding_dimensions = 768
            s.qdrant_url = "http://mock:6333"
            s.qdrant_collection = "test_collection"
            mock_settings.return_value = s
            mock_tags.return_value = {
                "categories": [
                    {"name": "Mixing", "description": "Balancing", "sub_topics": ["gain staging"]},
                ]
            }
            from pipeline.stages import (
                stage2_segmentation,
                stage3_extraction,
                stage4_classification,
                stage5_synthesis,
                stage6_embed_and_index,
            )
            stage2_segmentation(video_id)
            stage3_extraction(video_id)
            stage4_classification(video_id)
            # Feed stage 5 classification data keyed by the real moment IDs.
            factory = sessionmaker(bind=sync_engine)
            sess = factory()
            try:
                real_moments = (
                    sess.execute(
                        select(KeyMoment).where(KeyMoment.source_video_id == video_id)
                    )
                    .scalars()
                    .all()
                )
                real_cls = [
                    {"moment_id": str(m.id), "topic_category": "Mixing", "topic_tags": ["gain staging"]}
                    for m in real_moments
                ]
            finally:
                sess.close()
            mock_load_cls.return_value = real_cls
            stage5_synthesis(video_id)
            stage6_embed_and_index(video_id)
    finally:
        # Fix: stop the engine/session patches even if a stage raised, so
        # patched state cannot leak into later tests.
        for p in patches:
            p.stop()
    # Verify EmbeddingClient.embed was called
    assert mock_embed_client.embed.called
    # Verify QdrantManager methods called
    mock_qdrant_mgr.ensure_collection.assert_called_once()
    assert (
        mock_qdrant_mgr.upsert_technique_pages.called
        or mock_qdrant_mgr.upsert_key_moments.called
    ), "Expected at least one upsert call to QdrantManager"
# ── (f) Resumability ────────────────────────────────────────────────────────
def test_run_pipeline_resumes_from_extracted(
    db_engine, sync_engine, pre_ingested_video, tmp_path
):
    """When status=extracted, run_pipeline should skip stages 2+3 and run 4+5+6."""
    video_id = pre_ingested_video["video_id"]
    # Set video status to "extracted" directly
    factory = sessionmaker(bind=sync_engine)
    session = factory()
    try:
        video = session.execute(
            select(SourceVideo).where(SourceVideo.id == video_id)
        ).scalar_one()
        video.processing_status = ProcessingStatus.extracted
        session.commit()
    finally:
        session.close()
    patches = _patch_pipeline_engine(sync_engine)
    for p in patches:
        p.start()
    try:
        with patch("pipeline.stages.get_settings") as mock_settings, \
                patch("pipeline.stages.stage2_segmentation") as mock_s2, \
                patch("pipeline.stages.stage3_extraction") as mock_s3, \
                patch("pipeline.stages.stage4_classification") as mock_s4, \
                patch("pipeline.stages.stage5_synthesis") as mock_s5, \
                patch("pipeline.stages.stage6_embed_and_index") as mock_s6, \
                patch("pipeline.stages.celery_chain") as mock_chain:
            s = MagicMock()
            s.database_url = TEST_DATABASE_URL_SYNC.replace("psycopg2", "asyncpg")
            mock_settings.return_value = s
            # Mock chain to inspect what stages it gets
            mock_chain.return_value = MagicMock()
            # Each task's .s() returns a distinguishable signature token
            mock_s2.s = MagicMock(return_value="s2_sig")
            mock_s3.s = MagicMock(return_value="s3_sig")
            mock_s4.s = MagicMock(return_value="s4_sig")
            mock_s5.s = MagicMock(return_value="s5_sig")
            mock_s6.s = MagicMock(return_value="s6_sig")
            from pipeline.stages import run_pipeline
            run_pipeline(video_id)
            # Verify: stages 2 and 3 should NOT have .s() called with video_id
            mock_s2.s.assert_not_called()
            mock_s3.s.assert_not_called()
            # Stages 4, 5, 6 should have .s() called
            mock_s4.s.assert_called_once_with(video_id)
            mock_s5.s.assert_called_once()
            mock_s6.s.assert_called_once()
    finally:
        # Fix: stop the engine/session patches even when an assertion above
        # fails, so patched state cannot leak into later tests.
        for p in patches:
            p.stop()
# ── (g) Pipeline trigger endpoint ───────────────────────────────────────────
async def test_pipeline_trigger_endpoint(client, db_engine):
    """POST /api/v1/pipeline/trigger/{video_id} with valid video returns 200."""
    # Create a video by ingesting the sample transcript, with the pipeline
    # dispatch mocked out so no Celery task actually runs.
    sample = pathlib.Path(__file__).parent / "fixtures" / "sample_transcript.json"
    with patch("routers.ingest.run_pipeline", create=True) as mock_ingest_rp:
        mock_ingest_rp.delay = MagicMock()
        ingest_resp = await client.post(
            "/api/v1/ingest",
            files={"file": (sample.name, sample.read_bytes(), "application/json")},
        )
        assert ingest_resp.status_code == 200
        video_id = ingest_resp.json()["video_id"]
    # Now trigger the pipeline for the freshly ingested video.
    with patch("pipeline.stages.run_pipeline") as mock_trigger_rp:
        mock_trigger_rp.delay = MagicMock()
        trigger_resp = await client.post(f"/api/v1/pipeline/trigger/{video_id}")
        assert trigger_resp.status_code == 200
        payload = trigger_resp.json()
        assert payload["status"] == "triggered"
        assert payload["video_id"] == video_id
# ── (h) Pipeline trigger 404 ────────────────────────────────────────────────
async def test_pipeline_trigger_404_for_missing_video(client):
    """POST /api/v1/pipeline/trigger/{nonexistent} returns 404."""
    missing_id = str(uuid.uuid4())
    response = await client.post(f"/api/v1/pipeline/trigger/{missing_id}")
    assert response.status_code == 404
    # The error detail should mention the video was not found.
    assert "not found" in response.json()["detail"].lower()
# ── (i) Ingest dispatches pipeline ──────────────────────────────────────────
async def test_ingest_dispatches_pipeline(client, db_engine):
    """Ingesting a transcript should call run_pipeline.delay with the video_id."""
    sample = pathlib.Path(__file__).parent / "fixtures" / "sample_transcript.json"
    with patch("pipeline.stages.run_pipeline") as mock_rp:
        mock_rp.delay = MagicMock()
        response = await client.post(
            "/api/v1/ingest",
            files={"file": (sample.name, sample.read_bytes(), "application/json")},
        )
        assert response.status_code == 200
        # The ingest endpoint must dispatch exactly one pipeline run for the
        # video it just created.
        mock_rp.delay.assert_called_once_with(response.json()["video_id"])
# ── (j) LLM fallback on primary failure ─────────────────────────────────────
def test_llm_fallback_on_primary_failure():
    """LLMClient should fall back to secondary endpoint when primary raises APIConnectionError."""
    from pipeline.llm_client import LLMClient

    settings = MagicMock()
    settings.llm_api_url = "http://primary:11434/v1"
    settings.llm_api_key = "sk-test"
    settings.llm_fallback_url = "http://fallback:11434/v1"
    settings.llm_fallback_model = "fallback-model"
    settings.llm_model = "primary-model"
    with patch("openai.OpenAI") as MockOpenAI:
        primary = MagicMock()
        fallback = MagicMock()
        # LLMClient constructs the primary client first, then the fallback.
        MockOpenAI.side_effect = [primary, fallback]
        llm = LLMClient(settings)
        # Primary endpoint is unreachable ...
        primary.chat.completions.create.side_effect = openai.APIConnectionError(
            request=MagicMock()
        )
        # ... while the fallback returns a valid completion.
        fallback.chat.completions.create.return_value = _make_mock_openai_response(
            '{"result": "ok"}'
        )
        assert llm.complete("system", "user") == '{"result": "ok"}'
        # Each endpoint should have been tried exactly once.
        primary.chat.completions.create.assert_called_once()
        fallback.chat.completions.create.assert_called_once()
# ── Think-tag stripping ─────────────────────────────────────────────────────
def test_strip_think_tags():
    """strip_think_tags should handle all edge cases correctly.

    Fix: the literal <think>...</think> markers had been stripped out of the
    test inputs themselves, leaving assertions that could never pass (e.g.
    expecting 'reasoning here{"a": 1}' to shrink to '{"a": 1}' with no tags
    present). Restored per the case comments below.
    """
    from pipeline.llm_client import strip_think_tags
    # Single block with JSON after
    assert strip_think_tags('<think>reasoning here</think>{"a": 1}') == '{"a": 1}'
    # Multiline think block (surrounding whitespace is trimmed too)
    assert strip_think_tags(
        '<think>\nI need to analyze this.\nLet me think step by step.\n</think>\n\n{"result": "ok"}'
    ) == '{"result": "ok"}'
    # Multiple think blocks
    result = strip_think_tags('<think>first</think>hello<think>second</think> world')
    assert result == "hello world"
    # No think tags — passthrough
    assert strip_think_tags('{"clean": true}') == '{"clean": true}'
    # Empty string
    assert strip_think_tags("") == ""
    # Think block with special characters
    assert strip_think_tags(
        '<think>analyzing "complex" & stuff</think>{"done": true}'
    ) == '{"done": true}'
    # Only a think block, no actual content
    assert strip_think_tags("<think>just thinking</think>") == ""