"""Integration tests for the transcript ingest endpoint. Tests run against a real PostgreSQL database via httpx.AsyncClient on the FastAPI ASGI app. Each test gets a clean database state via TRUNCATE in the client fixture (conftest.py). """ import json import pathlib import pytest from httpx import AsyncClient from sqlalchemy import func, select, text from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker from models import Creator, SourceVideo, TranscriptSegment # ── Helpers ────────────────────────────────────────────────────────────────── INGEST_URL = "/api/v1/ingest" def _upload_file(path: pathlib.Path): """Return a dict suitable for httpx multipart file upload.""" return {"file": (path.name, path.read_bytes(), "application/json")} async def _query_db(db_engine, stmt): """Run a read query in its own session to avoid connection contention.""" session_factory = async_sessionmaker( db_engine, class_=AsyncSession, expire_on_commit=False ) async with session_factory() as session: result = await session.execute(stmt) return result async def _count_rows(db_engine, model): """Count rows in a table via a fresh session.""" result = await _query_db(db_engine, select(func.count(model.id))) return result.scalar_one() # ── Happy-path tests ──────────────────────────────────────────────────────── async def test_ingest_creates_creator_and_video(client, sample_transcript_path, db_engine): """POST a valid transcript → 200 with creator, video, and 5 segments created.""" resp = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) assert resp.status_code == 200, f"Expected 200, got {resp.status_code}: {resp.text}" data = resp.json() assert "video_id" in data assert "creator_id" in data assert data["segments_stored"] == 5 assert data["creator_name"] == "Skope" assert data["is_reupload"] is False # Verify DB state via a fresh session session_factory = async_sessionmaker(db_engine, class_=AsyncSession, expire_on_commit=False) async with session_factory() as session: # Creator exists with correct folder_name and slug result = await session.execute( select(Creator).where(Creator.folder_name == "Skope") ) creator = result.scalar_one() assert creator.slug == "skope" assert creator.name == "Skope" # SourceVideo exists with correct status result = await session.execute( select(SourceVideo).where(SourceVideo.creator_id == creator.id) ) video = result.scalar_one() assert video.processing_status.value == "transcribed" assert video.filename == "mixing-basics-ep1.mp4" # 5 TranscriptSegment rows with sequential indices result = await session.execute( select(TranscriptSegment) .where(TranscriptSegment.source_video_id == video.id) .order_by(TranscriptSegment.segment_index) ) segments = result.scalars().all() assert len(segments) == 5 assert [s.segment_index for s in segments] == [0, 1, 2, 3, 4] async def test_ingest_reuses_existing_creator(client, sample_transcript_path, db_engine): """If a Creator with the same folder_name already exists, reuse it.""" session_factory = async_sessionmaker(db_engine, class_=AsyncSession, expire_on_commit=False) # Pre-create a Creator with folder_name='Skope' in a separate session async with session_factory() as session: existing = Creator(name="Skope", slug="skope", folder_name="Skope") session.add(existing) await session.commit() await session.refresh(existing) existing_id = existing.id # POST transcript — should reuse the creator resp = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) assert resp.status_code == 200 data = resp.json() assert data["creator_id"] == str(existing_id) # Verify only 1 Creator row in DB count = await _count_rows(db_engine, Creator) assert count == 1, f"Expected 1 creator, got {count}" async def test_ingest_idempotent_reupload(client, sample_transcript_path, db_engine): """Uploading the same transcript twice is idempotent: same video, no duplicate segments.""" # First upload resp1 = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) assert resp1.status_code == 200 data1 = resp1.json() assert data1["is_reupload"] is False video_id = data1["video_id"] # Second upload (same file) resp2 = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) assert resp2.status_code == 200 data2 = resp2.json() assert data2["is_reupload"] is True assert data2["video_id"] == video_id # Verify DB: still only 1 SourceVideo and 5 segments (not 10) video_count = await _count_rows(db_engine, SourceVideo) assert video_count == 1, f"Expected 1 video, got {video_count}" seg_count = await _count_rows(db_engine, TranscriptSegment) assert seg_count == 5, f"Expected 5 segments, got {seg_count}" async def test_ingest_saves_json_to_disk(client, sample_transcript_path, tmp_path): """Ingested transcript raw JSON is persisted to the filesystem.""" resp = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) assert resp.status_code == 200 # The ingest endpoint saves to {transcript_storage_path}/{creator_folder}/{source_file}.json expected_path = tmp_path / "Skope" / "mixing-basics-ep1.mp4.json" assert expected_path.exists(), f"Expected file at {expected_path}" # Verify the saved JSON is valid and matches the source saved = json.loads(expected_path.read_text()) source = json.loads(sample_transcript_path.read_text()) assert saved == source # ── Error tests ────────────────────────────────────────────────────────────── async def test_ingest_rejects_invalid_json(client, tmp_path): """Uploading a non-JSON file returns 422.""" bad_file = tmp_path / "bad.json" bad_file.write_text("this is not valid json {{{") resp = await client.post( INGEST_URL, files={"file": ("bad.json", bad_file.read_bytes(), "application/json")}, ) assert resp.status_code == 422, f"Expected 422, got {resp.status_code}: {resp.text}" assert "JSON parse error" in resp.json()["detail"] async def test_ingest_rejects_missing_fields(client, tmp_path): """Uploading JSON without required fields returns 422.""" incomplete = tmp_path / "incomplete.json" # Missing creator_folder and segments incomplete.write_text(json.dumps({"source_file": "test.mp4", "duration_seconds": 100})) resp = await client.post( INGEST_URL, files={"file": ("incomplete.json", incomplete.read_bytes(), "application/json")}, ) assert resp.status_code == 422, f"Expected 422, got {resp.status_code}: {resp.text}" assert "Missing required keys" in resp.json()["detail"]