# chrysopedia/backend/tests/test_ingest.py
# jlightner bef8d95e64 test: Added 6 integration tests proving ingestion, creator auto-detecti…
#   - "backend/tests/conftest.py"
#   - "backend/tests/test_ingest.py"
#   - "backend/tests/fixtures/sample_transcript.json"
#   - "backend/pytest.ini"
#   - "backend/requirements.txt"
#   - "backend/models.py"
#
# GSD-Task: S02/T02
# 2026-03-29 22:16:15 +00:00
#
# 179 lines
# 7.3 KiB
# Python

"""Integration tests for the transcript ingest endpoint.
Tests run against a real PostgreSQL database via httpx.AsyncClient
on the FastAPI ASGI app. Each test gets a clean database state via
TRUNCATE in the client fixture (conftest.py).
"""
import json
import pathlib
import pytest
from httpx import AsyncClient
from sqlalchemy import func, select, text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from models import Creator, SourceVideo, TranscriptSegment
# ── Helpers ──────────────────────────────────────────────────────────────────
# Route under test; every test in this module POSTs a multipart transcript here.
INGEST_URL = "/api/v1/ingest"
def _upload_file(path: pathlib.Path):
    """Build the httpx ``files=`` mapping for a multipart JSON upload of *path*."""
    payload = path.read_bytes()
    return {"file": (path.name, payload, "application/json")}
async def _query_db(db_engine, stmt):
    """Execute *stmt* in a dedicated short-lived session.

    Opening a fresh session per read keeps these verification queries from
    contending with the connections the app under test is using.
    """
    factory = async_sessionmaker(
        db_engine, class_=AsyncSession, expire_on_commit=False
    )
    async with factory() as session:
        return await session.execute(stmt)
async def _count_rows(db_engine, model):
    """Return the current row count of *model*'s table, using a fresh session."""
    count_stmt = select(func.count(model.id))
    return (await _query_db(db_engine, count_stmt)).scalar_one()
# ── Happy-path tests ────────────────────────────────────────────────────────
async def test_ingest_creates_creator_and_video(client, sample_transcript_path, db_engine):
    """POST a valid transcript → 200 with creator, video, and 5 segments created."""
    response = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}"
    body = response.json()
    assert "video_id" in body
    assert "creator_id" in body
    assert body["segments_stored"] == 5
    assert body["creator_name"] == "Skope"
    assert body["is_reupload"] is False

    # Cross-check persisted rows through a session independent of the request.
    make_session = async_sessionmaker(db_engine, class_=AsyncSession, expire_on_commit=False)
    async with make_session() as session:
        # Creator row carries the expected naming fields.
        creator = (
            await session.execute(
                select(Creator).where(Creator.folder_name == "Skope")
            )
        ).scalar_one()
        assert creator.slug == "skope"
        assert creator.name == "Skope"

        # Exactly one SourceVideo for that creator, already marked transcribed.
        video = (
            await session.execute(
                select(SourceVideo).where(SourceVideo.creator_id == creator.id)
            )
        ).scalar_one()
        assert video.processing_status.value == "transcribed"
        assert video.filename == "mixing-basics-ep1.mp4"

        # Segments come back contiguous and zero-indexed.
        seg_rows = (
            await session.execute(
                select(TranscriptSegment)
                .where(TranscriptSegment.source_video_id == video.id)
                .order_by(TranscriptSegment.segment_index)
            )
        ).scalars().all()
        assert len(seg_rows) == 5
        assert [seg.segment_index for seg in seg_rows] == [0, 1, 2, 3, 4]
async def test_ingest_reuses_existing_creator(client, sample_transcript_path, db_engine):
    """If a Creator with the same folder_name already exists, reuse it."""
    make_session = async_sessionmaker(db_engine, class_=AsyncSession, expire_on_commit=False)

    # Seed a Creator whose folder_name matches the fixture transcript.
    async with make_session() as session:
        seeded = Creator(name="Skope", slug="skope", folder_name="Skope")
        session.add(seeded)
        await session.commit()
        await session.refresh(seeded)
        seeded_id = seeded.id

    # Ingest should attach the video to the seeded creator, not mint a new one.
    response = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert response.status_code == 200
    payload = response.json()
    assert payload["creator_id"] == str(seeded_id)

    # The creators table must still hold exactly one row.
    creator_total = await _count_rows(db_engine, Creator)
    assert creator_total == 1, f"Expected 1 creator, got {creator_total}"
async def test_ingest_idempotent_reupload(client, sample_transcript_path, db_engine):
    """Uploading the same transcript twice is idempotent: same video, no duplicate segments."""
    # Initial upload establishes the video.
    first = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert first.status_code == 200
    first_body = first.json()
    assert first_body["is_reupload"] is False
    original_video_id = first_body["video_id"]

    # Replay the identical upload; the endpoint should flag and dedupe it.
    second = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert second.status_code == 200
    second_body = second.json()
    assert second_body["is_reupload"] is True
    assert second_body["video_id"] == original_video_id

    # Row counts stay flat: one video, five segments (not ten).
    video_total = await _count_rows(db_engine, SourceVideo)
    assert video_total == 1, f"Expected 1 video, got {video_total}"
    segment_total = await _count_rows(db_engine, TranscriptSegment)
    assert segment_total == 5, f"Expected 5 segments, got {segment_total}"
async def test_ingest_saves_json_to_disk(client, sample_transcript_path, tmp_path):
    """Ingested transcript raw JSON is persisted to the filesystem."""
    response = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert response.status_code == 200

    # Storage layout: {transcript_storage_path}/{creator_folder}/{source_file}.json
    target = tmp_path / "Skope" / "mixing-basics-ep1.mp4.json"
    assert target.exists(), f"Expected file at {target}"

    # Round-trip: what landed on disk parses and equals the uploaded document.
    stored = json.loads(target.read_text())
    uploaded = json.loads(sample_transcript_path.read_text())
    assert stored == uploaded
# ── Error tests ──────────────────────────────────────────────────────────────
async def test_ingest_rejects_invalid_json(client, tmp_path):
    """Uploading a non-JSON file returns 422 with a parse-error detail."""
    bad_file = tmp_path / "bad.json"
    bad_file.write_text("this is not valid json {{{")
    # Consistency fix: reuse the module's _upload_file helper (it builds the
    # identical (name, bytes, content-type) tuple) instead of hand-rolling the dict.
    resp = await client.post(INGEST_URL, files=_upload_file(bad_file))
    assert resp.status_code == 422, f"Expected 422, got {resp.status_code}: {resp.text}"
    assert "JSON parse error" in resp.json()["detail"]
async def test_ingest_rejects_missing_fields(client, tmp_path):
    """Uploading valid JSON lacking required keys returns 422."""
    incomplete = tmp_path / "incomplete.json"
    # Parseable JSON, but missing creator_folder and segments.
    incomplete.write_text(json.dumps({"source_file": "test.mp4", "duration_seconds": 100}))
    # Consistency fix: reuse the module's _upload_file helper (it builds the
    # identical (name, bytes, content-type) tuple) instead of hand-rolling the dict.
    resp = await client.post(INGEST_URL, files=_upload_file(incomplete))
    assert resp.status_code == 422, f"Expected 422, got {resp.status_code}: {resp.text}"
    assert "Missing required keys" in resp.json()["detail"]