# GSD-Task: S02/T02 — backend/tests/test_ingest.py
# Related files: backend/tests/conftest.py, backend/tests/fixtures/sample_transcript.json,
# backend/pytest.ini, backend/requirements.txt, backend/models.py
"""Integration tests for the transcript ingest endpoint.

Tests run against a real PostgreSQL database via httpx.AsyncClient
on the FastAPI ASGI app. Each test gets a clean database state via
TRUNCATE in the client fixture (conftest.py).
"""
|
|
|
|
import json
|
|
import pathlib
|
|
|
|
import pytest
|
|
from httpx import AsyncClient
|
|
from sqlalchemy import func, select, text
|
|
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
|
|
|
|
from models import Creator, SourceVideo, TranscriptSegment
|
|
|
|
|
|
# ── Helpers ──────────────────────────────────────────────────────────────────

# Route of the transcript ingest endpoint exercised by every test below.
INGEST_URL = "/api/v1/ingest"
|
|
|
|
|
|
def _upload_file(path: pathlib.Path):
|
|
"""Return a dict suitable for httpx multipart file upload."""
|
|
return {"file": (path.name, path.read_bytes(), "application/json")}
|
|
|
|
|
|
async def _query_db(db_engine, stmt):
    """Execute *stmt* in a dedicated short-lived session.

    A fresh session per query keeps read-side checks from contending
    with the connection(s) used by the application under test.
    """
    make_session = async_sessionmaker(
        db_engine, class_=AsyncSession, expire_on_commit=False
    )
    async with make_session() as session:
        return await session.execute(stmt)
|
|
|
|
|
|
async def _count_rows(db_engine, model):
    """Return the current row count of *model*'s table, via a fresh session."""
    stmt = select(func.count(model.id))
    return (await _query_db(db_engine, stmt)).scalar_one()
|
|
|
|
|
|
# ── Happy-path tests ────────────────────────────────────────────────────────
|
|
|
|
|
|
async def test_ingest_creates_creator_and_video(client, sample_transcript_path, db_engine):
    """POST a valid transcript → 200 with creator, video, and 5 segments created."""
    response = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert response.status_code == 200, (
        f"Expected 200, got {response.status_code}: {response.text}"
    )

    body = response.json()
    assert "video_id" in body
    assert "creator_id" in body
    assert body["segments_stored"] == 5
    assert body["creator_name"] == "Skope"
    assert body["is_reupload"] is False

    # Inspect DB state through a session independent of the app's own.
    make_session = async_sessionmaker(db_engine, class_=AsyncSession, expire_on_commit=False)
    async with make_session() as session:
        # Creator row exists with the expected folder_name, slug and name.
        creator = (
            await session.execute(select(Creator).where(Creator.folder_name == "Skope"))
        ).scalar_one()
        assert creator.slug == "skope"
        assert creator.name == "Skope"

        # Exactly one SourceVideo for that creator, already transcribed.
        video = (
            await session.execute(
                select(SourceVideo).where(SourceVideo.creator_id == creator.id)
            )
        ).scalar_one()
        assert video.processing_status.value == "transcribed"
        assert video.filename == "mixing-basics-ep1.mp4"

        # All 5 TranscriptSegment rows stored, indexed 0..4 without gaps.
        rows = await session.execute(
            select(TranscriptSegment)
            .where(TranscriptSegment.source_video_id == video.id)
            .order_by(TranscriptSegment.segment_index)
        )
        segments = rows.scalars().all()
        assert len(segments) == 5
        assert [seg.segment_index for seg in segments] == list(range(5))
|
|
|
|
|
|
async def test_ingest_reuses_existing_creator(client, sample_transcript_path, db_engine):
    """If a Creator with the same folder_name already exists, reuse it."""
    make_session = async_sessionmaker(db_engine, class_=AsyncSession, expire_on_commit=False)

    # Seed a Creator with folder_name='Skope' before any upload happens.
    async with make_session() as session:
        seeded = Creator(name="Skope", slug="skope", folder_name="Skope")
        session.add(seeded)
        await session.commit()
        await session.refresh(seeded)
        seeded_id = seeded.id

    # The upload should attach to the seeded creator rather than create a new one.
    response = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert response.status_code == 200
    assert response.json()["creator_id"] == str(seeded_id)

    # The table must still contain exactly one Creator row.
    total = await _count_rows(db_engine, Creator)
    assert total == 1, f"Expected 1 creator, got {total}"
|
|
|
|
|
|
async def test_ingest_idempotent_reupload(client, sample_transcript_path, db_engine):
    """Uploading the same transcript twice is idempotent: same video, no duplicate segments."""
    # First upload: a brand-new video.
    first = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert first.status_code == 200
    first_body = first.json()
    assert first_body["is_reupload"] is False
    original_video_id = first_body["video_id"]

    # Second upload of the identical file: reported as a re-upload of the same video.
    second = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert second.status_code == 200
    second_body = second.json()
    assert second_body["is_reupload"] is True
    assert second_body["video_id"] == original_video_id

    # DB still holds one video and the original 5 segments (not 10).
    video_total = await _count_rows(db_engine, SourceVideo)
    assert video_total == 1, f"Expected 1 video, got {video_total}"

    segment_total = await _count_rows(db_engine, TranscriptSegment)
    assert segment_total == 5, f"Expected 5 segments, got {segment_total}"
|
|
|
|
|
|
async def test_ingest_saves_json_to_disk(client, sample_transcript_path, tmp_path):
    """Ingested transcript raw JSON is persisted to the filesystem."""
    response = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path))
    assert response.status_code == 200

    # Storage layout: {transcript_storage_path}/{creator_folder}/{source_file}.json
    saved_path = tmp_path / "Skope" / "mixing-basics-ep1.mp4.json"
    assert saved_path.exists(), f"Expected file at {saved_path}"

    # Round-trip check: the persisted JSON must equal the uploaded document.
    persisted = json.loads(saved_path.read_text())
    uploaded = json.loads(sample_transcript_path.read_text())
    assert persisted == uploaded
|
|
|
|
|
|
# ── Error tests ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
async def test_ingest_rejects_invalid_json(client, tmp_path):
    """Uploading a non-JSON file returns 422."""
    broken = tmp_path / "bad.json"
    broken.write_text("this is not valid json {{{")

    response = await client.post(
        INGEST_URL,
        files={"file": ("bad.json", broken.read_bytes(), "application/json")},
    )
    assert response.status_code == 422, (
        f"Expected 422, got {response.status_code}: {response.text}"
    )
    assert "JSON parse error" in response.json()["detail"]
|
|
|
|
|
|
async def test_ingest_rejects_missing_fields(client, tmp_path):
    """Uploading JSON without required fields returns 422."""
    # Valid JSON, but deliberately missing creator_folder and segments.
    partial = tmp_path / "incomplete.json"
    partial.write_text(json.dumps({"source_file": "test.mp4", "duration_seconds": 100}))

    response = await client.post(
        INGEST_URL,
        files={"file": ("incomplete.json", partial.read_bytes(), "application/json")},
    )
    assert response.status_code == 422, (
        f"Expected 422, got {response.status_code}: {response.text}"
    )
    assert "Missing required keys" in response.json()["detail"]
|