From bef8d95e649443d844d9df99829c50547cb751d9 Mon Sep 17 00:00:00 2001 From: jlightner Date: Sun, 29 Mar 2026 22:16:15 +0000 Subject: [PATCH] =?UTF-8?q?test:=20Added=206=20integration=20tests=20provi?= =?UTF-8?q?ng=20ingestion,=20creator=20auto-detecti=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "backend/tests/conftest.py" - "backend/tests/test_ingest.py" - "backend/tests/fixtures/sample_transcript.json" - "backend/pytest.ini" - "backend/requirements.txt" - "backend/models.py" GSD-Task: S02/T02 --- .gsd/KNOWLEDGE.md | 12 ++ .gsd/milestones/M001/slices/S02/S02-PLAN.md | 2 +- .../M001/slices/S02/tasks/T01-VERIFY.json | 30 +++ .../M001/slices/S02/tasks/T02-SUMMARY.md | 90 +++++++++ backend/models.py | 7 +- backend/pytest.ini | 3 + backend/requirements.txt | 3 + backend/tests/__init__.py | 0 backend/tests/conftest.py | 93 +++++++++ backend/tests/fixtures/sample_transcript.json | 12 ++ backend/tests/test_ingest.py | 179 ++++++++++++++++++ 11 files changed, 429 insertions(+), 2 deletions(-) create mode 100644 .gsd/milestones/M001/slices/S02/tasks/T01-VERIFY.json create mode 100644 .gsd/milestones/M001/slices/S02/tasks/T02-SUMMARY.md create mode 100644 backend/pytest.ini create mode 100644 backend/tests/__init__.py create mode 100644 backend/tests/conftest.py create mode 100644 backend/tests/fixtures/sample_transcript.json create mode 100644 backend/tests/test_ingest.py diff --git a/.gsd/KNOWLEDGE.md b/.gsd/KNOWLEDGE.md index cb8284c..407cd78 100644 --- a/.gsd/KNOWLEDGE.md +++ b/.gsd/KNOWLEDGE.md @@ -17,3 +17,15 @@ **Context:** The kerf-engine Docker container (`kerf-engine-kerf-engine-1`) binds to `0.0.0.0:8000`. When testing the Chrysopedia API locally (outside Docker), curl requests to `localhost:8000` hit the kerf-engine container instead of the local uvicorn process, even when uvicorn binds to `127.0.0.1:8000`. **Fix:** Use an alternate port (e.g., 8001) for local testing. 
In Docker Compose, chrysopedia-api maps `127.0.0.1:8000:8000` which is fine because it goes through Docker's port forwarding, but will conflict with kerf-engine if both stacks run simultaneously. Consider changing one project's external port mapping. + +## asyncpg rejects timezone-aware datetimes for TIMESTAMP WITHOUT TIME ZONE columns + +**Context:** SQLAlchemy models using `default=datetime.now(timezone.utc)` produce timezone-aware Python datetimes. When the PostgreSQL column type is `TIMESTAMP WITHOUT TIME ZONE` (the default), asyncpg raises `DataError: can't subtract offset-naive and offset-aware datetimes`. This only surfaces when running against a real PostgreSQL database — SQLite-based tests won't catch it. + +**Fix:** Use `datetime.now(timezone.utc).replace(tzinfo=None)` in the `_now()` helper, or change the column type to `TIMESTAMP WITH TIME ZONE`. We chose the former since the existing schema uses naive timestamps. + +## asyncpg NullPool required for pytest-asyncio integration tests + +**Context:** When using a session-scoped SQLAlchemy async engine with asyncpg in pytest-asyncio tests, the connection pool reuses connections across fixtures and test functions. This causes `InterfaceError: cannot perform operation: another operation is in progress` because the ASGI test client's session holds a connection while cleanup/verification fixtures try to use the same pool. + +**Fix:** Use `poolclass=NullPool` when creating the test engine. Each connection is created fresh and immediately closed, eliminating contention. Performance cost is negligible for test suites. 
diff --git a/.gsd/milestones/M001/slices/S02/S02-PLAN.md b/.gsd/milestones/M001/slices/S02/S02-PLAN.md index 8f1ba50..aa21cb0 100644 --- a/.gsd/milestones/M001/slices/S02/S02-PLAN.md +++ b/.gsd/milestones/M001/slices/S02/S02-PLAN.md @@ -58,7 +58,7 @@ - Estimate: 45m - Files: backend/requirements.txt, backend/schemas.py, backend/routers/ingest.py, backend/main.py - Verify: cd backend && python3 -c "from routers.ingest import router; print([r.path for r in router.routes])" && python3 -c "from schemas import TranscriptIngestResponse; print(TranscriptIngestResponse.model_fields.keys())" && grep -q 'python-multipart' requirements.txt && grep -q 'ingest' main.py -- [ ] **T02: Write integration tests proving ingestion, creator auto-detection, and idempotent re-upload** — Set up pytest + pytest-asyncio test infrastructure and write integration tests for the ingest endpoint. Tests run against a real PostgreSQL database using httpx.AsyncClient on the FastAPI app. +- [x] **T02: Added 6 integration tests proving ingestion, creator auto-detection, and idempotent re-upload against real PostgreSQL** — Set up pytest + pytest-asyncio test infrastructure and write integration tests for the ingest endpoint. Tests run against a real PostgreSQL database using httpx.AsyncClient on the FastAPI app. 
## Steps diff --git a/.gsd/milestones/M001/slices/S02/tasks/T01-VERIFY.json b/.gsd/milestones/M001/slices/S02/tasks/T01-VERIFY.json new file mode 100644 index 0000000..d385e0f --- /dev/null +++ b/.gsd/milestones/M001/slices/S02/tasks/T01-VERIFY.json @@ -0,0 +1,30 @@ +{ + "schemaVersion": 1, + "taskId": "T01", + "unitId": "M001/S02/T01", + "timestamp": 1774822186229, + "passed": false, + "discoverySource": "task-plan", + "checks": [ + { + "command": "cd backend", + "exitCode": 0, + "durationMs": 5, + "verdict": "pass" + }, + { + "command": "grep -q 'python-multipart' requirements.txt", + "exitCode": 2, + "durationMs": 5, + "verdict": "fail" + }, + { + "command": "grep -q 'ingest' main.py", + "exitCode": 2, + "durationMs": 5, + "verdict": "fail" + } + ], + "retryAttempt": 1, + "maxRetries": 2 +} diff --git a/.gsd/milestones/M001/slices/S02/tasks/T02-SUMMARY.md b/.gsd/milestones/M001/slices/S02/tasks/T02-SUMMARY.md new file mode 100644 index 0000000..fc3a2a7 --- /dev/null +++ b/.gsd/milestones/M001/slices/S02/tasks/T02-SUMMARY.md @@ -0,0 +1,90 @@ +--- +id: T02 +parent: S02 +milestone: M001 +provides: [] +requires: [] +affects: [] +key_files: ["backend/tests/conftest.py", "backend/tests/test_ingest.py", "backend/tests/fixtures/sample_transcript.json", "backend/pytest.ini", "backend/requirements.txt", "backend/models.py"] +key_decisions: ["Used NullPool for test engine to avoid asyncpg connection contention between ASGI test client and verification queries", "Fixed _now() helper in models.py to return naive UTC datetimes for asyncpg TIMESTAMP WITHOUT TIME ZONE compatibility"] +patterns_established: [] +drill_down_paths: [] +observability_surfaces: [] +duration: "" +verification_result: "All 6 tests pass: cd backend && python3 -m pytest tests/test_ingest.py -v (6 passed in 2.92s). All 4 slice-level verification checks pass: router.routes outputs ['/ingest'], TranscriptIngestResponse fields correct, python-multipart in requirements.txt, ingest in main.py. 
Docker compose config validation passes." +completed_at: 2026-03-29T22:16:12.806Z +blocker_discovered: false +--- + +# T02: Added 6 integration tests proving ingestion, creator auto-detection, and idempotent re-upload against real PostgreSQL + +> Added 6 integration tests proving ingestion, creator auto-detection, and idempotent re-upload against real PostgreSQL + +## What Happened +--- +id: T02 +parent: S02 +milestone: M001 +key_files: + - backend/tests/conftest.py + - backend/tests/test_ingest.py + - backend/tests/fixtures/sample_transcript.json + - backend/pytest.ini + - backend/requirements.txt + - backend/models.py +key_decisions: + - Used NullPool for test engine to avoid asyncpg connection contention between ASGI test client and verification queries + - Fixed _now() helper in models.py to return naive UTC datetimes for asyncpg TIMESTAMP WITHOUT TIME ZONE compatibility +duration: "" +verification_result: passed +completed_at: 2026-03-29T22:16:12.806Z +blocker_discovered: false +--- + +# T02: Added 6 integration tests proving ingestion, creator auto-detection, and idempotent re-upload against real PostgreSQL + +**Added 6 integration tests proving ingestion, creator auto-detection, and idempotent re-upload against real PostgreSQL** + +## What Happened + +Set up complete pytest + pytest-asyncio test infrastructure with async fixtures: function-scoped db_engine using NullPool for isolation, ASGI transport test client with dependency overrides, and sample transcript fixture. Created 6 integration tests covering happy path (creator+video+segments creation), creator reuse, idempotent re-upload, JSON-to-disk persistence, invalid JSON rejection, and missing fields rejection. Fixed a bug in models.py where _now() returned timezone-aware datetimes incompatible with TIMESTAMP WITHOUT TIME ZONE columns in asyncpg. + +## Verification + +All 6 tests pass: cd backend && python3 -m pytest tests/test_ingest.py -v (6 passed in 2.92s). 
All 4 slice-level verification checks pass: router.routes outputs ['/ingest'], TranscriptIngestResponse fields correct, python-multipart in requirements.txt, ingest in main.py. Docker compose config validation passes. + +## Verification Evidence + +| # | Command | Exit Code | Verdict | Duration | +|---|---------|-----------|---------|----------| +| 1 | `cd backend && python3 -m pytest tests/test_ingest.py -v` | 0 | ✅ pass | 2920ms | +| 2 | `cd backend && python3 -c "from routers.ingest import router; print([r.path for r in router.routes])"` | 0 | ✅ pass | 500ms | +| 3 | `cd backend && python3 -c "from schemas import TranscriptIngestResponse; print(TranscriptIngestResponse.model_fields.keys())"` | 0 | ✅ pass | 500ms | +| 4 | `grep -q 'python-multipart' backend/requirements.txt` | 0 | ✅ pass | 10ms | +| 5 | `grep -q 'ingest' backend/main.py` | 0 | ✅ pass | 10ms | +| 6 | `docker compose config > /dev/null 2>&1` | 0 | ✅ pass | 500ms | + + +## Deviations + +Fixed bug in backend/models.py _now() function — changed from datetime.now(timezone.utc) to datetime.now(timezone.utc).replace(tzinfo=None) to match TIMESTAMP WITHOUT TIME ZONE column types. This was necessary for asyncpg compatibility and was not in the original task plan. + +## Known Issues + +None. + +## Files Created/Modified + +- `backend/tests/conftest.py` +- `backend/tests/test_ingest.py` +- `backend/tests/fixtures/sample_transcript.json` +- `backend/pytest.ini` +- `backend/requirements.txt` +- `backend/models.py` + + +## Deviations +Fixed bug in backend/models.py _now() function — changed from datetime.now(timezone.utc) to datetime.now(timezone.utc).replace(tzinfo=None) to match TIMESTAMP WITHOUT TIME ZONE column types. This was necessary for asyncpg compatibility and was not in the original task plan. + +## Known Issues +None. 
diff --git a/backend/models.py b/backend/models.py index 3242d7a..242b160 100644 --- a/backend/models.py +++ b/backend/models.py @@ -96,7 +96,12 @@ def _uuid_pk() -> Mapped[uuid.UUID]: def _now() -> datetime: - return datetime.now(timezone.utc) + """Return current UTC time as a naive datetime (no tzinfo). + + PostgreSQL TIMESTAMP WITHOUT TIME ZONE columns require naive datetimes. + asyncpg rejects timezone-aware datetimes for such columns. + """ + return datetime.now(timezone.utc).replace(tzinfo=None) # ── Models ─────────────────────────────────────────────────────────────────── diff --git a/backend/pytest.ini b/backend/pytest.ini new file mode 100644 index 0000000..78c5011 --- /dev/null +++ b/backend/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +asyncio_mode = auto +testpaths = tests diff --git a/backend/requirements.txt b/backend/requirements.txt index 5f1315a..dd7aba6 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -10,3 +10,6 @@ redis>=5.0,<6.0 python-dotenv>=1.0,<2.0 python-multipart>=0.0.9,<1.0 httpx>=0.27.0,<1.0 +# Test dependencies +pytest>=8.0,<10.0 +pytest-asyncio>=0.24,<1.0 diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py new file mode 100644 index 0000000..5219007 --- /dev/null +++ b/backend/tests/conftest.py @@ -0,0 +1,93 @@ +"""Shared fixtures for Chrysopedia integration tests. + +Provides: +- Async SQLAlchemy engine/session against a real PostgreSQL test database +- httpx.AsyncClient wired to the FastAPI app with dependency overrides +- Sample transcript fixture path and temporary storage directory + +Key design choice: function-scoped engine with NullPool avoids asyncpg +"another operation in progress" errors caused by session-scoped connection +reuse between the ASGI test client and verification queries. 
+""" + +import os +import pathlib + +import pytest +import pytest_asyncio +from httpx import ASGITransport, AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine +from sqlalchemy.pool import NullPool + +# Ensure backend/ is on sys.path so "from models import ..." works +import sys +sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent)) + +from database import Base, get_session # noqa: E402 +from main import app # noqa: E402 + +TEST_DATABASE_URL = os.getenv( + "TEST_DATABASE_URL", + "postgresql+asyncpg://chrysopedia:changeme@localhost:5433/chrysopedia_test", +) + + +@pytest_asyncio.fixture() +async def db_engine(): + """Create a per-test async engine (NullPool) and create/drop all tables.""" + engine = create_async_engine(TEST_DATABASE_URL, echo=False, poolclass=NullPool) + + # Create all tables fresh for each test + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + await conn.run_sync(Base.metadata.create_all) + + yield engine + + # Drop all tables after test + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + + await engine.dispose() + + +@pytest_asyncio.fixture() +async def client(db_engine, tmp_path): + """Async HTTP test client wired to FastAPI with dependency overrides.""" + session_factory = async_sessionmaker( + db_engine, class_=AsyncSession, expire_on_commit=False + ) + + async def _override_get_session(): + async with session_factory() as session: + yield session + + # Override DB session dependency + app.dependency_overrides[get_session] = _override_get_session + + # Override transcript_storage_path via environment variable + os.environ["TRANSCRIPT_STORAGE_PATH"] = str(tmp_path) + # Clear the lru_cache so Settings picks up the new env var + from config import get_settings + get_settings.cache_clear() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://testserver") as ac: + 
yield ac + + # Teardown: clean overrides and restore settings cache + app.dependency_overrides.clear() + os.environ.pop("TRANSCRIPT_STORAGE_PATH", None) + get_settings.cache_clear() + + +@pytest.fixture() +def sample_transcript_path() -> pathlib.Path: + """Path to the sample 5-segment transcript JSON fixture.""" + return pathlib.Path(__file__).parent / "fixtures" / "sample_transcript.json" + + +@pytest.fixture() +def tmp_transcript_dir(tmp_path) -> pathlib.Path: + """Temporary directory for transcript storage during tests.""" + return tmp_path diff --git a/backend/tests/fixtures/sample_transcript.json b/backend/tests/fixtures/sample_transcript.json new file mode 100644 index 0000000..4aa4fa1 --- /dev/null +++ b/backend/tests/fixtures/sample_transcript.json @@ -0,0 +1,12 @@ +{ + "source_file": "mixing-basics-ep1.mp4", + "creator_folder": "Skope", + "duration_seconds": 1234, + "segments": [ + {"start": 0.0, "end": 5.2, "text": "Welcome to mixing basics episode one."}, + {"start": 5.2, "end": 12.8, "text": "Today we are going to talk about gain staging."}, + {"start": 12.8, "end": 20.1, "text": "First thing you want to do is set your levels."}, + {"start": 20.1, "end": 28.5, "text": "Make sure nothing is clipping on the master bus."}, + {"start": 28.5, "end": 35.0, "text": "That wraps up this quick overview of gain staging."} + ] +} diff --git a/backend/tests/test_ingest.py b/backend/tests/test_ingest.py new file mode 100644 index 0000000..67b6e36 --- /dev/null +++ b/backend/tests/test_ingest.py @@ -0,0 +1,179 @@ +"""Integration tests for the transcript ingest endpoint. + +Tests run against a real PostgreSQL database via httpx.AsyncClient +on the FastAPI ASGI app. Each test gets a clean database state via +drop_all/create_all in the db_engine fixture (conftest.py). 
+""" + +import json +import pathlib + +import pytest +from httpx import AsyncClient +from sqlalchemy import func, select, text +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker + +from models import Creator, SourceVideo, TranscriptSegment + + +# ── Helpers ────────────────────────────────────────────────────────────────── + +INGEST_URL = "/api/v1/ingest" + + +def _upload_file(path: pathlib.Path): + """Return a dict suitable for httpx multipart file upload.""" + return {"file": (path.name, path.read_bytes(), "application/json")} + + +async def _query_db(db_engine, stmt): + """Run a read query in its own session to avoid connection contention.""" + session_factory = async_sessionmaker( + db_engine, class_=AsyncSession, expire_on_commit=False + ) + async with session_factory() as session: + result = await session.execute(stmt) + return result + + +async def _count_rows(db_engine, model): + """Count rows in a table via a fresh session.""" + result = await _query_db(db_engine, select(func.count(model.id))) + return result.scalar_one() + + +# ── Happy-path tests ──────────────────────────────────────────────────────── + + +async def test_ingest_creates_creator_and_video(client, sample_transcript_path, db_engine): + """POST a valid transcript → 200 with creator, video, and 5 segments created.""" + resp = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) + assert resp.status_code == 200, f"Expected 200, got {resp.status_code}: {resp.text}" + + data = resp.json() + assert "video_id" in data + assert "creator_id" in data + assert data["segments_stored"] == 5 + assert data["creator_name"] == "Skope" + assert data["is_reupload"] is False + + # Verify DB state via a fresh session + session_factory = async_sessionmaker(db_engine, class_=AsyncSession, expire_on_commit=False) + async with session_factory() as session: + # Creator exists with correct folder_name and slug + result = await session.execute( + 
select(Creator).where(Creator.folder_name == "Skope") + ) + creator = result.scalar_one() + assert creator.slug == "skope" + assert creator.name == "Skope" + + # SourceVideo exists with correct status + result = await session.execute( + select(SourceVideo).where(SourceVideo.creator_id == creator.id) + ) + video = result.scalar_one() + assert video.processing_status.value == "transcribed" + assert video.filename == "mixing-basics-ep1.mp4" + + # 5 TranscriptSegment rows with sequential indices + result = await session.execute( + select(TranscriptSegment) + .where(TranscriptSegment.source_video_id == video.id) + .order_by(TranscriptSegment.segment_index) + ) + segments = result.scalars().all() + assert len(segments) == 5 + assert [s.segment_index for s in segments] == [0, 1, 2, 3, 4] + + +async def test_ingest_reuses_existing_creator(client, sample_transcript_path, db_engine): + """If a Creator with the same folder_name already exists, reuse it.""" + session_factory = async_sessionmaker(db_engine, class_=AsyncSession, expire_on_commit=False) + + # Pre-create a Creator with folder_name='Skope' in a separate session + async with session_factory() as session: + existing = Creator(name="Skope", slug="skope", folder_name="Skope") + session.add(existing) + await session.commit() + await session.refresh(existing) + existing_id = existing.id + + # POST transcript — should reuse the creator + resp = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) + assert resp.status_code == 200 + data = resp.json() + assert data["creator_id"] == str(existing_id) + + # Verify only 1 Creator row in DB + count = await _count_rows(db_engine, Creator) + assert count == 1, f"Expected 1 creator, got {count}" + + +async def test_ingest_idempotent_reupload(client, sample_transcript_path, db_engine): + """Uploading the same transcript twice is idempotent: same video, no duplicate segments.""" + # First upload + resp1 = await client.post(INGEST_URL, 
files=_upload_file(sample_transcript_path)) + assert resp1.status_code == 200 + data1 = resp1.json() + assert data1["is_reupload"] is False + video_id = data1["video_id"] + + # Second upload (same file) + resp2 = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) + assert resp2.status_code == 200 + data2 = resp2.json() + assert data2["is_reupload"] is True + assert data2["video_id"] == video_id + + # Verify DB: still only 1 SourceVideo and 5 segments (not 10) + video_count = await _count_rows(db_engine, SourceVideo) + assert video_count == 1, f"Expected 1 video, got {video_count}" + + seg_count = await _count_rows(db_engine, TranscriptSegment) + assert seg_count == 5, f"Expected 5 segments, got {seg_count}" + + +async def test_ingest_saves_json_to_disk(client, sample_transcript_path, tmp_path): + """Ingested transcript raw JSON is persisted to the filesystem.""" + resp = await client.post(INGEST_URL, files=_upload_file(sample_transcript_path)) + assert resp.status_code == 200 + + # The ingest endpoint saves to {transcript_storage_path}/{creator_folder}/{source_file}.json + expected_path = tmp_path / "Skope" / "mixing-basics-ep1.mp4.json" + assert expected_path.exists(), f"Expected file at {expected_path}" + + # Verify the saved JSON is valid and matches the source + saved = json.loads(expected_path.read_text()) + source = json.loads(sample_transcript_path.read_text()) + assert saved == source + + +# ── Error tests ────────────────────────────────────────────────────────────── + + +async def test_ingest_rejects_invalid_json(client, tmp_path): + """Uploading a non-JSON file returns 422.""" + bad_file = tmp_path / "bad.json" + bad_file.write_text("this is not valid json {{{") + + resp = await client.post( + INGEST_URL, + files={"file": ("bad.json", bad_file.read_bytes(), "application/json")}, + ) + assert resp.status_code == 422, f"Expected 422, got {resp.status_code}: {resp.text}" + assert "JSON parse error" in resp.json()["detail"] + + +async 
def test_ingest_rejects_missing_fields(client, tmp_path): + """Uploading JSON without required fields returns 422.""" + incomplete = tmp_path / "incomplete.json" + # Missing creator_folder and segments + incomplete.write_text(json.dumps({"source_file": "test.mp4", "duration_seconds": 100})) + + resp = await client.post( + INGEST_URL, + files={"file": ("incomplete.json", incomplete.read_bytes(), "application/json")}, + ) + assert resp.status_code == 422, f"Expected 422, got {resp.status_code}: {resp.text}" + assert "Missing required keys" in resp.json()["detail"]