Auto-mode commit 7aa33cd accidentally deleted 78 files (14,814 lines) during M005
execution. Subsequent commits rebuilt some frontend files but backend/, alembic/,
tests/, whisper/, docker configs, and prompts were never restored in this repo.
This commit restores the full project tree by syncing from ub01's working directory,
which has all M001-M007 features running in production containers.
Restored: backend/ (config, models, routers, database, redis, search_service, worker),
alembic/ (6 migrations), docker/ (Dockerfiles, nginx, compose), prompts/ (4 stages),
tests/, whisper/, README.md, .env.example, chrysopedia-spec.md
284 lines
10 KiB
Python
"""Transcript ingestion endpoint for the Chrysopedia API.
|
|
|
|
Accepts a Whisper-format transcript JSON via multipart file upload, finds or
|
|
creates a Creator, upserts a SourceVideo, bulk-inserts TranscriptSegments,
|
|
persists the raw JSON to disk, and returns a structured response.
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import uuid
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, UploadFile
|
|
from sqlalchemy import delete, select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from config import get_settings
|
|
from database import get_session
|
|
from models import ContentType, Creator, ProcessingStatus, SourceVideo, TranscriptSegment
|
|
from schemas import TranscriptIngestResponse
|
|
|
|
# Module-scoped logger for transcript-ingest operations.
logger = logging.getLogger("chrysopedia.ingest")

# All routes in this module are mounted under /ingest.
router = APIRouter(prefix="/ingest", tags=["ingest"])

# Top-level keys every uploaded transcript JSON must contain; validated
# against the parsed payload before any database work begins.
REQUIRED_KEYS = {"source_file", "creator_folder", "duration_seconds", "segments"}
|
|
|
|
|
|
def slugify(value: str) -> str:
    """Convert *value* to a lowercase, URL-safe slug.

    Lowercases the input, replaces each run of non-alphanumeric characters
    with a single hyphen, and strips leading/trailing hyphens.

    Args:
        value: Arbitrary display string (e.g. a creator folder name).

    Returns:
        The slug, or "" when *value* contains no alphanumeric characters.
    """
    # The greedy character class [^a-z0-9]+ already collapses runs into a
    # single hyphen, so the original's separate "-{2,}" collapse pass was
    # dead code and has been removed — output is unchanged.
    return re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
|
|
|
|
|
|
|
|
def compute_content_hash(segments: list[dict]) -> str:
    """Compute a stable SHA-256 hash from transcript segment text.

    Only the ordered text of each segment contributes to the digest;
    filenames, timestamps, dates, and other metadata are ignored. Two
    transcripts of the same audio therefore hash identically even when
    ingested under different filenames.

    Args:
        segments: Whisper-style segment dicts; each may carry a "text" key.

    Returns:
        Hex-encoded SHA-256 digest of the concatenated segment texts.
    """
    # UTF-8 encoding of a concatenation equals the concatenation of the
    # per-piece encodings, so one hash over the joined string is identical
    # to incrementally updating the hash segment by segment.
    joined = "".join(str(segment.get("text", "")) for segment in segments)
    return hashlib.sha256(joined.encode("utf-8")).hexdigest()
|
|
|
|
|
|
@router.post("", response_model=TranscriptIngestResponse)
|
|
async def ingest_transcript(
|
|
file: UploadFile,
|
|
db: AsyncSession = Depends(get_session),
|
|
) -> TranscriptIngestResponse:
|
|
"""Ingest a Whisper transcript JSON file.
|
|
|
|
Workflow:
|
|
1. Parse and validate the uploaded JSON.
|
|
2. Find-or-create a Creator by folder_name.
|
|
3. Upsert a SourceVideo by (creator_id, filename).
|
|
4. Bulk-insert TranscriptSegment rows.
|
|
5. Save raw JSON to transcript_storage_path.
|
|
6. Return structured response.
|
|
"""
|
|
settings = get_settings()
|
|
|
|
# ── 1. Read & parse JSON ─────────────────────────────────────────────
|
|
try:
|
|
raw_bytes = await file.read()
|
|
raw_text = raw_bytes.decode("utf-8")
|
|
except Exception as exc:
|
|
raise HTTPException(status_code=400, detail=f"Invalid file: {exc}") from exc
|
|
|
|
try:
|
|
data = json.loads(raw_text)
|
|
except json.JSONDecodeError as exc:
|
|
raise HTTPException(
|
|
status_code=422, detail=f"JSON parse error: {exc}"
|
|
) from exc
|
|
|
|
if not isinstance(data, dict):
|
|
raise HTTPException(status_code=422, detail="Expected a JSON object at the top level")
|
|
|
|
missing = REQUIRED_KEYS - data.keys()
|
|
if missing:
|
|
raise HTTPException(
|
|
status_code=422,
|
|
detail=f"Missing required keys: {', '.join(sorted(missing))}",
|
|
)
|
|
|
|
source_file: str = data["source_file"]
|
|
creator_folder: str = data["creator_folder"]
|
|
duration_seconds: int | None = data.get("duration_seconds")
|
|
segments_data: list = data["segments"]
|
|
|
|
if not isinstance(segments_data, list):
|
|
raise HTTPException(status_code=422, detail="'segments' must be an array")
|
|
|
|
content_hash = compute_content_hash(segments_data)
|
|
logger.info("Content hash for %s: %s", source_file, content_hash)
|
|
|
|
# ── 2. Find-or-create Creator ────────────────────────────────────────
|
|
stmt = select(Creator).where(Creator.folder_name == creator_folder)
|
|
result = await db.execute(stmt)
|
|
creator = result.scalar_one_or_none()
|
|
|
|
if creator is None:
|
|
creator = Creator(
|
|
name=creator_folder,
|
|
slug=slugify(creator_folder),
|
|
folder_name=creator_folder,
|
|
)
|
|
db.add(creator)
|
|
await db.flush() # assign id
|
|
|
|
# ── 3. Upsert SourceVideo ────────────────────────────────────────────
|
|
# First check for exact filename match (original behavior)
|
|
stmt = select(SourceVideo).where(
|
|
SourceVideo.creator_id == creator.id,
|
|
SourceVideo.filename == source_file,
|
|
)
|
|
result = await db.execute(stmt)
|
|
existing_video = result.scalar_one_or_none()
|
|
|
|
# Tier 2: content hash match (same audio, different filename/metadata)
|
|
matched_video = None
|
|
match_reason = None
|
|
if existing_video is None:
|
|
stmt = select(SourceVideo).where(
|
|
SourceVideo.content_hash == content_hash,
|
|
)
|
|
result = await db.execute(stmt)
|
|
matched_video = result.scalar_one_or_none()
|
|
if matched_video:
|
|
match_reason = "content_hash"
|
|
|
|
# Tier 3: filename + duration match (same yt-dlp download, re-encoded)
|
|
if existing_video is None and matched_video is None and duration_seconds is not None:
|
|
# Strip common prefixes like dates (e.g. "2023-07-19 ") and extensions
|
|
# to get a normalized base name for fuzzy matching
|
|
base_name = re.sub(r"^\d{4}-\d{2}-\d{2}\s+", "", source_file)
|
|
base_name = re.sub(r"\s*\(\d+p\).*$", "", base_name) # strip resolution suffix
|
|
base_name = os.path.splitext(base_name)[0].strip()
|
|
|
|
stmt = select(SourceVideo).where(
|
|
SourceVideo.creator_id == creator.id,
|
|
SourceVideo.duration_seconds == duration_seconds,
|
|
)
|
|
result = await db.execute(stmt)
|
|
candidates = result.scalars().all()
|
|
for candidate in candidates:
|
|
cand_name = re.sub(r"^\d{4}-\d{2}-\d{2}\s+", "", candidate.filename)
|
|
cand_name = re.sub(r"\s*\(\d+p\).*$", "", cand_name)
|
|
cand_name = os.path.splitext(cand_name)[0].strip()
|
|
if cand_name == base_name:
|
|
matched_video = candidate
|
|
match_reason = "filename+duration"
|
|
break
|
|
|
|
is_reupload = existing_video is not None
|
|
is_duplicate_content = matched_video is not None
|
|
|
|
if is_duplicate_content:
|
|
logger.info(
|
|
"Duplicate detected via %s: '%s' matches existing video '%s' (%s)",
|
|
match_reason, source_file, matched_video.filename, matched_video.id,
|
|
)
|
|
|
|
if is_reupload:
|
|
video = existing_video
|
|
# Delete old segments for idempotent re-upload
|
|
await db.execute(
|
|
delete(TranscriptSegment).where(
|
|
TranscriptSegment.source_video_id == video.id
|
|
)
|
|
)
|
|
video.duration_seconds = duration_seconds
|
|
video.content_hash = content_hash
|
|
video.processing_status = ProcessingStatus.transcribed
|
|
elif is_duplicate_content:
|
|
# Same content, different filename — update the existing record
|
|
video = matched_video
|
|
await db.execute(
|
|
delete(TranscriptSegment).where(
|
|
TranscriptSegment.source_video_id == video.id
|
|
)
|
|
)
|
|
video.filename = source_file
|
|
video.file_path = f"{creator_folder}/{source_file}"
|
|
video.duration_seconds = duration_seconds
|
|
video.content_hash = content_hash
|
|
video.processing_status = ProcessingStatus.transcribed
|
|
is_reupload = True # Treat as reupload for response
|
|
else:
|
|
video = SourceVideo(
|
|
creator_id=creator.id,
|
|
filename=source_file,
|
|
file_path=f"{creator_folder}/{source_file}",
|
|
duration_seconds=duration_seconds,
|
|
content_type=ContentType.tutorial,
|
|
content_hash=content_hash,
|
|
processing_status=ProcessingStatus.transcribed,
|
|
)
|
|
db.add(video)
|
|
await db.flush() # assign id
|
|
|
|
# ── 4. Bulk-insert TranscriptSegments ────────────────────────────────
|
|
segment_objs = [
|
|
TranscriptSegment(
|
|
source_video_id=video.id,
|
|
start_time=float(seg["start"]),
|
|
end_time=float(seg["end"]),
|
|
text=str(seg["text"]),
|
|
segment_index=idx,
|
|
)
|
|
for idx, seg in enumerate(segments_data)
|
|
]
|
|
db.add_all(segment_objs)
|
|
|
|
# ── 5. Save raw JSON to disk ─────────────────────────────────────────
|
|
transcript_dir = os.path.join(
|
|
settings.transcript_storage_path, creator_folder
|
|
)
|
|
transcript_path = os.path.join(transcript_dir, f"{source_file}.json")
|
|
|
|
try:
|
|
os.makedirs(transcript_dir, exist_ok=True)
|
|
with open(transcript_path, "w", encoding="utf-8") as f:
|
|
f.write(raw_text)
|
|
except OSError as exc:
|
|
raise HTTPException(
|
|
status_code=500, detail=f"Failed to save transcript: {exc}"
|
|
) from exc
|
|
|
|
video.transcript_path = transcript_path
|
|
|
|
# ── 6. Commit & respond ──────────────────────────────────────────────
|
|
try:
|
|
await db.commit()
|
|
except Exception as exc:
|
|
await db.rollback()
|
|
logger.error("Database commit failed during ingest: %s", exc)
|
|
raise HTTPException(
|
|
status_code=500, detail="Database error during ingest"
|
|
) from exc
|
|
|
|
await db.refresh(video)
|
|
await db.refresh(creator)
|
|
|
|
# ── 7. Dispatch LLM pipeline (best-effort) ──────────────────────────
|
|
try:
|
|
from pipeline.stages import run_pipeline
|
|
|
|
run_pipeline.delay(str(video.id))
|
|
logger.info("Pipeline dispatched for video_id=%s", video.id)
|
|
except Exception as exc:
|
|
logger.warning(
|
|
"Pipeline dispatch failed for video_id=%s (ingest still succeeds): %s",
|
|
video.id,
|
|
exc,
|
|
)
|
|
|
|
logger.info(
|
|
"Ingested transcript: creator=%s, file=%s, segments=%d, reupload=%s",
|
|
creator.name,
|
|
source_file,
|
|
len(segment_objs),
|
|
is_reupload,
|
|
)
|
|
|
|
return TranscriptIngestResponse(
|
|
video_id=video.id,
|
|
creator_id=creator.id,
|
|
creator_name=creator.name,
|
|
filename=source_file,
|
|
segments_stored=len(segment_objs),
|
|
processing_status=video.processing_status.value,
|
|
is_reupload=is_reupload,
|
|
content_hash=content_hash,
|
|
)
|