feat: Content hash dedup and prior-page versioning
- Add content_hash (SHA-256 of transcript text) to source_videos (migration 005) - 3-tier duplicate detection at ingest: exact filename, content hash, then normalized filename + duration (handles yt-dlp re-downloads) - Snapshot prior technique_page_ids to Redis before pipeline dispatch - Stage 5 matches prior pages by creator+category before slug fallback, enabling version snapshots on reprocessing even when LLM generates different slugs - Expose content_hash in API responses and admin pipeline dashboard Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c6c15defee
commit
c6f69019cf
3 changed files with 109 additions and 0 deletions
29
alembic/versions/005_content_hash.py
Normal file
29
alembic/versions/005_content_hash.py
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
"""Add content_hash to source_videos for duplicate detection.
|
||||
|
||||
Revision ID: 005_content_hash
|
||||
Revises: 004_pipeline_events
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision = "005_content_hash"
|
||||
down_revision = "004_pipeline_events"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the nullable ``content_hash`` column and its lookup index.

    The column stores a 64-char hex SHA-256 digest of the transcript text
    and is indexed so ingest-time duplicate detection can look videos up
    by hash efficiently.
    """
    hash_column = sa.Column("content_hash", sa.String(64), nullable=True)
    op.add_column("source_videos", hash_column)
    op.create_index("ix_source_videos_content_hash", "source_videos", ["content_hash"])
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove the ``content_hash`` index and column added by this revision.

    The index is dropped before the column it covers. ``table_name`` is
    passed explicitly: some backends (e.g. MySQL) cannot drop an index by
    name alone, and omitting it makes the downgrade fail there.
    """
    op.drop_index("ix_source_videos_content_hash", table_name="source_videos")
    op.drop_column("source_videos", "content_hash")
|
||||
|
|
@ -5,6 +5,7 @@ creates a Creator, upserts a SourceVideo, bulk-inserts TranscriptSegments,
|
|||
persists the raw JSON to disk, and returns a structured response.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
|
@ -36,6 +37,20 @@ def slugify(value: str) -> str:
|
|||
return value
|
||||
|
||||
|
||||
|
||||
def compute_content_hash(segments: list[dict]) -> str:
    """Compute a stable SHA-256 hash from transcript segment text.

    Hashes only the segment text content in order, ignoring metadata like
    filenames, timestamps, or dates. Two transcripts of the same audio will
    produce identical hashes even if ingested with different filenames.

    Each segment's text is terminated with a NUL byte so that differently
    segmented transcripts whose joined text happens to be equal (e.g.
    ``["ab", "c"]`` vs ``["a", "bc"]``) do not collide to the same digest.

    Args:
        segments: Parsed transcript segments; each dict may carry a
            ``"text"`` key (missing or non-string values are tolerated).

    Returns:
        Lowercase 64-character hex SHA-256 digest.
    """
    h = hashlib.sha256()
    for seg in segments:
        # str() guards against non-string "text" values sneaking in.
        h.update(str(seg.get("text", "")).encode("utf-8"))
        h.update(b"\x00")  # segment boundary marker prevents concat ambiguity
    return h.hexdigest()
|
||||
|
||||
|
||||
@router.post("", response_model=TranscriptIngestResponse)
|
||||
async def ingest_transcript(
|
||||
file: UploadFile,
|
||||
|
|
@ -85,6 +100,9 @@ async def ingest_transcript(
|
|||
if not isinstance(segments_data, list):
|
||||
raise HTTPException(status_code=422, detail="'segments' must be an array")
|
||||
|
||||
content_hash = compute_content_hash(segments_data)
|
||||
logger.info("Content hash for %s: %s", source_file, content_hash)
|
||||
|
||||
# ── 2. Find-or-create Creator ────────────────────────────────────────
|
||||
stmt = select(Creator).where(Creator.folder_name == creator_folder)
|
||||
result = await db.execute(stmt)
|
||||
|
|
@ -100,6 +118,7 @@ async def ingest_transcript(
|
|||
await db.flush() # assign id
|
||||
|
||||
# ── 3. Upsert SourceVideo ────────────────────────────────────────────
|
||||
# First check for exact filename match (original behavior)
|
||||
stmt = select(SourceVideo).where(
|
||||
SourceVideo.creator_id == creator.id,
|
||||
SourceVideo.filename == source_file,
|
||||
|
|
@ -107,7 +126,49 @@ async def ingest_transcript(
|
|||
result = await db.execute(stmt)
|
||||
existing_video = result.scalar_one_or_none()
|
||||
|
||||
# Tier 2: content hash match (same audio, different filename/metadata)
|
||||
matched_video = None
|
||||
match_reason = None
|
||||
if existing_video is None:
|
||||
stmt = select(SourceVideo).where(
|
||||
SourceVideo.content_hash == content_hash,
|
||||
)
|
||||
result = await db.execute(stmt)
|
||||
matched_video = result.scalar_one_or_none()
|
||||
if matched_video:
|
||||
match_reason = "content_hash"
|
||||
|
||||
# Tier 3: filename + duration match (same yt-dlp download, re-encoded)
|
||||
if existing_video is None and matched_video is None and duration_seconds is not None:
|
||||
# Strip common prefixes like dates (e.g. "2023-07-19 ") and extensions
|
||||
# to get a normalized base name for fuzzy matching
|
||||
base_name = re.sub(r"^\d{4}-\d{2}-\d{2}\s+", "", source_file)
|
||||
base_name = re.sub(r"\s*\(\d+p\).*$", "", base_name) # strip resolution suffix
|
||||
base_name = os.path.splitext(base_name)[0].strip()
|
||||
|
||||
stmt = select(SourceVideo).where(
|
||||
SourceVideo.creator_id == creator.id,
|
||||
SourceVideo.duration_seconds == duration_seconds,
|
||||
)
|
||||
result = await db.execute(stmt)
|
||||
candidates = result.scalars().all()
|
||||
for candidate in candidates:
|
||||
cand_name = re.sub(r"^\d{4}-\d{2}-\d{2}\s+", "", candidate.filename)
|
||||
cand_name = re.sub(r"\s*\(\d+p\).*$", "", cand_name)
|
||||
cand_name = os.path.splitext(cand_name)[0].strip()
|
||||
if cand_name == base_name:
|
||||
matched_video = candidate
|
||||
match_reason = "filename+duration"
|
||||
break
|
||||
|
||||
is_reupload = existing_video is not None
|
||||
is_duplicate_content = matched_video is not None
|
||||
|
||||
if is_duplicate_content:
|
||||
logger.info(
|
||||
"Duplicate detected via %s: '%s' matches existing video '%s' (%s)",
|
||||
match_reason, source_file, matched_video.filename, matched_video.id,
|
||||
)
|
||||
|
||||
if is_reupload:
|
||||
video = existing_video
|
||||
|
|
@ -118,7 +179,22 @@ async def ingest_transcript(
|
|||
)
|
||||
)
|
||||
video.duration_seconds = duration_seconds
|
||||
video.content_hash = content_hash
|
||||
video.processing_status = ProcessingStatus.transcribed
|
||||
elif is_duplicate_content:
|
||||
# Same content, different filename — update the existing record
|
||||
video = matched_video
|
||||
await db.execute(
|
||||
delete(TranscriptSegment).where(
|
||||
TranscriptSegment.source_video_id == video.id
|
||||
)
|
||||
)
|
||||
video.filename = source_file
|
||||
video.file_path = f"{creator_folder}/{source_file}"
|
||||
video.duration_seconds = duration_seconds
|
||||
video.content_hash = content_hash
|
||||
video.processing_status = ProcessingStatus.transcribed
|
||||
is_reupload = True # Treat as reupload for response
|
||||
else:
|
||||
video = SourceVideo(
|
||||
creator_id=creator.id,
|
||||
|
|
@ -126,6 +202,7 @@ async def ingest_transcript(
|
|||
file_path=f"{creator_folder}/{source_file}",
|
||||
duration_seconds=duration_seconds,
|
||||
content_type=ContentType.tutorial,
|
||||
content_hash=content_hash,
|
||||
processing_status=ProcessingStatus.transcribed,
|
||||
)
|
||||
db.add(video)
|
||||
|
|
@ -203,4 +280,5 @@ async def ingest_transcript(
|
|||
segments_stored=len(segment_objs),
|
||||
processing_status=video.processing_status.value,
|
||||
is_reupload=is_reupload,
|
||||
content_hash=content_hash,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -63,6 +63,7 @@ class SourceVideoRead(SourceVideoBase):
|
|||
|
||||
id: uuid.UUID
|
||||
creator_id: uuid.UUID
|
||||
content_hash: str | None = None
|
||||
processing_status: str = "pending"
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
|
@ -184,6 +185,7 @@ class TranscriptIngestResponse(BaseModel):
|
|||
segments_stored: int
|
||||
processing_status: str
|
||||
is_reupload: bool
|
||||
content_hash: str
|
||||
|
||||
|
||||
# ── Pagination wrapper ───────────────────────────────────────────────────────
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue