# Changelog (from commit message):
# - Add content_hash (SHA-256 of transcript text) to source_videos (migration 005)
# - 3-tier duplicate detection at ingest: exact filename, content hash, then
#   normalized filename + duration (handles yt-dlp re-downloads)
# - Snapshot prior technique_page_ids to Redis before pipeline dispatch
# - Stage 5 matches prior pages by creator+category before slug fallback, enabling
#   version snapshots on reprocessing even when LLM generates different slugs
# - Expose content_hash in API responses and admin pipeline dashboard
#
# Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
"""Add content_hash to source_videos for duplicate detection.
|
|
|
|
Revision ID: 005_content_hash
|
|
Revises: 004_pipeline_events
|
|
"""

from alembic import op
import sqlalchemy as sa

revision = "005_content_hash"
|
|
down_revision = "004_pipeline_events"
|
|
branch_labels = None
|
|
depends_on = None
|
|
|
|
|
|
def upgrade() -> None:
    """Add a nullable ``content_hash`` column to ``source_videos`` and index it.

    The column holds a SHA-256 hex digest (64 characters) of the transcript
    text; it is nullable so existing rows need no backfill at migration time.
    """
    hash_column = sa.Column("content_hash", sa.String(64), nullable=True)
    op.add_column("source_videos", hash_column)

    # Non-unique index: supports the content-hash tier of duplicate
    # detection at ingest without forbidding repeated values.
    op.create_index(
        "ix_source_videos_content_hash", "source_videos", ["content_hash"]
    )


def downgrade() -> None:
    """Remove the ``content_hash`` index and column from ``source_videos``.

    Reverses :func:`upgrade` in dependency order: drop the index first,
    then the column it covers.
    """
    # Pass table_name explicitly: some backends (e.g. MySQL) require the
    # owning table to drop an index, and Alembic cannot infer it here.
    op.drop_index("ix_source_videos_content_hash", table_name="source_videos")
    op.drop_column("source_videos", "content_hash")