From 899e57c0e165c4733ae5511c48229e27b16f9902 Mon Sep 17 00:00:00 2001
From: jlightner
Date: Sat, 4 Apr 2026 11:12:19 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20Created=20ASS=20subtitle=20generator=20?=
 =?UTF-8?q?with=20karaoke=20word-by-word=20highligh=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- "backend/pipeline/caption_generator.py"
- "backend/pipeline/shorts_generator.py"
- "backend/pipeline/stages.py"
- "backend/models.py"
- "alembic/versions/027_add_captions_enabled.py"
- "backend/pipeline/test_caption_generator.py"

GSD-Task: S04/T01
---
 alembic/versions/027_add_captions_enabled.py |  30 ++++
 backend/models.py                            |   4 +
 backend/pipeline/caption_generator.py        | 155 ++++++++++++++++++
 backend/pipeline/shorts_generator.py         |  15 +-
 backend/pipeline/stages.py                   |  62 +++++++-
 backend/pipeline/test_caption_generator.py   | 162 +++++++++++++++++++
 6 files changed, 426 insertions(+), 2 deletions(-)
 create mode 100644 alembic/versions/027_add_captions_enabled.py
 create mode 100644 backend/pipeline/caption_generator.py
 create mode 100644 backend/pipeline/test_caption_generator.py

diff --git a/alembic/versions/027_add_captions_enabled.py b/alembic/versions/027_add_captions_enabled.py
new file mode 100644
index 0000000..6d71df5
--- /dev/null
+++ b/alembic/versions/027_add_captions_enabled.py
@@ -0,0 +1,30 @@
+"""Add captions_enabled boolean to generated_shorts.
+
+Revision ID: 027_add_captions_enabled
+Revises: 026_add_share_token
+"""
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision = "027_add_captions_enabled"
+down_revision = "026_add_share_token"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "generated_shorts",
+        sa.Column(
+            "captions_enabled",
+            sa.Boolean(),
+            nullable=False,
+            server_default=sa.text("false"),
+        ),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("generated_shorts", "captions_enabled")
diff --git a/backend/models.py b/backend/models.py
index 11e03fa..0d9d72c 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -867,5 +867,9 @@ class GeneratedShort(Base):
         default=_now, server_default=func.now(), onupdate=_now
     )
 
+    captions_enabled: Mapped[bool] = mapped_column(
+        Boolean, default=False, server_default=text("false"),
+    )
+
     # relationships
     highlight_candidate: Mapped[HighlightCandidate] = sa_relationship()
diff --git a/backend/pipeline/caption_generator.py b/backend/pipeline/caption_generator.py
new file mode 100644
index 0000000..2ff3b15
--- /dev/null
+++ b/backend/pipeline/caption_generator.py
@@ -0,0 +1,155 @@
+r"""ASS (Advanced SubStation Alpha) caption generator for shorts.
+
+Converts word-level timings from Whisper transcripts into ASS subtitle
+files with word-by-word karaoke highlighting. Each word gets its own
+Dialogue line with {\k} tags that control highlight duration.
+
+Pure functions — no DB access, no Celery dependency.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# ── Default style configuration ──────────────────────────────────────────────
+
+DEFAULT_STYLE: dict[str, Any] = {
+    "font_name": "Arial",
+    "font_size": 48,
+    "primary_colour": "&H00FFFFFF",  # white (BGR + alpha)
+    "secondary_colour": "&H0000FFFF",  # yellow highlight
+    "outline_colour": "&H00000000",  # black outline
+    "back_colour": "&H80000000",  # semi-transparent black shadow
+    "bold": -1,  # bold
+    "outline": 3,
+    "shadow": 1,
+    "alignment": 2,  # bottom-center
+    "margin_v": 60,  # 60px from bottom (~15% on 1920h)
+}
+
+
+def _format_ass_time(seconds: float) -> str:
+    """Convert seconds to ASS timestamp format: H:MM:SS.cc (centiseconds).
+
+    >>> _format_ass_time(65.5)
+    '0:01:05.50'
+    >>> _format_ass_time(0.0)
+    '0:00:00.00'
+    """
+    # Round to whole centiseconds up front so a carry propagates (59.999 -> 0:01:00.00).
+    total_cs = max(0, round(seconds * 100))
+    whole_minutes, rem_cs = divmod(total_cs, 6000)
+    h, m = divmod(whole_minutes, 60)
+    s, cs = divmod(rem_cs, 100)
+    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
+
+
+def _build_ass_header(style_config: dict[str, Any]) -> str:
+    """Build ASS file header with script info and style definition."""
+    cfg = {**DEFAULT_STYLE, **(style_config or {})}
+
+    header = (
+        "[Script Info]\n"
+        "Title: Chrysopedia Auto-Captions\n"
+        "ScriptType: v4.00+\n"
+        "PlayResX: 1080\n"
+        "PlayResY: 1920\n"
+        "WrapStyle: 0\n"
+        "ScaledBorderAndShadow: yes\n"
+        "\n"
+        "[V4+ Styles]\n"
+        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, "
+        "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
+        "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
+        "Alignment, MarginL, MarginR, MarginV, Encoding\n"
+        f"Style: Default,{cfg['font_name']},{cfg['font_size']},"
+        f"{cfg['primary_colour']},{cfg['secondary_colour']},"
+        f"{cfg['outline_colour']},{cfg['back_colour']},"
+        f"{cfg['bold']},0,0,0,"
+        f"100,100,0,0,1,{cfg['outline']},{cfg['shadow']},"
+        f"{cfg['alignment']},20,20,{cfg['margin_v']},1\n"
+        "\n"
+        "[Events]\n"
+        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
+    )
+    return header
+
+
+def generate_ass_captions(
+    word_timings: list[dict[str, Any]],
+    clip_start: float,
+    style_config: dict[str, Any] | None = None,
+) -> str:
+    """Generate ASS subtitle content from word-level timings.
+
+    Each word is emitted as a separate Dialogue line with karaoke timing
+    (``{\\k}``) so that words highlight one-by-one.
+
+    All word timestamps are offset by ``-clip_start`` to make them
+    clip-relative (i.e. the first frame of the clip is t=0).
+
+    Parameters
+    ----------
+    word_timings : list[dict]
+        Word-timing dicts with ``word``, ``start``, ``end`` keys.
+        ``start`` and ``end`` are absolute times in seconds.
+    clip_start : float
+        Absolute start time of the clip in seconds. Subtracted from
+        all word timestamps.
+    style_config : dict | None
+        Override style parameters (merged onto DEFAULT_STYLE).
+
+    Returns
+    -------
+    str — Full ASS file content. Empty dialogue section if no timings.
+    """
+    header = _build_ass_header(style_config)
+
+    if not word_timings:
+        logger.debug("No word timings provided — returning header-only ASS")
+        return header
+
+    lines: list[str] = [header]
+
+    for w in word_timings:
+        word_text = (w.get("word") or "").strip()
+        if not word_text:
+            continue
+
+        abs_start = float(w.get("start", 0.0))
+        abs_end = float(w.get("end", abs_start))
+
+        # Make clip-relative
+        rel_start = max(0.0, abs_start - clip_start)
+        rel_end = max(rel_start, abs_end - clip_start)
+
+        # Karaoke duration in centiseconds
+        k_duration = max(1, round((rel_end - rel_start) * 100))
+
+        start_ts = _format_ass_time(rel_start)
+        end_ts = _format_ass_time(rel_end)
+
+        # Dialogue line with karaoke tag
+        line = (
+            f"Dialogue: 0,{start_ts},{end_ts},Default,,0,0,0,,"
+            f"{{\\k{k_duration}}}{word_text}"
+        )
+        lines.append(line)
+
+    return "\n".join(lines) + "\n"
+
+
+def write_ass_file(ass_content: str, output_path: Path) -> Path:
+    """Write ASS content to disk.
+
+    Creates parent directories if needed. Returns the output path.
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(ass_content, encoding="utf-8")
+    logger.debug("Wrote ASS captions to %s (%d bytes)", output_path, len(ass_content))
+    return output_path
diff --git a/backend/pipeline/shorts_generator.py b/backend/pipeline/shorts_generator.py
index b5f63a3..0f7c711 100644
--- a/backend/pipeline/shorts_generator.py
+++ b/backend/pipeline/shorts_generator.py
@@ -72,18 +72,24 @@ def extract_clip(
     start_secs: float,
     end_secs: float,
     vf_filter: str,
+    ass_path: Path | str | None = None,
 ) -> None:
     """Extract a clip from a video file using ffmpeg.
 
     Seeks to *start_secs*, encodes until *end_secs*, and applies *vf_filter*.
     Uses ``-c:v libx264 -preset fast -crf 23`` for reasonable quality/speed.
 
+    When *ass_path* is provided, the ASS subtitle filter is appended to the
+    video filter chain so that captions are burned into the output video.
+
     Args:
         input_path: Source video file.
         output_path: Destination mp4 file (parent dir must exist).
         start_secs: Start time in seconds.
         end_secs: End time in seconds.
         vf_filter: ffmpeg ``-vf`` filter string.
+        ass_path: Optional path to an ASS subtitle file. When provided,
+            ``ass=`` is appended to the filter chain.
 
     Raises:
         subprocess.CalledProcessError: If ffmpeg exits non-zero.
@@ -97,13 +103,20 @@ def extract_clip(
             f"(duration={duration}s)"
         )
 
+    # Build the video filter chain — ASS burn-in comes after scale/pad
+    effective_vf = vf_filter
+    if ass_path is not None:
+        # Escape colons and backslashes in the path for ffmpeg filter syntax
+        escaped = str(ass_path).replace("\\", "\\\\").replace(":", "\\:")
+        effective_vf = f"{vf_filter},ass={escaped}"
+
     cmd = [
         "ffmpeg",
         "-y",  # overwrite output
         "-ss", str(start_secs),  # seek before input (fast)
         "-i", str(input_path),
         "-t", str(duration),
-        "-vf", vf_filter,
+        "-vf", effective_vf,
         "-c:v", "libx264",
         "-preset", "fast",
         "-crf", "23",
diff --git a/backend/pipeline/stages.py b/backend/pipeline/stages.py
index c3a4e28..03c50d2 100644
--- a/backend/pipeline/stages.py
+++ b/backend/pipeline/stages.py
@@ -2876,7 +2876,8 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
     Returns the highlight_candidate_id on completion.
     """
     from pipeline.shorts_generator import PRESETS, extract_clip, resolve_video_path
-    from models import FormatPreset, GeneratedShort, ShortStatus
+    from pipeline.caption_generator import generate_ass_captions, write_ass_file
+    from models import FormatPreset, GeneratedShort, ShortStatus, SourceVideo
 
     start = time.monotonic()
     session = _get_sync_session()
@@ -2954,6 +2955,56 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
             clip_start, clip_end,
         )
 
+        # ── Generate captions from transcript (if available) ────────────
+        ass_path: Path | None = None
+        captions_ok = False
+        try:
+            transcript_data: list | None = None
+            if source_video.transcript_path:
+                try:
+                    with open(source_video.transcript_path, "r", encoding="utf-8") as fh:
+                        raw = json.load(fh)
+                    if isinstance(raw, dict):
+                        transcript_data = raw.get("segments", raw.get("results", []))
+                    elif isinstance(raw, list):
+                        transcript_data = raw
+                except (FileNotFoundError, json.JSONDecodeError, OSError) as io_exc:
+                    logger.warning(
+                        "Failed to load transcript for captions highlight=%s: %s",
+                        highlight_candidate_id, io_exc,
+                    )
+
+            if transcript_data:
+                from pipeline.highlight_scorer import extract_word_timings
+
+                word_timings = extract_word_timings(transcript_data, clip_start, clip_end)
+                if word_timings:
+                    ass_content = generate_ass_captions(word_timings, clip_start)
+                    ass_path = write_ass_file(
+                        ass_content,
+                        Path(f"/tmp/captions_{highlight_candidate_id}.ass"),
+                    )
+                    captions_ok = True
+                    logger.info(
+                        "Generated captions for highlight=%s (%d words)",
+                        highlight_candidate_id, len(word_timings),
+                    )
+                else:
+                    logger.warning(
+                        "No word timings in transcript window [%.1f–%.1f]s for highlight=%s — proceeding without captions",
+                        clip_start, clip_end, highlight_candidate_id,
+                    )
+            else:
+                logger.info(
+                    "No transcript available for highlight=%s — proceeding without captions",
+                    highlight_candidate_id,
+                )
+        except Exception as cap_exc:
+            logger.warning(
+                "Caption generation failed for highlight=%s: %s — proceeding without captions",
+                highlight_candidate_id, cap_exc,
+            )
+
         # ── Process each preset independently ───────────────────────────
         for preset in FormatPreset:
             spec = PRESETS[preset]
@@ -2983,6 +3034,7 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
                     start_secs=clip_start,
                     end_secs=clip_end,
                     vf_filter=spec.vf_filter,
+                    ass_path=ass_path,
                 )
 
                 # Upload to MinIO
@@ -3000,6 +3052,7 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
                 short.status = ShortStatus.complete
                 short.file_size_bytes = file_size
                 short.minio_object_key = minio_key
+                short.captions_enabled = captions_ok
                 short.share_token = secrets.token_urlsafe(8)
                 session.commit()
 
@@ -3035,6 +3088,13 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
                     except OSError:
                         pass
 
+        # Clean up temp ASS caption file
+        if ass_path is not None and ass_path.exists():
+            try:
+                ass_path.unlink()
+            except OSError:
+                pass
+
         elapsed = time.monotonic() - start
         logger.info(
             "Shorts generation complete for highlight=%s in %.1fs",
diff --git a/backend/pipeline/test_caption_generator.py b/backend/pipeline/test_caption_generator.py
new file mode 100644
index 0000000..249faac
--- /dev/null
+++ b/backend/pipeline/test_caption_generator.py
@@ -0,0 +1,162 @@
+"""Unit tests for caption_generator module."""
+
+from __future__ import annotations
+
+import re
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from pipeline.caption_generator import (
+    DEFAULT_STYLE,
+    _format_ass_time,
+    generate_ass_captions,
+    write_ass_file,
+)
+
+
+# ── Fixtures ─────────────────────────────────────────────────────────────────
+
+@pytest.fixture
+def sample_word_timings() -> list[dict]:
+    """Realistic word timings as produced by extract_word_timings."""
+    return [
+        {"word": "This", "start": 10.0, "end": 10.3},
+        {"word": "is", "start": 10.3, "end": 10.5},
+        {"word": "a", "start": 10.5, "end": 10.6},
+        {"word": "test", "start": 10.6, "end": 11.0},
+        {"word": "sentence", "start": 11.1, "end": 11.6},
+    ]
+
+
+# ── Time formatting ─────────────────────────────────────────────────────────
+
+class TestFormatAssTime:
+    def test_zero(self):
+        assert _format_ass_time(0.0) == "0:00:00.00"
+
+    def test_sub_second(self):
+        assert _format_ass_time(0.5) == "0:00:00.50"
+
+    def test_minutes(self):
+        assert _format_ass_time(65.5) == "0:01:05.50"
+
+    def test_hours(self):
+        assert _format_ass_time(3661.25) == "1:01:01.25"
+
+    def test_negative_clamps_to_zero(self):
+        assert _format_ass_time(-5.0) == "0:00:00.00"
+
+    def test_rounding_carries_into_minutes(self):
+        assert _format_ass_time(59.999) == "0:01:00.00"
+
+
+# ── ASS generation ──────────────────────────────────────────────────────────
+
+class TestGenerateAssCaptions:
+    def test_empty_timings_returns_header_only(self):
+        result = generate_ass_captions([], clip_start=0.0)
+        assert "[Script Info]" in result
+        assert "[Events]" in result
+        # No Dialogue lines
+        assert "Dialogue:" not in result
+
+    def test_structure_has_required_sections(self, sample_word_timings):
+        result = generate_ass_captions(sample_word_timings, clip_start=10.0)
+        assert "[Script Info]" in result
+        assert "[V4+ Styles]" in result
+        assert "[Events]" in result
+        assert "Dialogue:" in result
+
+    def test_clip_offset_applied(self, sample_word_timings):
+        """Word at t=10.5 with clip_start=10.0 should become t=0.5 in ASS."""
+        result = generate_ass_captions(sample_word_timings, clip_start=10.0)
+        lines = result.strip().split("\n")
+        dialogue_lines = [l for l in lines if l.startswith("Dialogue:")]
+
+        # First word "This" starts at 10.0, clip_start=10.0 → relative 0.0
+        assert dialogue_lines[0].startswith("Dialogue: 0,0:00:00.00,")
+
+        # Third word "a" starts at 10.5, clip_start=10.0 → relative 0.5
+        assert "0:00:00.50" in dialogue_lines[2]
+
+    def test_karaoke_tags_present(self, sample_word_timings):
+        result = generate_ass_captions(sample_word_timings, clip_start=10.0)
+        lines = result.strip().split("\n")
+        dialogue_lines = [l for l in lines if l.startswith("Dialogue:")]
+
+        for line in dialogue_lines:
+            # Each line should have a \kN tag
+            assert re.search(r"\{\\k\d+\}", line), f"Missing karaoke tag in: {line}"
+
+    def test_karaoke_duration_math(self, sample_word_timings):
+        """Word "This" at [10.0, 10.3] → 0.3s → k30 (30 centiseconds)."""
+        result = generate_ass_captions(sample_word_timings, clip_start=10.0)
+        lines = result.strip().split("\n")
+        dialogue_lines = [l for l in lines if l.startswith("Dialogue:")]
+
+        # "This" duration: 10.3 - 10.0 = 0.3s = 30cs
+        assert "{\\k30}This" in dialogue_lines[0]
+
+        # "test" duration: 11.0 - 10.6 = 0.4s = 40cs
+        assert "{\\k40}test" in dialogue_lines[3]
+
+    def test_word_count_matches(self, sample_word_timings):
+        result = generate_ass_captions(sample_word_timings, clip_start=10.0)
+        lines = result.strip().split("\n")
+        dialogue_lines = [l for l in lines if l.startswith("Dialogue:")]
+        assert len(dialogue_lines) == 5
+
+    def test_empty_word_text_skipped(self):
+        timings = [
+            {"word": "hello", "start": 0.0, "end": 0.5},
+            {"word": "  ", "start": 0.5, "end": 0.7},  # whitespace-only
+            {"word": "", "start": 0.7, "end": 0.8},  # empty
+            {"word": "world", "start": 0.8, "end": 1.2},
+        ]
+        result = generate_ass_captions(timings, clip_start=0.0)
+        lines = result.strip().split("\n")
+        dialogue_lines = [l for l in lines if l.startswith("Dialogue:")]
+        assert len(dialogue_lines) == 2  # only "hello" and "world"
+
+    def test_custom_style_overrides(self, sample_word_timings):
+        result = generate_ass_captions(
+            sample_word_timings,
+            clip_start=10.0,
+            style_config={"font_size": 72, "font_name": "Roboto"},
+        )
+        assert "Roboto" in result
+        assert ",72," in result
+
+    def test_negative_relative_time_clamped(self):
+        """Words before clip_start should clamp to 0."""
+        timings = [{"word": "early", "start": 5.0, "end": 5.5}]
+        result = generate_ass_captions(timings, clip_start=10.0)
+        lines = [l for l in result.strip().split("\n") if l.startswith("Dialogue:")]
+        # Both start and end clamped to 0
+        assert lines[0].startswith("Dialogue: 0,0:00:00.00,0:00:00.00,")
+
+
+# ── File writing ─────────────────────────────────────────────────────────────
+
+class TestWriteAssFile:
+    def test_writes_content(self):
+        content = "[Script Info]\ntest content\n"
+        with tempfile.TemporaryDirectory() as td:
+            out = write_ass_file(content, Path(td) / "sub.ass")
+            assert out.exists()
+            assert out.read_text(encoding="utf-8") == content
+
+    def test_creates_parent_dirs(self):
+        content = "test"
+        with tempfile.TemporaryDirectory() as td:
+            out = write_ass_file(content, Path(td) / "nested" / "deep" / "sub.ass")
+            assert out.exists()
+
+    def test_returns_path(self):
+        content = "test"
+        with tempfile.TemporaryDirectory() as td:
+            target = Path(td) / "sub.ass"
+            result = write_ass_file(content, target)
+            assert result == target