feat: Created ASS subtitle generator with karaoke word-by-word highligh…

- "backend/pipeline/caption_generator.py" - "backend/pipeline/shorts_generator.py" - "backend/pipeline/stages.py" - "backend/models.py" - "alembic/versions/027_add_captions_enabled.py" - "backend/pipeline/test_caption_generator.py" GSD-Task: S04/T01
2026-04-04 11:12:19 +00:00 · 2026-04-04 11:12:19 +00:00 · 899e57c0e1
commit 899e57c0e1
parent ddb283cc28
6 changed files with 423 additions and 2 deletions
--- a/alembic/versions/027_add_captions_enabled.py
+++ b/alembic/versions/027_add_captions_enabled.py
@ -0,0 +1,30 @@
 """Add captions_enabled boolean to generated_shorts.
 Revision ID: 027_add_captions_enabled
 Revises: 026_add_share_token
 """
 import sqlalchemy as sa
 from alembic import op
 revision = "027_add_captions_enabled"
 down_revision = "026_add_share_token"
 branch_labels = None
 depends_on = None
 def upgrade() -> None:
    """Add the ``captions_enabled`` boolean column to ``generated_shorts``.

    The column is declared NOT NULL together with a server-side default of
    ``false``, so the database supplies a value wherever none is given.
    """
    captions_flag = sa.Column(
        "captions_enabled",
        sa.Boolean(),
        nullable=False,
        server_default=sa.text("false"),
    )
    op.add_column("generated_shorts", captions_flag)
 def downgrade() -> None:
    """Undo :func:`upgrade` by dropping ``generated_shorts.captions_enabled``."""
    table_name, column_name = "generated_shorts", "captions_enabled"
    op.drop_column(table_name, column_name)
--- a/backend/models.py
+++ b/backend/models.py
@ -867,5 +867,9 @@ class GeneratedShort(Base):
        default=_now, server_default=func.now(), onupdate=_now
    )
    # True when captions were burned into the rendered short. The server
    # default is the bare SQL boolean literal (not the quoted string
    # 'false') so the model emits the same DEFAULT as migration
    # 027_add_captions_enabled.
    captions_enabled: Mapped[bool] = mapped_column(
        Boolean, default=False, server_default=text("false"),
    )
    # relationships
    highlight_candidate: Mapped[HighlightCandidate] = sa_relationship()
--- a/backend/pipeline/caption_generator.py
+++ b/backend/pipeline/caption_generator.py
@ -0,0 +1,155 @@
 r"""ASS (Advanced SubStation Alpha) caption generator for shorts.
 Converts word-level timings from Whisper transcripts into ASS subtitle
 files with word-by-word karaoke highlighting. Each word gets its own
 Dialogue line with {\k} tags that control highlight duration.
 Pure functions — no DB access, no Celery dependency.
 """
 from __future__ import annotations
 import logging
 from pathlib import Path
 from typing import Any
 logger = logging.getLogger(__name__)
 # ── Default style configuration ──────────────────────────────────────────────
 # Fallback values for the single "Default" style line emitted by
 # _build_ass_header; a caller-supplied style_config overrides individual keys.
 # Colour strings follow the ASS &HAABBGGRR layout (alpha, blue, green, red),
 # where alpha 00 is fully opaque.
 DEFAULT_STYLE: dict[str, Any] = {
    "font_name": "Arial",
    "font_size": 48,
    "primary_colour": "&H00FFFFFF",   # opaque white
    "secondary_colour": "&H0000FFFF",  # opaque yellow (karaoke highlight colour)
    "outline_colour": "&H00000000",   # opaque black outline
    "back_colour": "&H80000000",      # semi-transparent black shadow
    "bold": -1,                       # bold (ASS style fields use -1 for "on")
    "outline": 3,
    "shadow": 1,
    "alignment": 2,                   # bottom-center
    "margin_v": 60,                   # 60px from bottom (~3% of the 1920px PlayResY)
 }
 def _format_ass_time(seconds: float) -> str:
    """Convert seconds to ASS timestamp format: H:MM:SS.cc (centiseconds).
    >>> _format_ass_time(65.5)
    '0:01:05.50'
    >>> _format_ass_time(0.0)
    '0:00:00.00'
    """
    if seconds < 0:
        seconds = 0.0
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = seconds % 60
    return f"{h}:{m:02d}:{s:05.2f}"
 def _build_ass_header(style_config: dict[str, Any]) -> str:
    """Return the ASS preamble: script info, one style line, events format.

    ``style_config`` entries take precedence over ``DEFAULT_STYLE``; a falsy
    value (``None`` or an empty dict) means "no overrides". The returned text
    ends with a newline so Dialogue lines can be appended directly after it.
    """
    merged = dict(DEFAULT_STYLE)
    if style_config:
        merged.update(style_config)

    # Field order must mirror the "[V4+ Styles]" Format line below.
    style_fields = [
        "Default",
        merged["font_name"],
        merged["font_size"],
        merged["primary_colour"],
        merged["secondary_colour"],
        merged["outline_colour"],
        merged["back_colour"],
        merged["bold"],
        0, 0, 0,              # Italic, Underline, StrikeOut
        100, 100, 0, 0, 1,    # ScaleX, ScaleY, Spacing, Angle, BorderStyle
        merged["outline"],
        merged["shadow"],
        merged["alignment"],
        20, 20,               # MarginL, MarginR
        merged["margin_v"],
        1,                    # Encoding
    ]
    style_line = "Style: " + ",".join(map(format, style_fields))

    style_format_line = (
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, "
        "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
        "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
        "Alignment, MarginL, MarginR, MarginV, Encoding"
    )

    header_lines = [
        "[Script Info]",
        "Title: Chrysopedia Auto-Captions",
        "ScriptType: v4.00+",
        "PlayResX: 1080",
        "PlayResY: 1920",
        "WrapStyle: 0",
        "ScaledBorderAndShadow: yes",
        "",
        "[V4+ Styles]",
        style_format_line,
        style_line,
        "",
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
        "",  # yields the trailing newline after the events Format line
    ]
    return "\n".join(header_lines)
 def generate_ass_captions(
    word_timings: list[dict[str, Any]],
    clip_start: float,
    style_config: dict[str, Any] | None = None,
 ) -> str:
    """Render word-level timings as a complete ASS subtitle document.

    Every non-blank word becomes its own Dialogue event carrying a karaoke
    tag (``{\\k<centiseconds>}``) whose value is the word's duration, with a
    minimum of one centisecond. Timestamps are shifted by ``-clip_start`` so
    the clip's first frame is t=0; anything that would land before t=0 is
    clamped to zero, and an end time is never allowed to precede its start.

    Parameters
    ----------
    word_timings : list[dict]
        Dicts with ``word``, ``start`` and ``end`` keys; ``start``/``end``
        are absolute times in seconds. A missing ``start`` is treated as 0.0
        and a missing ``end`` as equal to ``start``.
    clip_start : float
        Absolute start of the clip in seconds, subtracted from every
        word timestamp.
    style_config : dict | None
        Style overrides merged onto DEFAULT_STYLE.

    Returns
    -------
    str — The ASS file content. With no timings, only the header is returned.
    """
    ass_header = _build_ass_header(style_config)
    if not word_timings:
        logger.debug("No word timings provided — returning header-only ASS")
        return ass_header

    chunks: list[str] = [ass_header]
    for entry in word_timings:
        token = entry.get("word", "").strip()
        if token:
            word_start = float(entry.get("start", 0.0))
            word_end = float(entry.get("end", word_start))

            # Shift onto the clip's own timeline, clamping at zero.
            shown_from = max(0.0, word_start - clip_start)
            shown_until = max(shown_from, word_end - clip_start)

            # \k takes centiseconds; never emit a zero-length highlight.
            centis = max(1, round((shown_until - shown_from) * 100))

            chunks.append(
                "Dialogue: 0,"
                f"{_format_ass_time(shown_from)},{_format_ass_time(shown_until)}"
                ",Default,,0,0,0,,"
                f"{{\\k{centis}}}{token}"
            )

    return "\n".join(chunks) + "\n"
 def write_ass_file(ass_content: str, output_path: Path) -> Path:
    """Write ASS content to disk as UTF-8.

    Creates parent directories if needed.

    Parameters
    ----------
    ass_content : str
        Full ASS document text to write.
    output_path : Path
        Destination file; anything accepted by ``Path()`` works.

    Returns
    -------
    Path — The path that was written.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be written.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(ass_content, encoding="utf-8")
    # len(ass_content) is a character count, which under-reports the size of
    # non-ASCII captions; log the real on-disk byte count instead.
    logger.debug(
        "Wrote ASS captions to %s (%d bytes)",
        output_path, output_path.stat().st_size,
    )
    return output_path
--- a/backend/pipeline/shorts_generator.py
+++ b/backend/pipeline/shorts_generator.py
@ -72,18 +72,24 @@ def extract_clip(
    start_secs: float,
    end_secs: float,
    vf_filter: str,
    ass_path: Path | str | None = None,
 ) -> None:
    """Extract a clip from a video file using ffmpeg.
    Seeks to *start_secs*, encodes until *end_secs*, and applies *vf_filter*.
    Uses ``-c:v libx264 -preset fast -crf 23`` for reasonable quality/speed.
    When *ass_path* is provided, the ASS subtitle filter is appended to the
    video filter chain so that captions are burned into the output video.
    Args:
        input_path: Source video file.
        output_path: Destination mp4 file (parent dir must exist).
        start_secs: Start time in seconds.
        end_secs: End time in seconds.
        vf_filter: ffmpeg ``-vf`` filter string.
        ass_path: Optional path to an ASS subtitle file. When provided,
            ``ass=<path>`` is appended to the filter chain.
    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
@ -97,13 +103,20 @@ def extract_clip(
            f"(duration={duration}s)"
        )
    # Build the video filter chain — ASS burn-in comes after scale/pad
    effective_vf = vf_filter
    if ass_path is not None:
        # Escape colons and backslashes in the path for ffmpeg filter syntax
        escaped = str(ass_path).replace("\\", "\\\\").replace(":", "\\:")
        effective_vf = f"{vf_filter},ass={escaped}"
    cmd = [
        "ffmpeg",
        "-y",                          # overwrite output
        "-ss", str(start_secs),        # seek before input (fast)
        "-i", str(input_path),
        "-t", str(duration),
-        "-vf", vf_filter,
+        "-vf", effective_vf,
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
--- a/backend/pipeline/stages.py
+++ b/backend/pipeline/stages.py
@ -2876,7 +2876,8 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
    Returns the highlight_candidate_id on completion.
    """
    from pipeline.shorts_generator import PRESETS, extract_clip, resolve_video_path
-    from models import FormatPreset, GeneratedShort, ShortStatus
+    from pipeline.caption_generator import generate_ass_captions, write_ass_file
    from models import FormatPreset, GeneratedShort, ShortStatus, SourceVideo
    start = time.monotonic()
    session = _get_sync_session()
@ -2954,6 +2955,56 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
            clip_start, clip_end,
        )
        # ── Generate captions from transcript (if available) ────────────
        ass_path: Path | None = None
        captions_ok = False
        try:
            transcript_data: list | None = None
            if source_video.transcript_path:
                try:
                    with open(source_video.transcript_path, "r") as fh:
                        raw = json.load(fh)
                    if isinstance(raw, dict):
                        transcript_data = raw.get("segments", raw.get("results", []))
                    elif isinstance(raw, list):
                        transcript_data = raw
                except (FileNotFoundError, json.JSONDecodeError, OSError) as io_exc:
                    logger.warning(
                        "Failed to load transcript for captions highlight=%s: %s",
                        highlight_candidate_id, io_exc,
                    )
            if transcript_data:
                from pipeline.highlight_scorer import extract_word_timings
                word_timings = extract_word_timings(transcript_data, clip_start, clip_end)
                if word_timings:
                    ass_content = generate_ass_captions(word_timings, clip_start)
                    ass_path = write_ass_file(
                        ass_content,
                        Path(f"/tmp/captions_{highlight_candidate_id}.ass"),
                    )
                    captions_ok = True
                    logger.info(
                        "Generated captions for highlight=%s (%d words)",
                        highlight_candidate_id, len(word_timings),
                    )
                else:
                    logger.warning(
                        "No word timings in transcript window [%.1f–%.1f]s for highlight=%s — proceeding without captions",
                        clip_start, clip_end, highlight_candidate_id,
                    )
            else:
                logger.info(
                    "No transcript available for highlight=%s — proceeding without captions",
                    highlight_candidate_id,
                )
        except Exception as cap_exc:
            logger.warning(
                "Caption generation failed for highlight=%s: %s — proceeding without captions",
                highlight_candidate_id, cap_exc,
            )
        # ── Process each preset independently ───────────────────────────
        for preset in FormatPreset:
            spec = PRESETS[preset]
@ -2983,6 +3034,7 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
                    start_secs=clip_start,
                    end_secs=clip_end,
                    vf_filter=spec.vf_filter,
                    ass_path=ass_path,
                )
                # Upload to MinIO
@ -3000,6 +3052,7 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
                short.status = ShortStatus.complete
                short.file_size_bytes = file_size
                short.minio_object_key = minio_key
                short.captions_enabled = captions_ok
                short.share_token = secrets.token_urlsafe(8)
                session.commit()
@ -3035,6 +3088,13 @@ def stage_generate_shorts(self, highlight_candidate_id: str) -> str:
                    except OSError:
                        pass
        # Clean up temp ASS caption file
        if ass_path is not None and ass_path.exists():
            try:
                ass_path.unlink()
            except OSError:
                pass
        elapsed = time.monotonic() - start
        logger.info(
            "Shorts generation complete for highlight=%s in %.1fs",
--- a/backend/pipeline/test_caption_generator.py
+++ b/backend/pipeline/test_caption_generator.py
@ -0,0 +1,159 @@
 """Unit tests for caption_generator module."""
 from __future__ import annotations
 import re
 import tempfile
 from pathlib import Path
 import pytest
 from pipeline.caption_generator import (
    DEFAULT_STYLE,
    _format_ass_time,
    generate_ass_captions,
    write_ass_file,
 )
 # ── Fixtures ─────────────────────────────────────────────────────────────────
@pytest.fixture
 def sample_word_timings() -> list[dict]:
    """Realistic word timings as produced by extract_word_timings."""
    return [
        {"word": "This", "start": 10.0, "end": 10.3},
        {"word": "is", "start": 10.3, "end": 10.5},
        {"word": "a", "start": 10.5, "end": 10.6},
        {"word": "test", "start": 10.6, "end": 11.0},
        {"word": "sentence", "start": 11.1, "end": 11.6},
    ]
 # ── Time formatting ─────────────────────────────────────────────────────────
 class TestFormatAssTime:
    """Checks the ``H:MM:SS.cc`` text produced by ``_format_ass_time``."""

    def test_zero(self):
        rendered = _format_ass_time(0.0)
        assert rendered == "0:00:00.00"

    def test_sub_second(self):
        rendered = _format_ass_time(0.5)
        assert rendered == "0:00:00.50"

    def test_minutes(self):
        # 65.5s → one minute, five and a half seconds
        rendered = _format_ass_time(65.5)
        assert rendered == "0:01:05.50"

    def test_hours(self):
        # 3661.25s → one hour, one minute, 1.25 seconds
        rendered = _format_ass_time(3661.25)
        assert rendered == "1:01:01.25"

    def test_negative_clamps_to_zero(self):
        rendered = _format_ass_time(-5.0)
        assert rendered == "0:00:00.00"
 # ── ASS generation ──────────────────────────────────────────────────────────
 class TestGenerateAssCaptions:
    """Behavioural checks for ``generate_ass_captions``."""

    @staticmethod
    def _dialogue_lines(ass_text: str) -> list[str]:
        """Return only the ``Dialogue:`` event lines of an ASS document."""
        return [
            row
            for row in ass_text.strip().split("\n")
            if row.startswith("Dialogue:")
        ]

    def test_empty_timings_returns_header_only(self):
        ass_text = generate_ass_captions([], clip_start=0.0)
        for section in ("[Script Info]", "[Events]"):
            assert section in ass_text
        # Header only — no events were emitted
        assert "Dialogue:" not in ass_text

    def test_structure_has_required_sections(self, sample_word_timings):
        ass_text = generate_ass_captions(sample_word_timings, clip_start=10.0)
        for marker in ("[Script Info]", "[V4+ Styles]", "[Events]", "Dialogue:"):
            assert marker in ass_text

    def test_clip_offset_applied(self, sample_word_timings):
        """Word at t=10.5 with clip_start=10.0 should become t=0.5 in ASS."""
        ass_text = generate_ass_captions(sample_word_timings, clip_start=10.0)
        events = self._dialogue_lines(ass_text)
        # "This" starts at 10.0 and clip_start is 10.0 → relative 0.0
        assert events[0].startswith("Dialogue: 0,0:00:00.00,")
        # "a" (third word) starts at 10.5 → relative 0.5
        assert "0:00:00.50" in events[2]

    def test_karaoke_tags_present(self, sample_word_timings):
        ass_text = generate_ass_captions(sample_word_timings, clip_start=10.0)
        for line in self._dialogue_lines(ass_text):
            # Every event must carry a \kN tag
            assert re.search(r"\{\\k\d+\}", line), f"Missing karaoke tag in: {line}"

    def test_karaoke_duration_math(self, sample_word_timings):
        """Word "This" at [10.0, 10.3] → 0.3s → k30 (30 centiseconds)."""
        ass_text = generate_ass_captions(sample_word_timings, clip_start=10.0)
        events = self._dialogue_lines(ass_text)
        # "This": 10.3 - 10.0 = 0.3s = 30cs
        assert "{\\k30}This" in events[0]
        # "test": 11.0 - 10.6 = 0.4s = 40cs
        assert "{\\k40}test" in events[3]

    def test_word_count_matches(self, sample_word_timings):
        ass_text = generate_ass_captions(sample_word_timings, clip_start=10.0)
        assert len(self._dialogue_lines(ass_text)) == 5

    def test_empty_word_text_skipped(self):
        timings = [
            {"word": "hello", "start": 0.0, "end": 0.5},
            {"word": "  ", "start": 0.5, "end": 0.7},  # whitespace-only
            {"word": "", "start": 0.7, "end": 0.8},     # empty
            {"word": "world", "start": 0.8, "end": 1.2},
        ]
        ass_text = generate_ass_captions(timings, clip_start=0.0)
        # Only "hello" and "world" survive
        assert len(self._dialogue_lines(ass_text)) == 2

    def test_custom_style_overrides(self, sample_word_timings):
        overrides = {"font_size": 72, "font_name": "Roboto"}
        ass_text = generate_ass_captions(
            sample_word_timings,
            clip_start=10.0,
            style_config=overrides,
        )
        assert "Roboto" in ass_text
        assert ",72," in ass_text

    def test_negative_relative_time_clamped(self):
        """Words before clip_start should clamp to 0."""
        timings = [{"word": "early", "start": 5.0, "end": 5.5}]
        ass_text = generate_ass_captions(timings, clip_start=10.0)
        events = self._dialogue_lines(ass_text)
        # Start and end both clamp to zero
        assert events[0].startswith("Dialogue: 0,0:00:00.00,0:00:00.00,")
 # ── File writing ─────────────────────────────────────────────────────────────
 class TestWriteAssFile:
    """Checks that ``write_ass_file`` persists content and returns its path."""

    def test_writes_content(self):
        # Non-ASCII characters make the round trip depend on the encoding;
        # read back as UTF-8 explicitly (the writer's encoding) rather than
        # relying on the platform's locale default.
        content = "[Script Info]\ntest content — café\n"
        with tempfile.TemporaryDirectory() as td:
            out = write_ass_file(content, Path(td) / "sub.ass")
            assert out.exists()
            assert out.read_text(encoding="utf-8") == content

    def test_creates_parent_dirs(self):
        content = "test"
        with tempfile.TemporaryDirectory() as td:
            # Neither "nested" nor "deep" exists beforehand
            out = write_ass_file(content, Path(td) / "nested" / "deep" / "sub.ass")
            assert out.exists()

    def test_returns_path(self):
        content = "test"
        with tempfile.TemporaryDirectory() as td:
            target = Path(td) / "sub.ass"
            result = write_ass_file(content, target)
            assert result == target