# chrysopedia/backend/pipeline/caption_generator.py

r"""ASS (Advanced SubStation Alpha) caption generator for shorts.

Converts word-level timings from Whisper transcripts into ASS subtitle
files with word-by-word karaoke highlighting. Each word gets its own
Dialogue line with {\k} tags that control highlight duration.

Pure functions — no DB access, no Celery dependency.
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

# ── Default style configuration ──────────────────────────────────────────────

# Baseline values for the single "Default" ASS style; callers may override
# any key via ``style_config``. Colours use the ASS ``&HAABBGGRR`` layout
# (alpha first, then blue/green/red; alpha 00 is fully opaque).
DEFAULT_STYLE: dict[str, Any] = {
    "font_name": "Arial",
    "font_size": 48,
    "primary_colour": "&H00FFFFFF",   # opaque white
    "secondary_colour": "&H0000FFFF",  # opaque yellow (karaoke highlight colour)
    "outline_colour": "&H00000000",   # opaque black outline
    "back_colour": "&H80000000",      # half-transparent black shadow
    "bold": -1,                       # -1 turns bold on in ASS
    "outline": 3,
    "shadow": 1,
    "alignment": 2,                   # bottom-center (numpad layout)
    "margin_v": 60,                   # 60 script px from the bottom (~3% of the 1920 PlayResY)
}


def _format_ass_time(seconds: float) -> str:
    """Convert seconds to ASS timestamp format: H:MM:SS.cc (centiseconds).

    >>> _format_ass_time(65.5)
    '0:01:05.50'
    >>> _format_ass_time(0.0)
    '0:00:00.00'
    """
    if seconds < 0:
        seconds = 0.0
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = seconds % 60
    return f"{h}:{m:02d}:{s:05.2f}"


def _build_ass_header(style_config: dict[str, Any] | None = None) -> str:
    """Build the ASS file header: script info, one style, and the events preamble.

    Parameters
    ----------
    style_config : dict | None
        Style overrides merged on top of ``DEFAULT_STYLE``. ``None`` or an
        empty dict means "use the defaults unchanged".

    Returns
    -------
    str — Header text ending with the ``[Events]`` ``Format:`` row and a
    trailing newline, ready to have ``Dialogue:`` lines appended.
    """
    cfg = {**DEFAULT_STYLE, **(style_config or {})}

    style_format = (
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, "
        "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
        "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
        "Alignment, MarginL, MarginR, MarginV, Encoding"
    )
    # Field order must match ``style_format`` above. Values that are not
    # configurable (italic/underline/strikeout off, 100% scale, border
    # style 1, 20px side margins, encoding 1) are written as literals.
    style_row = (
        f"Style: Default,{cfg['font_name']},{cfg['font_size']},"
        f"{cfg['primary_colour']},{cfg['secondary_colour']},"
        f"{cfg['outline_colour']},{cfg['back_colour']},"
        f"{cfg['bold']},0,0,0,"
        f"100,100,0,0,1,{cfg['outline']},{cfg['shadow']},"
        f"{cfg['alignment']},20,20,{cfg['margin_v']},1"
    )

    header_lines = [
        "[Script Info]",
        "Title: Chrysopedia Auto-Captions",
        "ScriptType: v4.00+",
        "PlayResX: 1080",
        "PlayResY: 1920",
        "WrapStyle: 0",
        "ScaledBorderAndShadow: yes",
        "",
        "[V4+ Styles]",
        style_format,
        style_row,
        "",
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
    ]
    return "\n".join(header_lines) + "\n"


def generate_ass_captions(
    word_timings: list[dict[str, Any]],
    clip_start: float,
    style_config: dict[str, Any] | None = None,
) -> str:
    """Generate ASS subtitle content from word-level timings.

    Each word is emitted as a separate Dialogue line with karaoke timing
    (``{\\k<centiseconds>}``) so that words highlight one-by-one.

    All word timestamps are offset by ``-clip_start`` to make them
    clip-relative (i.e. the first frame of the clip is t=0). Times that
    fall before the clip start are clamped to 0.

    Parameters
    ----------
    word_timings : list[dict]
        Word-timing dicts with ``word``, ``start``, ``end`` keys.
        ``start`` and ``end`` are absolute times in seconds. A missing or
        ``None`` ``start`` is treated as 0.0; a missing or ``None`` ``end``
        falls back to ``start``. Entries whose ``word`` is missing, ``None``
        or blank are skipped.
    clip_start : float
        Absolute start time of the clip in seconds. Subtracted from
        all word timestamps.
    style_config : dict | None
        Override style parameters (merged onto DEFAULT_STYLE).

    Returns
    -------
    str — Full ASS file content. Empty dialogue section if no timings.
    """
    header = _build_ass_header(style_config)

    if not word_timings:
        logger.debug("No word timings provided — returning header-only ASS")
        return header

    lines: list[str] = [header]

    for w in word_timings:
        # ``or ""`` also covers an explicit None value. Joining on
        # splitlines() replaces embedded line breaks with spaces, because
        # each Dialogue event must stay on a single physical line.
        word_text = " ".join((w.get("word") or "").splitlines()).strip()
        if not word_text:
            continue

        # Treat an explicit None the same as a missing key.
        raw_start = w.get("start")
        abs_start = 0.0 if raw_start is None else float(raw_start)
        raw_end = w.get("end")
        abs_end = abs_start if raw_end is None else float(raw_end)

        # Make clip-relative; never let the end precede the start.
        rel_start = max(0.0, abs_start - clip_start)
        rel_end = max(rel_start, abs_end - clip_start)

        # Karaoke duration in centiseconds (at least 1 so the tag is never 0)
        k_duration = max(1, round((rel_end - rel_start) * 100))

        start_ts = _format_ass_time(rel_start)
        end_ts = _format_ass_time(rel_end)

        # Dialogue line with karaoke tag
        line = (
            f"Dialogue: 0,{start_ts},{end_ts},Default,,0,0,0,,"
            f"{{\\k{k_duration}}}{word_text}"
        )
        lines.append(line)

    return "\n".join(lines) + "\n"


def write_ass_file(ass_content: str, output_path: Path) -> Path:
    """Write ASS content to disk as UTF-8.

    Creates parent directories if needed. Returns the output path
    (coerced to a ``Path``).
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(ass_content, encoding="utf-8")
    # Report the on-disk size: len(ass_content) counts characters, which
    # understates the byte count whenever the captions contain non-ASCII text.
    logger.debug(
        "Wrote ASS captions to %s (%d bytes)",
        output_path,
        output_path.stat().st_size,
    )
    return output_path