- "backend/pipeline/caption_generator.py" - "backend/pipeline/shorts_generator.py" - "backend/pipeline/stages.py" - "backend/models.py" - "alembic/versions/027_add_captions_enabled.py" - "backend/pipeline/test_caption_generator.py" GSD-Task: S04/T01
155 lines
5.1 KiB
Python
155 lines
5.1 KiB
Python
r"""ASS (Advanced SubStation Alpha) caption generator for shorts.
|
|
|
|
Converts word-level timings from Whisper transcripts into ASS subtitle
|
|
files with word-by-word karaoke highlighting. Each word gets its own
|
|
Dialogue line with {\k} tags that control highlight duration.
|
|
|
|
Pure functions — no DB access, no Celery dependency.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Default style configuration ──────────────────────────────────────────────
|
|
|
|
# Baseline values for the single "Default" ASS style written into the header.
# Any key can be overridden per call through ``style_config``.
DEFAULT_STYLE: dict[str, Any] = {
    "font_name": "Arial",
    "font_size": 48,
    "primary_colour": "&H00FFFFFF",  # white (BGR + alpha)
    "secondary_colour": "&H0000FFFF",  # yellow highlight
    "outline_colour": "&H00000000",  # black outline
    "back_colour": "&H80000000",  # semi-transparent black shadow
    "bold": -1,  # bold
    "outline": 3,
    "shadow": 1,
    "alignment": 2,  # bottom-center
    "margin_v": 60,  # 60px from bottom (~3% of the 1920px PlayResY)
}
|
|
|
|
|
|
def _format_ass_time(seconds: float) -> str:
|
|
"""Convert seconds to ASS timestamp format: H:MM:SS.cc (centiseconds).
|
|
|
|
>>> _format_ass_time(65.5)
|
|
'0:01:05.50'
|
|
>>> _format_ass_time(0.0)
|
|
'0:00:00.00'
|
|
"""
|
|
if seconds < 0:
|
|
seconds = 0.0
|
|
h = int(seconds // 3600)
|
|
m = int((seconds % 3600) // 60)
|
|
s = seconds % 60
|
|
return f"{h}:{m:02d}:{s:05.2f}"
|
|
|
|
|
|
def _build_ass_header(style_config: dict[str, Any]) -> str:
    """Return the ASS preamble: script info, the style table, and the events header.

    Entries in ``style_config`` take precedence over ``DEFAULT_STYLE``; a
    falsy ``style_config`` (``None`` or empty) leaves the defaults as-is.
    The result ends with the ``[Events]`` ``Format:`` line plus a newline,
    so ``Dialogue:`` lines can be appended directly after it.
    """
    style = dict(DEFAULT_STYLE)
    if style_config:
        style.update(style_config)

    # Fixed 1080x1920 (portrait) script resolution.
    script_info = (
        "[Script Info]\n"
        "Title: Chrysopedia Auto-Captions\n"
        "ScriptType: v4.00+\n"
        "PlayResX: 1080\n"
        "PlayResY: 1920\n"
        "WrapStyle: 0\n"
        "ScaledBorderAndShadow: yes\n"
    )

    style_format = (
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, "
        "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
        "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
        "Alignment, MarginL, MarginR, MarginV, Encoding\n"
    )

    # Field order must match ``style_format`` above; only the configurable
    # fields are interpolated, the rest are fixed literals.
    style_row = (
        f"Style: Default,{style['font_name']},{style['font_size']},"
        f"{style['primary_colour']},{style['secondary_colour']},"
        f"{style['outline_colour']},{style['back_colour']},"
        f"{style['bold']},0,0,0,"
        f"100,100,0,0,1,{style['outline']},{style['shadow']},"
        f"{style['alignment']},20,20,{style['margin_v']},1\n"
    )

    events_section = (
        "[Events]\n"
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    )

    styles_section = "[V4+ Styles]\n" + style_format + style_row
    return script_info + "\n" + styles_section + "\n" + events_section
|
|
|
|
|
|
def generate_ass_captions(
    word_timings: list[dict[str, Any]],
    clip_start: float,
    style_config: dict[str, Any] | None = None,
) -> str:
    """Build complete ASS subtitle text from word-level timings.

    Every non-blank word becomes its own ``Dialogue`` event carrying a
    karaoke tag (``{\\k<centiseconds>}``), so words light up one at a time.
    Timestamps are shifted by ``-clip_start`` so the clip's first frame
    is t=0; shifted times that would be negative are clamped to zero.

    Parameters
    ----------
    word_timings : list[dict]
        Dicts with ``word``, ``start`` and ``end`` keys; ``start``/``end``
        are absolute times in seconds.  A missing ``start`` is treated as
        0.0 and a missing ``end`` as equal to ``start``.
    clip_start : float
        Absolute start time of the clip in seconds, subtracted from every
        word timestamp.
    style_config : dict | None
        Style overrides merged onto ``DEFAULT_STYLE``.

    Returns
    -------
    str — Full ASS file content.  When ``word_timings`` is empty only the
    header is returned.
    """
    ass_header = _build_ass_header(style_config)

    if not word_timings:
        logger.debug("No word timings provided — returning header-only ASS")
        return ass_header

    def _dialogue_events():
        # Yield one Dialogue line per usable word, in input order.
        for entry in word_timings:
            text = entry.get("word", "").strip()
            if text:
                word_begin = float(entry.get("start", 0.0))
                word_finish = float(entry.get("end", word_begin))

                # Shift to clip-relative time; never before t=0 and never
                # ending before the word starts.
                shown_from = max(0.0, word_begin - clip_start)
                shown_until = max(shown_from, word_finish - clip_start)

                # Karaoke tag duration is in centiseconds, minimum of 1.
                centis = max(1, round((shown_until - shown_from) * 100))

                timing = f"{_format_ass_time(shown_from)},{_format_ass_time(shown_until)}"
                yield f"Dialogue: 0,{timing},Default,,0,0,0,,{{\\k{centis}}}{text}"

    return "\n".join([ass_header, *_dialogue_events()]) + "\n"
|
|
|
|
|
|
def write_ass_file(ass_content: str, output_path: Path) -> Path:
    """Write ASS content to disk as UTF-8.

    Creates parent directories if needed and overwrites any existing file.

    Parameters
    ----------
    ass_content : str
        Full ASS file text to write.
    output_path : Path
        Destination file; anything accepted by ``Path()`` works.

    Returns
    -------
    Path — the path that was written.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(ass_content, encoding="utf-8")

    if logger.isEnabledFor(logging.DEBUG):
        # len(ass_content) would count characters, not bytes, and so be
        # wrong for non-ASCII captions; report the real on-disk size.
        logger.debug(
            "Wrote ASS captions to %s (%d bytes)",
            output_path,
            output_path.stat().st_size,
        )
    return output_path
|