chrysopedia/whisper/batch_transcribe.py
jlightner 4b0914b12b fix: restore complete project tree from ub01 canonical state
Auto-mode commit 7aa33cd accidentally deleted 78 files (14,814 lines) during M005
execution. Subsequent commits rebuilt some frontend files but backend/, alembic/,
tests/, whisper/, docker configs, and prompts were never restored in this repo.

This commit restores the full project tree by syncing from ub01's working directory,
which has all M001-M007 features running in production containers.

Restored: backend/ (config, models, routers, database, redis, search_service, worker),
alembic/ (6 migrations), docker/ (Dockerfiles, nginx, compose), prompts/ (4 stages),
tests/, whisper/, README.md, .env.example, chrysopedia-spec.md
2026-03-31 02:10:41 +00:00

319 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Chrysopedia — Batch Transcription Runner
Recursively iterates all creator subdirectories under a content root and runs
transcribe.py against each leaf directory containing videos. Outputs are
written to a parallel directory structure under --output-dir that mirrors
the source hierarchy:
output-dir/
├── au5/
│ ├── Patreon/
│ │ ├── video1.mp4.json
│ │ └── video2.mp4.json
│ └── Youtube/
│ └── ...
├── Skope/
│ └── ...
The --creator flag passed to transcribe.py is always the top-level folder
name (the artist), regardless of subdirectory nesting depth.
Resumable: transcribe.py already skips videos whose output JSON exists.
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger("chrysopedia.batch_transcribe")
# Video container formats the batch runner will hand to transcribe.py.
SUPPORTED_EXTENSIONS = {".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv"}


def find_video_dirs(content_root: Path) -> list[dict]:
    """
    Recursively find all directories under *content_root* containing video files.

    The creator name is the top-level subdirectory under content_root.
    Bug fix: if videos sit directly inside content_root itself, the relative
    path has no parts and the original ``rel.parts[0]`` raised IndexError;
    we now fall back to the root directory's own name as the creator.

    Returns:
        A list of dicts sorted by (creator, relative path), each containing:
        ``folder`` (Path) — absolute directory holding the videos,
        ``creator`` (str) — artist name for the --creator flag,
        ``rel_path`` (Path) — path relative to content_root,
        ``video_count`` (int) — number of matching video files.
    """
    results: list[dict] = []
    for dirpath, _dirnames, filenames in os.walk(content_root):
        folder = Path(dirpath)
        # Extension match is case-insensitive (".MP4" counts).
        videos = [
            f for f in filenames
            if Path(f).suffix.lower() in SUPPORTED_EXTENSIONS
        ]
        if not videos:
            continue
        rel = folder.relative_to(content_root)
        # Creator is always the top-level subdirectory; when videos live
        # directly in content_root (rel == "."), use the root's own name
        # instead of crashing on an empty parts tuple.
        creator = rel.parts[0] if rel.parts else content_root.name
        results.append({
            "folder": folder,
            "creator": creator,
            "rel_path": rel,
            "video_count": len(videos),
        })
    # Deterministic order regardless of os.walk traversal order.
    return sorted(results, key=lambda x: (x["creator"], str(x["rel_path"])))
def count_existing_transcripts(output_dir: Path) -> int:
    """Return how many transcript JSON files already exist in *output_dir*.

    A missing directory means nothing has been transcribed there yet, so it
    counts as zero rather than raising.
    """
    if not output_dir.exists():
        return 0
    json_entries = [entry for entry in output_dir.iterdir() if entry.suffix == ".json"]
    return len(json_entries)
def main() -> int:
    """Parse CLI args, plan the batch, and run transcribe.py per directory.

    Returns a process exit code: 0 on success or dry run, 1 when validation
    fails or at least one directory failed / timed out.
    """
    parser = argparse.ArgumentParser(
        description="Batch transcription runner for Chrysopedia"
    )
    parser.add_argument(
        "--content-root",
        required=True,
        help="Root directory containing creator subdirectories with videos",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Root output directory for transcript JSONs",
    )
    parser.add_argument(
        "--script",
        default=None,
        help="Path to transcribe.py (default: same directory as this script)",
    )
    parser.add_argument(
        "--python",
        default=sys.executable,
        help="Python interpreter to use (default: current interpreter)",
    )
    parser.add_argument(
        "--model",
        default="large-v3",
        help="Whisper model name (default: large-v3)",
    )
    parser.add_argument(
        "--device",
        default="cuda",
        help="Compute device (default: cuda)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="List what would be transcribed without running",
    )
    args = parser.parse_args()
    content_root = Path(args.content_root)
    output_root = Path(args.output_dir)
    # transcribe.py is expected to live next to this script unless overridden.
    script_path = Path(args.script) if args.script else Path(__file__).parent / "transcribe.py"
    # Fail fast on bad paths before any work is planned.
    if not content_root.is_dir():
        logger.error("Content root does not exist: %s", content_root)
        return 1
    if not script_path.is_file():
        logger.error("transcribe.py not found at: %s", script_path)
        return 1
    # Discover all directories with videos (recursive)
    video_dirs = find_video_dirs(content_root)
    if not video_dirs:
        logger.warning("No video files found anywhere under %s", content_root)
        return 0
    # Build work plan with existing transcript counts so already-finished
    # directories can be skipped entirely (the run is resumable).
    total_videos = 0
    total_existing = 0
    work_plan = []
    for item in video_dirs:
        # Output mirrors the relative source structure
        out_dir = output_root / item["rel_path"]
        n_existing = count_existing_transcripts(out_dir)
        n_remaining = item["video_count"] - n_existing
        total_videos += item["video_count"]
        total_existing += n_existing
        work_plan.append({
            **item,
            "output": out_dir,
            "existing": n_existing,
            "remaining": n_remaining,
        })
    # Aggregate by creator for summary
    creator_stats = {}
    for item in work_plan:
        c = item["creator"]
        if c not in creator_stats:
            creator_stats[c] = {"videos": 0, "existing": 0, "remaining": 0, "dirs": 0}
        creator_stats[c]["videos"] += item["video_count"]
        creator_stats[c]["existing"] += item["existing"]
        creator_stats[c]["remaining"] += item["remaining"]
        creator_stats[c]["dirs"] += 1
    # ---- Plan summary banner ----
    logger.info("=" * 70)
    logger.info("BATCH TRANSCRIPTION PLAN")
    logger.info("=" * 70)
    logger.info("Content root: %s", content_root)
    logger.info("Output root: %s", output_root)
    logger.info("Creators: %d", len(creator_stats))
    logger.info("Directories: %d", len(work_plan))
    logger.info("Total videos: %d", total_videos)
    logger.info("Already done: %d", total_existing)
    logger.info("Remaining: %d", total_videos - total_existing)
    logger.info("=" * 70)
    for name in sorted(creator_stats.keys()):
        s = creator_stats[name]
        status = "DONE" if s["remaining"] == 0 else f"{s['remaining']} to do"
        logger.info(
            " %-35s %4d videos (%d dirs), %4d done [%s]",
            name, s["videos"], s["dirs"], s["existing"], status,
        )
    logger.info("=" * 70)
    if args.dry_run:
        logger.info("DRY RUN — exiting without transcribing.")
        return 0
    # Execute: one transcribe.py subprocess per directory, recorded in a
    # manifest that is written out once at the end of the batch.
    manifest = {
        "started_at": datetime.now(timezone.utc).isoformat(),
        "content_root": str(content_root),
        "output_root": str(output_root),
        "model": args.model,
        "device": args.device,
        "results": [],
    }
    total_processed = 0
    total_skipped = 0
    total_failed_dirs = 0
    batch_start = time.time()
    for i, item in enumerate(work_plan, 1):
        # Nothing remaining — transcribe.py would skip every video anyway.
        if item["remaining"] == 0:
            logger.info("[%d/%d] SKIP %s (all %d videos already transcribed)",
                        i, len(work_plan), item["rel_path"], item["video_count"])
            total_skipped += item["video_count"]
            continue
        logger.info("=" * 70)
        logger.info("[%d/%d] TRANSCRIBING: %s (creator: %s)",
                    i, len(work_plan), item["rel_path"], item["creator"])
        logger.info(" %d videos, %d remaining",
                    item["video_count"], item["remaining"])
        logger.info("=" * 70)
        dir_start = time.time()
        # Creator flag is always the top-level folder name, per module docstring.
        cmd = [
            args.python,
            str(script_path),
            "--input", str(item["folder"]),
            "--output-dir", str(item["output"]),
            "--model", args.model,
            "--device", args.device,
            "--creator", item["creator"],
        ]
        try:
            # capture_output=False: child inherits our stdout/stderr so its
            # progress output streams live to the console.
            result = subprocess.run(
                cmd,
                capture_output=False,
                text=True,
                timeout=14400,  # 4-hour timeout per directory
            )
            dir_elapsed = time.time() - dir_start
            # Infer how many transcripts the child produced by re-counting
            # the output directory (the child itself reports nothing back).
            n_after = count_existing_transcripts(item["output"])
            n_new = n_after - item["existing"]
            manifest["results"].append({
                "creator": item["creator"],
                "rel_path": str(item["rel_path"]),
                "videos": item["video_count"],
                "new_transcripts": n_new,
                "total_transcripts": n_after,
                "exit_code": result.returncode,
                "elapsed_seconds": round(dir_elapsed, 1),
            })
            if result.returncode == 0:
                total_processed += n_new
                total_skipped += item["existing"]
                logger.info("Completed %s: %d new transcripts in %.1f s",
                            item["rel_path"], n_new, dir_elapsed)
            else:
                # Non-zero exit marks the whole directory failed; the next
                # run can resume it since finished JSONs are skipped.
                total_failed_dirs += 1
                logger.error("FAILED %s (exit code %d) after %.1f s",
                             item["rel_path"], result.returncode, dir_elapsed)
        except subprocess.TimeoutExpired:
            total_failed_dirs += 1
            logger.error("TIMEOUT: %s exceeded 4-hour limit", item["rel_path"])
            manifest["results"].append({
                "creator": item["creator"],
                "rel_path": str(item["rel_path"]),
                "videos": item["video_count"],
                "error": "timeout",
            })
        except Exception as exc:
            # Broad catch keeps a single bad directory from killing the batch;
            # the failure is logged with traceback and recorded in the manifest.
            total_failed_dirs += 1
            logger.exception("ERROR processing %s: %s", item["rel_path"], exc)
            manifest["results"].append({
                "creator": item["creator"],
                "rel_path": str(item["rel_path"]),
                "videos": item["video_count"],
                "error": str(exc),
            })
    batch_elapsed = time.time() - batch_start
    manifest["completed_at"] = datetime.now(timezone.utc).isoformat()
    manifest["elapsed_seconds"] = round(batch_elapsed, 1)
    manifest["summary"] = {
        "processed": total_processed,
        "skipped": total_skipped,
        "failed_dirs": total_failed_dirs,
    }
    # Write manifest
    manifest_path = output_root / "batch_manifest.json"
    output_root.mkdir(parents=True, exist_ok=True)
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    logger.info("=" * 70)
    logger.info("BATCH COMPLETE")
    logger.info(" Processed: %d new transcripts", total_processed)
    logger.info(" Skipped: %d (already existed)", total_skipped)
    logger.info(" Failed dirs: %d", total_failed_dirs)
    logger.info(" Elapsed: %.1f s (%.1f hours)", batch_elapsed, batch_elapsed / 3600)
    logger.info(" Manifest: %s", manifest_path)
    logger.info("=" * 70)
    # Exit non-zero if anything failed so cron/CI can detect partial batches.
    return 1 if total_failed_dirs > 0 else 0
if __name__ == "__main__":
    # Raising SystemExit is equivalent to sys.exit(): the interpreter exits
    # with main()'s return value as the process exit code.
    raise SystemExit(main())