#!/usr/bin/env python3
"""
Chrysopedia — Batch Transcription Runner

Recursively iterates all creator subdirectories under a content root and runs
transcribe.py against each directory containing videos. Outputs are written
to a parallel directory structure under --output-dir that mirrors the source
hierarchy:

    output-dir/
    ├── au5/
    │   ├── Patreon/
    │   │   ├── video1.mp4.json
    │   │   └── video2.mp4.json
    │   └── Youtube/
    │       └── ...
    ├── Skope/
    │   └── ...

The --creator flag passed to transcribe.py is always the top-level folder
name (the artist), regardless of subdirectory nesting depth.

Resumable: transcribe.py already skips videos whose output JSON exists.
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger("chrysopedia.batch_transcribe")

SUPPORTED_EXTENSIONS = {".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv"}

# Default per-directory limit for one transcribe.py invocation (4 hours).
DEFAULT_TIMEOUT_SECONDS = 14400


def find_video_dirs(content_root: Path) -> list[dict]:
    """
    Recursively find all directories containing video files.

    A file counts as a video when its suffix (case-insensitive) is in
    SUPPORTED_EXTENSIONS.

    Args:
        content_root: Directory whose immediate subdirectories are creators.

    Returns:
        One dict per directory that holds at least one video, with keys
        "folder" (the directory as a Path), "creator" (first path component
        below content_root), "rel_path" (folder relative to content_root)
        and "video_count". Sorted by (creator, str(rel_path)).

    Videos lying directly in content_root have no creator folder; they are
    skipped with a warning instead of aborting the scan.
    """
    results = []
    for dirpath, _dirnames, filenames in os.walk(content_root):
        folder = Path(dirpath)
        video_count = sum(
            1
            for name in filenames
            if Path(name).suffix.lower() in SUPPORTED_EXTENSIONS
        )
        if not video_count:
            continue

        rel = folder.relative_to(content_root)
        if not rel.parts:
            # rel is "." here: there is no top-level subdirectory to use as
            # the creator name, so rel.parts[0] would raise IndexError.
            logger.warning(
                "Skipping %d video(s) directly under %s: not inside a creator subdirectory",
                video_count, content_root,
            )
            continue

        results.append({
            "folder": folder,
            # Creator is always the top-level subdirectory under content_root
            "creator": rel.parts[0],
            "rel_path": rel,
            "video_count": video_count,
        })

    return sorted(results, key=lambda entry: (entry["creator"], str(entry["rel_path"])))


def count_existing_transcripts(output_dir: Path) -> int:
    """
    Count the regular files with a ".json" suffix directly inside output_dir.

    Returns 0 when output_dir does not exist or is not a directory. The scan
    is not recursive, and sub-directories whose name ends in ".json" are not
    counted.
    """
    if not output_dir.is_dir():
        return 0
    return sum(
        1
        for entry in output_dir.iterdir()
        if entry.suffix == ".json" and entry.is_file()
    )


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the batch runner."""
    parser = argparse.ArgumentParser(
        description="Batch transcription runner for Chrysopedia"
    )
    parser.add_argument(
        "--content-root",
        required=True,
        help="Root directory containing creator subdirectories with videos",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Root output directory for transcript JSONs",
    )
    parser.add_argument(
        "--script",
        default=None,
        help="Path to transcribe.py (default: same directory as this script)",
    )
    parser.add_argument(
        "--python",
        default=sys.executable,
        help="Python interpreter to use (default: current interpreter)",
    )
    parser.add_argument(
        "--model",
        default="large-v3",
        help="Whisper model name (default: large-v3)",
    )
    parser.add_argument(
        "--device",
        default="cuda",
        help="Compute device (default: cuda)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=DEFAULT_TIMEOUT_SECONDS,
        help="Per-directory time limit in seconds; 0 or less disables it "
             "(default: %(default)s)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="List what would be transcribed without running",
    )
    return parser


def _build_work_plan(video_dirs: list[dict], output_root: Path) -> list[dict]:
    """Extend each discovered directory with its output folder and progress counts."""
    work_plan = []
    for item in video_dirs:
        # Output mirrors the relative source structure
        out_dir = output_root / item["rel_path"]
        n_existing = count_existing_transcripts(out_dir)
        work_plan.append({
            **item,
            "output": out_dir,
            "existing": n_existing,
            "remaining": item["video_count"] - n_existing,
        })
    return work_plan


def _summarize_by_creator(work_plan: list[dict]) -> dict:
    """Aggregate video, transcript and directory counts per creator."""
    creator_stats: dict = {}
    for item in work_plan:
        stats = creator_stats.setdefault(
            item["creator"],
            {"videos": 0, "existing": 0, "remaining": 0, "dirs": 0},
        )
        stats["videos"] += item["video_count"]
        stats["existing"] += item["existing"]
        stats["remaining"] += item["remaining"]
        stats["dirs"] += 1
    return creator_stats


def _log_plan(content_root: Path, output_root: Path, work_plan: list[dict]) -> None:
    """Log the overall plan followed by one status line per creator."""
    creator_stats = _summarize_by_creator(work_plan)
    total_videos = sum(item["video_count"] for item in work_plan)
    total_existing = sum(item["existing"] for item in work_plan)

    logger.info("=" * 70)
    logger.info("BATCH TRANSCRIPTION PLAN")
    logger.info("=" * 70)
    logger.info("Content root: %s", content_root)
    logger.info("Output root: %s", output_root)
    logger.info("Creators: %d", len(creator_stats))
    logger.info("Directories: %d", len(work_plan))
    logger.info("Total videos: %d", total_videos)
    logger.info("Already done: %d", total_existing)
    logger.info("Remaining: %d", total_videos - total_existing)
    logger.info("=" * 70)

    for name in sorted(creator_stats):
        s = creator_stats[name]
        status = "DONE" if s["remaining"] == 0 else f"{s['remaining']} to do"
        logger.info(
            " %-35s %4d videos (%d dirs), %4d done [%s]",
            name, s["videos"], s["dirs"], s["existing"], status,
        )
    logger.info("=" * 70)


def main(argv: list[str] | None = None) -> int:
    """
    Run the batch: discover video folders, report the plan, transcribe.

    Each folder with remaining work is handed to transcribe.py in its own
    subprocess. A batch_manifest.json describing every attempted folder is
    written to the output root, also when the batch is interrupted.

    Args:
        argv: Command-line arguments without the program name. None (the
            default) makes argparse read sys.argv.

    Returns:
        0 on success (including dry runs and an empty content root), 1 when
        the inputs are invalid or at least one folder failed, 130 when the
        batch was interrupted from the keyboard.
    """
    args = _build_parser().parse_args(argv)

    content_root = Path(args.content_root)
    output_root = Path(args.output_dir)
    if args.script:
        script_path = Path(args.script)
    else:
        script_path = Path(__file__).parent / "transcribe.py"

    if not content_root.is_dir():
        logger.error("Content root does not exist: %s", content_root)
        return 1

    if not script_path.is_file():
        logger.error("transcribe.py not found at: %s", script_path)
        return 1

    # Discover all directories with videos (recursive)
    video_dirs = find_video_dirs(content_root)
    if not video_dirs:
        logger.warning("No video files found anywhere under %s", content_root)
        return 0

    # Build work plan with existing transcript counts
    work_plan = _build_work_plan(video_dirs, output_root)
    _log_plan(content_root, output_root, work_plan)

    if args.dry_run:
        logger.info("DRY RUN — exiting without transcribing.")
        return 0

    # subprocess.run treats timeout=None as "no limit"
    timeout = args.timeout if args.timeout > 0 else None

    # Execute
    manifest = {
        "started_at": datetime.now(timezone.utc).isoformat(),
        "content_root": str(content_root),
        "output_root": str(output_root),
        "model": args.model,
        "device": args.device,
        "results": [],
    }

    total_processed = 0
    total_skipped = 0
    total_failed_dirs = 0
    interrupted = False
    batch_start = time.time()

    try:
        for i, item in enumerate(work_plan, 1):
            if item["remaining"] == 0:
                logger.info("[%d/%d] SKIP %s (all %d videos already transcribed)",
                            i, len(work_plan), item["rel_path"], item["video_count"])
                total_skipped += item["video_count"]
                continue

            logger.info("=" * 70)
            logger.info("[%d/%d] TRANSCRIBING: %s (creator: %s)",
                        i, len(work_plan), item["rel_path"], item["creator"])
            logger.info(" %d videos, %d remaining",
                        item["video_count"], item["remaining"])
            logger.info("=" * 70)

            # Fields shared by every manifest entry for this directory
            record = {
                "creator": item["creator"],
                "rel_path": str(item["rel_path"]),
                "videos": item["video_count"],
            }
            cmd = [
                args.python,
                str(script_path),
                "--input", str(item["folder"]),
                "--output-dir", str(item["output"]),
                "--model", args.model,
                "--device", args.device,
                "--creator", item["creator"],
            ]
            dir_start = time.time()

            try:
                # The child inherits stdout/stderr so its progress is visible live.
                result = subprocess.run(cmd, check=False, timeout=timeout)
                dir_elapsed = time.time() - dir_start

                n_after = count_existing_transcripts(item["output"])
                n_new = n_after - item["existing"]
                # Transcripts written before a failure still exist on disk, so
                # they count as processed whatever the exit code was.
                total_processed += n_new

                manifest["results"].append({
                    **record,
                    "new_transcripts": n_new,
                    "total_transcripts": n_after,
                    "exit_code": result.returncode,
                    "elapsed_seconds": round(dir_elapsed, 1),
                })

                if result.returncode == 0:
                    total_skipped += item["existing"]
                    logger.info("Completed %s: %d new transcripts in %.1f s",
                                item["rel_path"], n_new, dir_elapsed)
                else:
                    total_failed_dirs += 1
                    logger.error("FAILED %s (exit code %d) after %.1f s",
                                 item["rel_path"], result.returncode, dir_elapsed)

            except subprocess.TimeoutExpired:
                total_failed_dirs += 1
                logger.error("TIMEOUT: %s exceeded %g-hour limit",
                             item["rel_path"], timeout / 3600)
                # Keep whatever was finished before the child was killed.
                n_after = count_existing_transcripts(item["output"])
                n_new = n_after - item["existing"]
                total_processed += n_new
                manifest["results"].append({
                    **record,
                    "new_transcripts": n_new,
                    "total_transcripts": n_after,
                    "error": "timeout",
                })

            except Exception as exc:
                total_failed_dirs += 1
                logger.exception("ERROR processing %s: %s", item["rel_path"], exc)
                manifest["results"].append({**record, "error": str(exc)})

    except KeyboardInterrupt:
        # Stop the batch but still record what was done so far.
        interrupted = True
        manifest["interrupted"] = True
        logger.warning("Interrupted by user — writing partial manifest.")

    batch_elapsed = time.time() - batch_start
    manifest["completed_at"] = datetime.now(timezone.utc).isoformat()
    manifest["elapsed_seconds"] = round(batch_elapsed, 1)
    manifest["summary"] = {
        "processed": total_processed,
        "skipped": total_skipped,
        "failed_dirs": total_failed_dirs,
    }

    # Write manifest
    manifest_path = output_root / "batch_manifest.json"
    output_root.mkdir(parents=True, exist_ok=True)
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)

    logger.info("=" * 70)
    logger.info("BATCH INTERRUPTED" if interrupted else "BATCH COMPLETE")
    logger.info(" Processed: %d new transcripts", total_processed)
    logger.info(" Skipped: %d (already existed)", total_skipped)
    logger.info(" Failed dirs: %d", total_failed_dirs)
    logger.info(" Elapsed: %.1f s (%.1f hours)",
                batch_elapsed, batch_elapsed / 3600)
    logger.info(" Manifest: %s", manifest_path)
    logger.info("=" * 70)

    if interrupted:
        return 130
    return 1 if total_failed_dirs > 0 else 0


if __name__ == "__main__":
    sys.exit(main())