chrysopedia/whisper/batch_transcribe.py
jlightner 4b0914b12b fix: restore complete project tree from ub01 canonical state
Auto-mode commit 7aa33cd accidentally deleted 78 files (14,814 lines) during M005
execution. Subsequent commits rebuilt some frontend files but backend/, alembic/,
tests/, whisper/, docker configs, and prompts were never restored in this repo.

This commit restores the full project tree by syncing from ub01's working directory,
which has all M001-M007 features running in production containers.

Restored: backend/ (config, models, routers, database, redis, search_service, worker),
alembic/ (6 migrations), docker/ (Dockerfiles, nginx, compose), prompts/ (4 stages),
tests/, whisper/, README.md, .env.example, chrysopedia-spec.md
2026-03-31 02:10:41 +00:00

319 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Chrysopedia — Batch Transcription Runner
Recursively iterates all creator subdirectories under a content root and runs
transcribe.py against each leaf directory containing videos. Outputs are
written to a parallel directory structure under --output-dir that mirrors
the source hierarchy:
output-dir/
├── au5/
│ ├── Patreon/
│ │ ├── video1.mp4.json
│ │ └── video2.mp4.json
│ └── Youtube/
│ └── ...
├── Skope/
│ └── ...
The --creator flag passed to transcribe.py is always the top-level folder
name (the artist), regardless of subdirectory nesting depth.
Resumable: transcribe.py already skips videos whose output JSON exists.
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger("chrysopedia.batch_transcribe")
# Video container formats the batch runner will hand to transcribe.py.
SUPPORTED_EXTENSIONS = {".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv"}


def find_video_dirs(content_root: Path) -> list[dict]:
    """
    Recursively find all directories under *content_root* containing video files.

    The creator name is the top-level subdirectory under content_root.
    Bug fix: if videos sit directly inside content_root itself, the relative
    path has no parts and the original ``rel.parts[0]`` raised IndexError;
    we now fall back to the root directory's own name as the creator.

    Returns:
        A list of dicts sorted by (creator, relative path), each containing:
        ``folder`` (Path) — absolute directory holding the videos,
        ``creator`` (str) — artist name for the --creator flag,
        ``rel_path`` (Path) — path relative to content_root,
        ``video_count`` (int) — number of matching video files.
    """
    results: list[dict] = []
    for dirpath, _dirnames, filenames in os.walk(content_root):
        folder = Path(dirpath)
        # Extension match is case-insensitive (".MP4" counts).
        videos = [
            f for f in filenames
            if Path(f).suffix.lower() in SUPPORTED_EXTENSIONS
        ]
        if not videos:
            continue
        rel = folder.relative_to(content_root)
        # Creator is always the top-level subdirectory; when videos live
        # directly in content_root (rel == "."), use the root's own name
        # instead of crashing on an empty parts tuple.
        creator = rel.parts[0] if rel.parts else content_root.name
        results.append({
            "folder": folder,
            "creator": creator,
            "rel_path": rel,
            "video_count": len(videos),
        })
    # Deterministic order regardless of os.walk traversal order.
    return sorted(results, key=lambda x: (x["creator"], str(x["rel_path"])))
def count_existing_transcripts(output_dir: Path) -> int:
    """Return how many transcript JSON files already exist in *output_dir*.

    A missing directory means nothing has been transcribed there yet, so it
    counts as zero rather than raising.
    """
    if not output_dir.exists():
        return 0
    json_entries = [entry for entry in output_dir.iterdir() if entry.suffix == ".json"]
    return len(json_entries)
def main() -> int:
    """Parse CLI args, plan the batch, and run transcribe.py per directory.

    Returns a process exit code: 0 on success or dry run, 1 when validation
    fails or at least one directory failed / timed out.
    """
    parser = argparse.ArgumentParser(
        description="Batch transcription runner for Chrysopedia"
    )
    parser.add_argument(
        "--content-root",
        required=True,
        help="Root directory containing creator subdirectories with videos",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Root output directory for transcript JSONs",
    )
    parser.add_argument(
        "--script",
        default=None,
        help="Path to transcribe.py (default: same directory as this script)",
    )
    parser.add_argument(
        "--python",
        default=sys.executable,
        help="Python interpreter to use (default: current interpreter)",
    )
    parser.add_argument(
        "--model",
        default="large-v3",
        help="Whisper model name (default: large-v3)",
    )
    parser.add_argument(
        "--device",
        default="cuda",
        help="Compute device (default: cuda)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="List what would be transcribed without running",
    )
    args = parser.parse_args()
    content_root = Path(args.content_root)
    output_root = Path(args.output_dir)
    # transcribe.py is expected to live next to this script unless overridden.
    script_path = Path(args.script) if args.script else Path(__file__).parent / "transcribe.py"
    # Fail fast on bad paths before any work is planned.
    if not content_root.is_dir():
        logger.error("Content root does not exist: %s", content_root)
        return 1
    if not script_path.is_file():
        logger.error("transcribe.py not found at: %s", script_path)
        return 1
    # Discover all directories with videos (recursive)
    video_dirs = find_video_dirs(content_root)
    if not video_dirs:
        logger.warning("No video files found anywhere under %s", content_root)
        return 0
    # Build work plan with existing transcript counts so already-finished
    # directories can be skipped entirely (the run is resumable).
    total_videos = 0
    total_existing = 0
    work_plan = []
    for item in video_dirs:
        # Output mirrors the relative source structure
        out_dir = output_root / item["rel_path"]
        n_existing = count_existing_transcripts(out_dir)
        n_remaining = item["video_count"] - n_existing
        total_videos += item["video_count"]
        total_existing += n_existing
        work_plan.append({
            **item,
            "output": out_dir,
            "existing": n_existing,
            "remaining": n_remaining,
        })
    # Aggregate by creator for summary
    creator_stats = {}
    for item in work_plan:
        c = item["creator"]
        if c not in creator_stats:
            creator_stats[c] = {"videos": 0, "existing": 0, "remaining": 0, "dirs": 0}
        creator_stats[c]["videos"] += item["video_count"]
        creator_stats[c]["existing"] += item["existing"]
        creator_stats[c]["remaining"] += item["remaining"]
        creator_stats[c]["dirs"] += 1
    # ---- Plan summary banner ----
    logger.info("=" * 70)
    logger.info("BATCH TRANSCRIPTION PLAN")
    logger.info("=" * 70)
    logger.info("Content root: %s", content_root)
    logger.info("Output root: %s", output_root)
    logger.info("Creators: %d", len(creator_stats))
    logger.info("Directories: %d", len(work_plan))
    logger.info("Total videos: %d", total_videos)
    logger.info("Already done: %d", total_existing)
    logger.info("Remaining: %d", total_videos - total_existing)
    logger.info("=" * 70)
    for name in sorted(creator_stats.keys()):
        s = creator_stats[name]
        status = "DONE" if s["remaining"] == 0 else f"{s['remaining']} to do"
        logger.info(
            " %-35s %4d videos (%d dirs), %4d done [%s]",
            name, s["videos"], s["dirs"], s["existing"], status,
        )
    logger.info("=" * 70)
    if args.dry_run:
        logger.info("DRY RUN — exiting without transcribing.")
        return 0
    # Execute: one transcribe.py subprocess per directory, recorded in a
    # manifest that is written out once at the end of the batch.
    manifest = {
        "started_at": datetime.now(timezone.utc).isoformat(),
        "content_root": str(content_root),
        "output_root": str(output_root),
        "model": args.model,
        "device": args.device,
        "results": [],
    }
    total_processed = 0
    total_skipped = 0
    total_failed_dirs = 0
    batch_start = time.time()
    for i, item in enumerate(work_plan, 1):
        # Nothing remaining — transcribe.py would skip every video anyway.
        if item["remaining"] == 0:
            logger.info("[%d/%d] SKIP %s (all %d videos already transcribed)",
                        i, len(work_plan), item["rel_path"], item["video_count"])
            total_skipped += item["video_count"]
            continue
        logger.info("=" * 70)
        logger.info("[%d/%d] TRANSCRIBING: %s (creator: %s)",
                    i, len(work_plan), item["rel_path"], item["creator"])
        logger.info(" %d videos, %d remaining",
                    item["video_count"], item["remaining"])
        logger.info("=" * 70)
        dir_start = time.time()
        # Creator flag is always the top-level folder name, per module docstring.
        cmd = [
            args.python,
            str(script_path),
            "--input", str(item["folder"]),
            "--output-dir", str(item["output"]),
            "--model", args.model,
            "--device", args.device,
            "--creator", item["creator"],
        ]
        try:
            # capture_output=False: child inherits our stdout/stderr so its
            # progress output streams live to the console.
            result = subprocess.run(
                cmd,
                capture_output=False,
                text=True,
                timeout=14400,  # 4-hour timeout per directory
            )
            dir_elapsed = time.time() - dir_start
            # Infer how many transcripts the child produced by re-counting
            # the output directory (the child itself reports nothing back).
            n_after = count_existing_transcripts(item["output"])
            n_new = n_after - item["existing"]
            manifest["results"].append({
                "creator": item["creator"],
                "rel_path": str(item["rel_path"]),
                "videos": item["video_count"],
                "new_transcripts": n_new,
                "total_transcripts": n_after,
                "exit_code": result.returncode,
                "elapsed_seconds": round(dir_elapsed, 1),
            })
            if result.returncode == 0:
                total_processed += n_new
                total_skipped += item["existing"]
                logger.info("Completed %s: %d new transcripts in %.1f s",
                            item["rel_path"], n_new, dir_elapsed)
            else:
                # Non-zero exit marks the whole directory failed; the next
                # run can resume it since finished JSONs are skipped.
                total_failed_dirs += 1
                logger.error("FAILED %s (exit code %d) after %.1f s",
                             item["rel_path"], result.returncode, dir_elapsed)
        except subprocess.TimeoutExpired:
            total_failed_dirs += 1
            logger.error("TIMEOUT: %s exceeded 4-hour limit", item["rel_path"])
            manifest["results"].append({
                "creator": item["creator"],
                "rel_path": str(item["rel_path"]),
                "videos": item["video_count"],
                "error": "timeout",
            })
        except Exception as exc:
            # Broad catch keeps a single bad directory from killing the batch;
            # the failure is logged with traceback and recorded in the manifest.
            total_failed_dirs += 1
            logger.exception("ERROR processing %s: %s", item["rel_path"], exc)
            manifest["results"].append({
                "creator": item["creator"],
                "rel_path": str(item["rel_path"]),
                "videos": item["video_count"],
                "error": str(exc),
            })
    batch_elapsed = time.time() - batch_start
    manifest["completed_at"] = datetime.now(timezone.utc).isoformat()
    manifest["elapsed_seconds"] = round(batch_elapsed, 1)
    manifest["summary"] = {
        "processed": total_processed,
        "skipped": total_skipped,
        "failed_dirs": total_failed_dirs,
    }
    # Write manifest
    manifest_path = output_root / "batch_manifest.json"
    output_root.mkdir(parents=True, exist_ok=True)
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    logger.info("=" * 70)
    logger.info("BATCH COMPLETE")
    logger.info(" Processed: %d new transcripts", total_processed)
    logger.info(" Skipped: %d (already existed)", total_skipped)
    logger.info(" Failed dirs: %d", total_failed_dirs)
    logger.info(" Elapsed: %.1f s (%.1f hours)", batch_elapsed, batch_elapsed / 3600)
    logger.info(" Manifest: %s", manifest_path)
    logger.info("=" * 70)
    # Exit non-zero if anything failed so cron/CI can detect partial batches.
    return 1 if total_failed_dirs > 0 else 0
if __name__ == "__main__":
    # Raising SystemExit is equivalent to sys.exit(): the interpreter exits
    # with main()'s return value as the process exit code.
    raise SystemExit(main())