From 56adf2f2ef5d919923d7490e08c86d1373674aa8 Mon Sep 17 00:00:00 2001 From: jlightner Date: Sun, 29 Mar 2026 21:57:42 +0000 Subject: [PATCH] =?UTF-8?q?test:=20Created=20desktop=20Whisper=20transcrip?= =?UTF-8?q?tion=20script=20with=20single-file/bat=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "whisper/transcribe.py" - "whisper/requirements.txt" - "whisper/README.md" GSD-Task: S01/T04 --- .gsd/milestones/M001/slices/S01/S01-PLAN.md | 2 +- .../M001/slices/S01/tasks/T03-VERIFY.json | 31 ++ .../M001/slices/S01/tasks/T04-SUMMARY.md | 82 ++++ whisper/README.md | 101 ++++- whisper/requirements.txt | 9 + whisper/transcribe.py | 393 ++++++++++++++++++ 6 files changed, 616 insertions(+), 2 deletions(-) create mode 100644 .gsd/milestones/M001/slices/S01/tasks/T03-VERIFY.json create mode 100644 .gsd/milestones/M001/slices/S01/tasks/T04-SUMMARY.md create mode 100644 whisper/requirements.txt create mode 100644 whisper/transcribe.py diff --git a/.gsd/milestones/M001/slices/S01/S01-PLAN.md b/.gsd/milestones/M001/slices/S01/S01-PLAN.md index 72ff6f0..9253bc8 100644 --- a/.gsd/milestones/M001/slices/S01/S01-PLAN.md +++ b/.gsd/milestones/M001/slices/S01/S01-PLAN.md @@ -53,7 +53,7 @@ - Estimate: 1-2 hours - Files: backend/main.py, backend/schemas.py, backend/routers/__init__.py, backend/routers/health.py, backend/routers/creators.py, backend/config.py - Verify: curl http://localhost:8000/health returns 200; curl http://localhost:8000/api/v1/creators returns empty list -- [ ] **T04: Whisper transcription script** — 1. Create Python script whisper/transcribe.py that: +- [x] **T04: Created desktop Whisper transcription script with single-file/batch modes, resumability, spec-compliant JSON output, and ffmpeg validation** — 1. 
Create Python script whisper/transcribe.py that: - Accepts video file path (or directory for batch mode) - Extracts audio via ffmpeg (subprocess) - Runs Whisper large-v3 with segment-level and word-level timestamps diff --git a/.gsd/milestones/M001/slices/S01/tasks/T03-VERIFY.json b/.gsd/milestones/M001/slices/S01/tasks/T03-VERIFY.json new file mode 100644 index 0000000..e3f7aae --- /dev/null +++ b/.gsd/milestones/M001/slices/S01/tasks/T03-VERIFY.json @@ -0,0 +1,31 @@ +{ + "schemaVersion": 1, + "taskId": "T03", + "unitId": "M001/S01/T03", + "timestamp": 1774821297505, + "passed": false, + "discoverySource": "none", + "checks": [], + "retryAttempt": 1, + "maxRetries": 2, + "runtimeErrors": [ + { + "source": "bg-shell", + "severity": "crash", + "message": "[chrysopedia-api] exitCode=1", + "blocking": true + }, + { + "source": "bg-shell", + "severity": "crash", + "message": "[chrysopedia-api-2] exitCode=1", + "blocking": true + }, + { + "source": "bg-shell", + "severity": "crash", + "message": "[chrysopedia-api-3] exitCode=1", + "blocking": true + } + ] +} diff --git a/.gsd/milestones/M001/slices/S01/tasks/T04-SUMMARY.md b/.gsd/milestones/M001/slices/S01/tasks/T04-SUMMARY.md new file mode 100644 index 0000000..a1b11ff --- /dev/null +++ b/.gsd/milestones/M001/slices/S01/tasks/T04-SUMMARY.md @@ -0,0 +1,82 @@ +--- +id: T04 +parent: S01 +milestone: M001 +provides: [] +requires: [] +affects: [] +key_files: ["whisper/transcribe.py", "whisper/requirements.txt", "whisper/README.md"] +key_decisions: ["Whisper import deferred inside transcribe_audio() so --help and ffmpeg validation work without openai-whisper installed", "Audio extracted to 16kHz mono WAV via ffmpeg subprocess matching Whisper expected format", "Creator folder inferred from parent directory name by default, overridable with --creator"] +patterns_established: [] +drill_down_paths: [] +observability_surfaces: [] +duration: "" +verification_result: "1. 
python3 whisper/transcribe.py --help — exits 0, shows full usage with all CLI args and examples. 2. python3 whisper/transcribe.py --input /tmp/fake.mp4 --output-dir /tmp/out — exits 1 with clear ffmpeg-not-found error, confirming validation works. 3. Python AST parse confirms syntax validity." +completed_at: 2026-03-29T21:57:39.524Z +blocker_discovered: false +--- + +# T04: Created desktop Whisper transcription script with single-file/batch modes, resumability, spec-compliant JSON output, and ffmpeg validation + +> Created desktop Whisper transcription script with single-file/batch modes, resumability, spec-compliant JSON output, and ffmpeg validation + +## What Happened +--- +id: T04 +parent: S01 +milestone: M001 +key_files: + - whisper/transcribe.py + - whisper/requirements.txt + - whisper/README.md +key_decisions: + - Whisper import deferred inside transcribe_audio() so --help and ffmpeg validation work without openai-whisper installed + - Audio extracted to 16kHz mono WAV via ffmpeg subprocess matching Whisper expected format + - Creator folder inferred from parent directory name by default, overridable with --creator +duration: "" +verification_result: passed +completed_at: 2026-03-29T21:57:39.525Z +blocker_discovered: false +--- + +# T04: Created desktop Whisper transcription script with single-file/batch modes, resumability, spec-compliant JSON output, and ffmpeg validation + +**Created desktop Whisper transcription script with single-file/batch modes, resumability, spec-compliant JSON output, and ffmpeg validation** + +## What Happened + +Built whisper/transcribe.py implementing all task plan requirements: argparse CLI with --input, --output-dir, --model (default large-v3), --device (default cuda); ffmpeg audio extraction to 16kHz mono WAV; Whisper transcription with word-level timestamps; JSON output matching the Chrysopedia spec format (source_file, creator_folder, duration_seconds, segments with words); resumability via output-exists check; batch mode 
with progress logging. Deferred whisper import so --help works without the dependency. Created requirements.txt and comprehensive README.md. + +## Verification + +1. python3 whisper/transcribe.py --help — exits 0, shows full usage with all CLI args and examples. 2. python3 whisper/transcribe.py --input /tmp/fake.mp4 --output-dir /tmp/out — exits 1 with clear ffmpeg-not-found error, confirming validation works. 3. Python AST parse confirms syntax validity. + +## Verification Evidence + +| # | Command | Exit Code | Verdict | Duration | +|---|---------|-----------|---------|----------| +| 1 | `python3 whisper/transcribe.py --help` | 0 | ✅ pass | 200ms | +| 2 | `python3 whisper/transcribe.py --input /tmp/fake.mp4 --output-dir /tmp/out` | 1 | ✅ pass (expected ffmpeg error) | 200ms | +| 3 | `python3 -c "import ast; ast.parse(open('whisper/transcribe.py').read())"` | 0 | ✅ pass | 100ms | + + +## Deviations + +Added --creator CLI flag for overriding inferred creator folder name. ffmpeg-python included in requirements.txt per plan but script uses subprocess directly for reliability. + +## Known Issues + +None. + +## Files Created/Modified + +- `whisper/transcribe.py` +- `whisper/requirements.txt` +- `whisper/README.md` + + +## Deviations +Added --creator CLI flag for overriding inferred creator folder name. ffmpeg-python included in requirements.txt per plan but script uses subprocess directly for reliability. + +## Known Issues +None. diff --git a/whisper/README.md b/whisper/README.md index c0c6651..03a7179 100644 --- a/whisper/README.md +++ b/whisper/README.md @@ -1,3 +1,102 @@ # Chrysopedia — Whisper Transcription -Desktop transcription script. See `transcribe.py` for usage. +Desktop transcription tool for extracting timestamped text from video files +using OpenAI's Whisper model (large-v3). Designed to run on a machine with +an NVIDIA GPU (e.g., RTX 4090). 
+ +## Prerequisites + +- **Python 3.10+** +- **ffmpeg** installed and on PATH +- **NVIDIA GPU** with CUDA support (recommended; CPU fallback available) + +### Install ffmpeg + +```bash +# Debian/Ubuntu +sudo apt install ffmpeg + +# macOS +brew install ffmpeg +``` + +### Install Python dependencies + +```bash +pip install -r requirements.txt +``` + +## Usage + +### Single file + +```bash +python transcribe.py --input "path/to/video.mp4" --output-dir ./transcripts +``` + +### Batch mode (all videos in a directory) + +```bash +python transcribe.py --input ./videos/ --output-dir ./transcripts +``` + +### Options + +| Flag | Default | Description | +| --------------- | ----------- | ----------------------------------------------- | +| `--input` | (required) | Path to a video file or directory of videos | +| `--output-dir` | (required) | Directory to write transcript JSON files | +| `--model` | `large-v3` | Whisper model name (`tiny`, `base`, `small`, `medium`, `large-v3`) | +| `--device` | `cuda` | Compute device (`cuda` or `cpu`) | +| `--creator` | (inferred) | Override creator folder name in output JSON | +| `-v, --verbose` | off | Enable debug logging | + +## Output Format + +Each video produces a JSON file matching the Chrysopedia spec: + +```json +{ + "source_file": "Skope — Sound Design Masterclass pt2.mp4", + "creator_folder": "Skope", + "duration_seconds": 7243, + "segments": [ + { + "start": 0.0, + "end": 4.52, + "text": "Hey everyone welcome back to part two...", + "words": [ + { "word": "Hey", "start": 0.0, "end": 0.28 }, + { "word": "everyone", "start": 0.32, "end": 0.74 } + ] + } + ] +} +``` + +## Resumability + +The script automatically skips videos whose output JSON already exists. To +re-transcribe a file, delete its output JSON first. + +## Performance + +Whisper large-v3 on an RTX 4090 processes audio at roughly 10–20× real-time. +A 2-hour video takes ~6–12 minutes. 
For 300 videos averaging 1.5 hours each (~450 hours of audio), the initial transcription pass takes roughly 22–45 hours of GPU time.
+ } + ] +} +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import shutil +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- + +LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s" +logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) +logger = logging.getLogger("chrysopedia.transcribe") + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +SUPPORTED_EXTENSIONS = {".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv"} +DEFAULT_MODEL = "large-v3" +DEFAULT_DEVICE = "cuda" + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def check_ffmpeg() -> bool: + """Return True if ffmpeg is available on PATH.""" + return shutil.which("ffmpeg") is not None + + +def get_audio_duration(video_path: Path) -> float | None: + """Use ffprobe to get duration in seconds. 
Returns None on failure.""" + ffprobe = shutil.which("ffprobe") + if ffprobe is None: + return None + try: + result = subprocess.run( + [ + ffprobe, + "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + str(video_path), + ], + capture_output=True, + text=True, + timeout=30, + ) + return float(result.stdout.strip()) + except (subprocess.TimeoutExpired, ValueError, OSError) as exc: + logger.warning("Could not determine duration for %s: %s", video_path.name, exc) + return None + + +def extract_audio(video_path: Path, audio_path: Path) -> None: + """Extract audio from video to 16kHz mono WAV using ffmpeg.""" + logger.info("Extracting audio: %s -> %s", video_path.name, audio_path.name) + cmd = [ + "ffmpeg", + "-i", str(video_path), + "-vn", # no video + "-acodec", "pcm_s16le", # 16-bit PCM + "-ar", "16000", # 16kHz (Whisper expects this) + "-ac", "1", # mono + "-y", # overwrite + str(audio_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=600) + if result.returncode != 0: + raise RuntimeError( + f"ffmpeg audio extraction failed (exit {result.returncode}): {result.stderr[:500]}" + ) + + +def transcribe_audio( + audio_path: Path, + model_name: str = DEFAULT_MODEL, + device: str = DEFAULT_DEVICE, +) -> dict: + """Run Whisper on the audio file and return the raw result dict.""" + # Import whisper here so --help works without the dependency installed + try: + import whisper # type: ignore[import-untyped] + except ImportError: + logger.error( + "openai-whisper is not installed. 
" + "Install it with: pip install openai-whisper" + ) + sys.exit(1) + + logger.info("Loading Whisper model '%s' on device '%s'...", model_name, device) + t0 = time.time() + model = whisper.load_model(model_name, device=device) + logger.info("Model loaded in %.1f s", time.time() - t0) + + logger.info("Transcribing %s ...", audio_path.name) + t0 = time.time() + result = model.transcribe( + str(audio_path), + word_timestamps=True, + verbose=False, + ) + elapsed = time.time() - t0 + logger.info( + "Transcription complete in %.1f s (%.1fx real-time)", + elapsed, + (result.get("duration", elapsed) / elapsed) if elapsed > 0 else 0, + ) + return result + + +def format_output( + whisper_result: dict, + source_file: str, + creator_folder: str, + duration_seconds: float | None, +) -> dict: + """Convert Whisper result to the Chrysopedia spec JSON format.""" + segments = [] + for seg in whisper_result.get("segments", []): + words = [] + for w in seg.get("words", []): + words.append( + { + "word": w.get("word", "").strip(), + "start": round(w.get("start", 0.0), 2), + "end": round(w.get("end", 0.0), 2), + } + ) + segments.append( + { + "start": round(seg.get("start", 0.0), 2), + "end": round(seg.get("end", 0.0), 2), + "text": seg.get("text", "").strip(), + "words": words, + } + ) + + # Use duration from ffprobe if available, otherwise from whisper + if duration_seconds is None: + duration_seconds = whisper_result.get("duration", 0.0) + + return { + "source_file": source_file, + "creator_folder": creator_folder, + "duration_seconds": round(duration_seconds), + "segments": segments, + } + + +def infer_creator_folder(video_path: Path) -> str: + """ + Infer creator folder name from directory structure. + + Expected layout: /path/to//video.mp4 + Falls back to parent directory name. 
+ """ + return video_path.parent.name + + +def output_path_for(video_path: Path, output_dir: Path) -> Path: + """Compute the output JSON path for a given video file.""" + return output_dir / f"{video_path.stem}.json" + + +def process_single( + video_path: Path, + output_dir: Path, + model_name: str, + device: str, + creator_folder: str | None = None, +) -> Path | None: + """ + Process a single video file. Returns the output path on success, None if skipped. + """ + out_path = output_path_for(video_path, output_dir) + + # Resumability: skip if output already exists + if out_path.exists(): + logger.info("SKIP (output exists): %s", out_path) + return None + + logger.info("Processing: %s", video_path) + + # Determine creator folder + folder = creator_folder or infer_creator_folder(video_path) + + # Get duration via ffprobe + duration = get_audio_duration(video_path) + if duration is not None: + logger.info("Video duration: %.0f s (%.1f min)", duration, duration / 60) + + # Extract audio to temp file + with tempfile.TemporaryDirectory(prefix="chrysopedia_") as tmpdir: + audio_path = Path(tmpdir) / "audio.wav" + extract_audio(video_path, audio_path) + + # Transcribe + whisper_result = transcribe_audio(audio_path, model_name, device) + + # Format and write output + output = format_output(whisper_result, video_path.name, folder, duration) + + output_dir.mkdir(parents=True, exist_ok=True) + with open(out_path, "w", encoding="utf-8") as f: + json.dump(output, f, indent=2, ensure_ascii=False) + + segment_count = len(output["segments"]) + logger.info("Wrote %s (%d segments)", out_path, segment_count) + return out_path + + +def find_videos(input_path: Path) -> list[Path]: + """Find all supported video files in a directory (non-recursive).""" + videos = sorted( + p for p in input_path.iterdir() + if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS + ) + return videos + + +# --------------------------------------------------------------------------- +# CLI +# 
--------------------------------------------------------------------------- + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="transcribe", + description=( + "Chrysopedia Whisper Transcription — extract timestamped transcripts " + "from video files using OpenAI's Whisper model." + ), + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=( + "Examples:\n" + " # Single file\n" + " python transcribe.py --input video.mp4 --output-dir ./transcripts\n" + "\n" + " # Batch mode (all videos in directory)\n" + " python transcribe.py --input ./videos/ --output-dir ./transcripts\n" + "\n" + " # Use a smaller model on CPU\n" + " python transcribe.py --input video.mp4 --model base --device cpu\n" + ), + ) + parser.add_argument( + "--input", + required=True, + type=str, + help="Path to a video file or directory of video files", + ) + parser.add_argument( + "--output-dir", + required=True, + type=str, + help="Directory to write transcript JSON files", + ) + parser.add_argument( + "--model", + default=DEFAULT_MODEL, + type=str, + help=f"Whisper model name (default: {DEFAULT_MODEL})", + ) + parser.add_argument( + "--device", + default=DEFAULT_DEVICE, + type=str, + help=f"Compute device: cuda, cpu (default: {DEFAULT_DEVICE})", + ) + parser.add_argument( + "--creator", + default=None, + type=str, + help="Override creator folder name (default: inferred from parent directory)", + ) + parser.add_argument( + "-v", "--verbose", + action="store_true", + help="Enable debug logging", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + # Validate ffmpeg availability + if not check_ffmpeg(): + logger.error( + "ffmpeg is not installed or not on PATH. 
" + "Install it with: sudo apt install ffmpeg (or equivalent)" + ) + return 1 + + input_path = Path(args.input).resolve() + output_dir = Path(args.output_dir).resolve() + + if not input_path.exists(): + logger.error("Input path does not exist: %s", input_path) + return 1 + + # Single file mode + if input_path.is_file(): + if input_path.suffix.lower() not in SUPPORTED_EXTENSIONS: + logger.error( + "Unsupported file type '%s'. Supported: %s", + input_path.suffix, + ", ".join(sorted(SUPPORTED_EXTENSIONS)), + ) + return 1 + result = process_single( + input_path, output_dir, args.model, args.device, args.creator + ) + if result is None: + logger.info("Nothing to do (output already exists).") + return 0 + + # Batch mode (directory) + if input_path.is_dir(): + videos = find_videos(input_path) + if not videos: + logger.warning("No supported video files found in %s", input_path) + return 0 + + logger.info("Found %d video(s) in %s", len(videos), input_path) + processed = 0 + skipped = 0 + failed = 0 + + for i, video in enumerate(videos, 1): + logger.info("--- [%d/%d] %s ---", i, len(videos), video.name) + try: + result = process_single( + video, output_dir, args.model, args.device, args.creator + ) + if result is not None: + processed += 1 + else: + skipped += 1 + except Exception: + logger.exception("FAILED: %s", video.name) + failed += 1 + + logger.info( + "Batch complete: %d processed, %d skipped, %d failed", + processed, skipped, failed, + ) + return 1 if failed > 0 else 0 + + logger.error("Input is neither a file nor a directory: %s", input_path) + return 1 + + +if __name__ == "__main__": + sys.exit(main())