From b9cc6121f338de2331156df683e72e63ad30b309 Mon Sep 17 00:00:00 2001 From: John Lightner Date: Sat, 11 Apr 2026 02:07:27 -0500 Subject: [PATCH] docs(01-core-pipeline): create phase plan --- .planning/ROADMAP.md | 7 +- .../phases/01-core-pipeline/01-01-PLAN.md | 197 ++++++++++++++++++ .../phases/01-core-pipeline/01-02-PLAN.md | 144 +++++++++++++ 3 files changed, 344 insertions(+), 4 deletions(-) create mode 100644 .planning/phases/01-core-pipeline/01-01-PLAN.md create mode 100644 .planning/phases/01-core-pipeline/01-02-PLAN.md diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index 4851e31..f2c20a6 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -27,12 +27,11 @@ This roadmap delivers a voice-to-instrument pipeline built on ACE-Step 1.5 XL-SF 3. Output audio preserves the rhythmic timing of the input humming 4. Output sounds like a coherent instrument performance, not garbled noise 5. User can specify the target instrument (e.g., piano, guitar) and the output reflects that instrument -**Plans**: TBD +**Plans**: 2 plans Plans: -- [ ] 01-01: TBD -- [ ] 01-02: TBD -- [ ] 01-03: TBD +- [ ] 01-01-PLAN.md — Build hum2inst.py CLI pipeline and archive old scripts +- [ ] 01-02-PLAN.md — End-to-end test and human verification of melody fidelity ### Phase 2: Instrument Variety & Fidelity Control **Goal**: User can choose from multiple instruments that sound distinctly different, and control how closely the output follows the input melody diff --git a/.planning/phases/01-core-pipeline/01-01-PLAN.md b/.planning/phases/01-core-pipeline/01-01-PLAN.md new file mode 100644 index 0000000..0b663ae --- /dev/null +++ b/.planning/phases/01-core-pipeline/01-01-PLAN.md @@ -0,0 +1,197 @@ +--- +phase: 01-core-pipeline +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - hum2inst.py + - archive/midi_to_audio.py + - archive/musicgen_melody.py +autonomous: true +requirements: + - MEL-01 + - MEL-02 + - MEL-04 + - INST-01 + - INP-01 + - OUT-02 + - PIPE-01 
+ +must_haves: + truths: + - "User can run `python hum2inst.py input.wav --instrument piano` and get a WAV file in ./output/" + - "Script reads input WAV duration and passes it to ACE-Step so output length matches input" + - "Script builds a caption from the instrument name that guides timbre selection" + - "Script checks for CUDA GPU and exits with clear error if unavailable" + - "Script exits with non-zero code when generation fails, and prints a warning to stderr when the output is near-silent" + - "Output WAV filename includes instrument name and timestamp" + - "Old experimental scripts are moved to archive/ and no longer in project root" + artifacts: + - path: "hum2inst.py" + provides: "Complete CLI pipeline: argparse, CUDA check, ACE-Step init, generation, output rename, error handling" + min_lines: 100 + - path: "archive/midi_to_audio.py" + provides: "Archived experimental script" + - path: "archive/musicgen_melody.py" + provides: "Archived experimental script" + key_links: + - from: "hum2inst.py" + to: "acestep.inference.generate_music" + via: "direct Python import with sys.path insert" + pattern: "from acestep\\.inference import" + - from: "hum2inst.py" + to: "acestep.handler.AceStepHandler" + via: "handler initialization with XL-SFT config" + pattern: "initialize_service.*acestep-v15-xl-sft" + - from: "hum2inst.py" + to: "torchaudio.info" + via: "duration detection from input WAV" + pattern: "torchaudio\\.info" +--- + + +Build the complete hum-to-instrument CLI pipeline as a single Python script, and archive old experimental scripts. + +Purpose: Deliver the core end-to-end functionality -- a user hums into a WAV file, runs one command, and gets an instrument rendition that follows their melody. +Output: Working `hum2inst.py` script at project root, old scripts moved to `archive/`. 
+ + + +@C:/Users/jlightner/.claude/get-shit-done/workflows/execute-plan.md +@C:/Users/jlightner/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/01-core-pipeline/01-CONTEXT.md +@.planning/phases/01-core-pipeline/01-RESEARCH.md + + + + + + Task 1: Archive experimental scripts + archive/midi_to_audio.py, archive/musicgen_melody.py + +Create an `archive/` directory at the project root. Move `midi_to_audio.py` and `musicgen_melody.py` from the project root into `archive/`. Use git mv if the repo tracks these files, otherwise use regular file move. + +Verify both files exist at their new locations and are removed from the project root. + + + ls archive/midi_to_audio.py archive/musicgen_melody.py && test ! -f midi_to_audio.py && test ! -f musicgen_melody.py && echo "PASS" + + Both experimental scripts exist in archive/ and are gone from project root + + + + Task 2: Create hum2inst.py CLI pipeline script + hum2inst.py + +Create `hum2inst.py` at the project root. This is a single-file CLI script that wraps ACE-Step XL-SFT cover mode. The script must implement ALL of the following (per user decisions in CONTEXT.md): + +**CLI interface (argparse):** +- Positional argument: input WAV file path +- `--instrument` (required): target instrument name (e.g., piano, guitar, saxophone) +- `--output` (optional): output directory, defaults to `./output/` +- `--strength` (optional): audio_cover_strength float, defaults to 0.9 (high fidelity per user preference, within locked range 0.8-1.0) +- `--duration` (optional): override output duration in seconds; if not provided, auto-detect from input WAV + +**Startup checks:** +- Validate input WAV file exists and is readable; exit 1 with clear message if not +- Check `torch.cuda.is_available()`; exit 1 with message "ERROR: CUDA GPU required. No CUDA-capable GPU detected." 
if false +- Create output directory if it doesn't exist + +**ACE-Step initialization (per RESEARCH.md Pattern 1):** +- Add ace-step directory to sys.path: `sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "ace-step"))` +- Import from acestep: `AceStepHandler`, `LLMHandler`, `GenerationParams`, `GenerationConfig`, `generate_music`, `get_gpu_config`, `set_global_gpu_config` +- Call `get_gpu_config()` and `set_global_gpu_config()` +- Initialize `AceStepHandler` and call `initialize_service()` with `project_root` pointing to the ace-step directory, `config_path="acestep-v15-xl-sft"`, `device="cuda"`, and `use_flash_attention` from handler detection +- Initialize `LLMHandler()` (instantiate only, no model load -- cover mode is in skip_lm_tasks) + +**Duration detection (per RESEARCH.md Pattern 3):** +- Use `torchaudio.info(input_path)` to get `num_frames` and `sample_rate` +- Compute duration as `num_frames / sample_rate` +- Round to nearest integer for the `duration` parameter +- Print detected duration: "Input duration: {duration}s" + +**Caption construction (per RESEARCH.md Pattern 4 + code examples):** +- Include a dictionary of curated caption templates for common instruments: piano, guitar, saxophone, violin, flute (use the exact templates from RESEARCH.md code examples section) +- For instruments not in the dictionary, use generic fallback: `f"solo {instrument}, clear and expressive melody, warm tone"` +- Print the caption being used: "Caption: {caption}" + +**Generation (per RESEARCH.md Pattern 2):** +- Create `GenerationParams` with: + - `task_type="cover"` + - `src_audio` = absolute path to input WAV + - `caption` = built caption string + - `lyrics=""` + - `instrumental=True` + - `duration` = detected or overridden duration (rounded int) + - `audio_cover_strength` = user's --strength value (default 0.9) + - `inference_steps=50` (XL-SFT model requirement -- NOT 8 which is turbo) + - `guidance_scale=5.0` + - `thinking=False` (no LLM 
needed for cover) + - `bpm=120` (default, not critical for cover mode) +- Create `GenerationConfig` with `batch_size=1`, `use_random_seed=True`, `audio_format="wav"` +- Call `generate_music(dit_handler, llm_handler, params, config, save_dir=temp_output_dir)` where temp_output_dir is a temporary subdirectory to isolate ACE-Step's UUID-named output +- Print progress: "Generating {instrument} cover..." before the call + +**Output handling (per RESEARCH.md Pitfall 2 + user decisions):** +- After generation, check `result.success` and `result.audios` -- if failed, print error to stderr and exit 1 +- Get the generated file path from the result +- Rename/copy to user-friendly filename: `{instrument}_{YYYYMMDD}_{HHMMSS}.wav` in the user's output directory +- Use `datetime.now().strftime("%Y%m%d_%H%M%S")` for the timestamp +- Print the final output path: "Output saved: {path}" + +**Silence detection (per RESEARCH.md Pitfall 4 + user decisions):** +- After generation, load the output audio with torchaudio and compute RMS energy +- If RMS is below a conservative threshold (e.g., -60 dBFS), print warning to stderr: "WARNING: Output audio appears to be near-silent. The generation may have failed." 
+- Do NOT hard-fail on borderline cases -- just warn (per research recommendation) + +**Error handling (per user decisions):** +- Wrap the main execution in try/except +- On any exception during generation: print clear error message to stderr and exit 1 +- No silent failures -- every error path must print something useful + +**Progress feedback (Claude's discretion):** +- Print status messages at key stages: loading model, detecting duration, generating, saving output +- Keep it simple -- print statements, no progress bar library needed + + + python -c "import ast; ast.parse(open('hum2inst.py').read()); print('SYNTAX OK')" && grep -q "argparse" hum2inst.py && grep -q "generate_music" hum2inst.py && grep -q "cover" hum2inst.py && grep -q "instrument" hum2inst.py && grep -q "torchaudio" hum2inst.py && echo "PASS" + Review script structure: argparse setup, CUDA check, ACE-Step init, generation call, output rename, error handling all present + + hum2inst.py exists at project root with valid Python syntax, contains argparse CLI with --instrument/--output/--strength/--duration flags, imports ACE-Step API, uses cover mode with XL-SFT config, detects input duration, builds caption from instrument name, renames output with instrument+timestamp, checks for CUDA, detects silence, and handles errors with non-zero exit codes + + + + + +1. `hum2inst.py` exists at project root and passes Python syntax check +2. Script imports from acestep package (generate_music, AceStepHandler, etc.) +3. Script uses argparse with required --instrument flag and optional --output, --strength, --duration +4. Script checks CUDA availability before model loading +5. Old scripts archived in archive/ directory +6. Script contains cover mode configuration (task_type="cover", inference_steps=50) +7. Script contains duration detection via torchaudio +8. 
Script contains output file renaming with instrument + timestamp pattern + + + +- hum2inst.py is syntactically valid Python +- All required CLI flags are present (--instrument, --output, --strength, --duration) +- ACE-Step cover mode is configured correctly (XL-SFT, 50 steps, cover task type) +- Input duration auto-detection is implemented +- Output naming includes instrument and timestamp +- CUDA check is present +- Silence detection is present +- Error handling covers generation failure, missing input, no GPU +- Old scripts moved to archive/ + + + +After completion, create `.planning/phases/01-core-pipeline/01-01-SUMMARY.md` + diff --git a/.planning/phases/01-core-pipeline/01-02-PLAN.md b/.planning/phases/01-core-pipeline/01-02-PLAN.md new file mode 100644 index 0000000..7045799 --- /dev/null +++ b/.planning/phases/01-core-pipeline/01-02-PLAN.md @@ -0,0 +1,144 @@ +--- +phase: 01-core-pipeline +plan: 02 +type: execute +wave: 2 +depends_on: + - 01-01 +files_modified: + - hum2inst.py +autonomous: false +requirements: + - MEL-01 + - MEL-02 + - MEL-04 + - INST-01 + - INP-01 + - OUT-02 + - PIPE-01 + +must_haves: + truths: + - "Running hum2inst.py with a real humming WAV produces an output WAV file" + - "Output audio audibly follows the pitch contour of the input humming" + - "Output audio preserves the rhythmic timing of the input humming" + - "Output sounds like the specified instrument, not garbled noise" + - "Output filename contains the instrument name and a timestamp" + artifacts: + - path: "output/" + provides: "Generated instrument WAV file from test run" + key_links: + - from: "hum2inst.py" + to: "output/*.wav" + via: "end-to-end generation from humming input" + pattern: "Output saved:" +--- + + +Run the complete pipeline end-to-end with a real humming WAV file and verify the output meets quality requirements. + +Purpose: Validate that the script actually produces instrument audio that follows the hummed melody -- the core value proposition of the project. 
+Output: A verified instrument rendition WAV file and confirmation that the pipeline works. + + + +@C:/Users/jlightner/.claude/get-shit-done/workflows/execute-plan.md +@C:/Users/jlightner/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/01-core-pipeline/01-CONTEXT.md +@.planning/phases/01-core-pipeline/01-01-SUMMARY.md + + + + + + Task 1: Run end-to-end pipeline test + hum2inst.py + +Run the hum2inst.py script with a real humming WAV file from the input/ directory. Use the ace-step venv Python interpreter. + +Execute this command: +``` +ace-step/.venv/Scripts/python.exe hum2inst.py "input/humming_step_down_jl [2026-04-11 004915].wav" --instrument piano +``` + +Verify: +1. The script runs without errors +2. An output WAV file is created in ./output/ with a filename containing "piano" and a timestamp +3. The script prints status messages (loading model, detecting duration, generating, output path) +4. Exit code is 0 + +If the script fails: +- Read the error message carefully +- Fix the issue in hum2inst.py (common issues: import paths, API parameter names, missing attributes on result object) +- Re-run until it succeeds +- Document any fixes made + +After successful run, also test the --help flag to verify argparse setup: +``` +ace-step/.venv/Scripts/python.exe hum2inst.py --help +``` + + + ls output/piano_*.wav >/dev/null 2>&1 && echo "PASS" || echo "FAIL: no piano output WAV found" + Check script console output shows: duration detection, caption, generation progress, output path + + Pipeline produces a piano WAV output file from real humming input without errors + + + + Task 2: Human verification of melody fidelity and instrument quality + output/piano_*.wav + +Present the generated output to the user for listening verification. The user needs to compare the input humming with the generated instrument output. 
+ +What was built: Complete hum-to-instrument pipeline -- hum2inst.py takes a humming WAV and produces an instrument rendition via ACE-Step XL-SFT cover mode. + +How to verify: +1. Listen to the INPUT humming file: `input/humming_step_down_jl [2026-04-11 004915].wav` +2. Listen to the OUTPUT piano file in `output/` (the latest piano_*.wav file) +3. Verify these qualities: + - Does the output follow the same melody (pitch contour) as the humming? (MEL-01) + - Does the output preserve the timing/rhythm of the humming? (MEL-02) + - Does the output sound like a coherent piano performance, not garbled noise? (MEL-04) + - Is the output audibly a piano (not generic synth or noise)? (INST-01) +4. Optionally, try a second instrument: + `ace-step/.venv/Scripts/python.exe hum2inst.py "input/humming_step_down_jl [2026-04-11 004915].wav" --instrument guitar` + Listen and verify it sounds distinctly different from the piano output. + +Resume signal: Type "approved" if the output sounds good, or describe any issues with melody fidelity, instrument quality, or other problems. + + + ls output/piano_*.wav 2>/dev/null && echo "Output file exists" + Human listens and confirms melody fidelity, rhythm preservation, instrument quality, and musical coherence + + Human confirms output audio follows the hummed melody's pitch contour and rhythm, sounds like the specified instrument, and is musically coherent + + + + + +1. hum2inst.py runs successfully with real humming input +2. Output WAV file exists with correct naming convention +3. Human confirms output follows input melody contour +4. Human confirms output preserves rhythmic timing +5. Human confirms output sounds like specified instrument +6. 
Human confirms output is musically coherent + + + +- Pipeline runs end-to-end without errors on real humming input +- Output WAV file produced with instrument+timestamp filename +- Human verifies melody fidelity (pitch contour and rhythm preserved) +- Human verifies instrument quality (sounds like specified instrument) +- Human verifies musical coherence (not garbled noise) + + + +After completion, create `.planning/phases/01-core-pipeline/01-02-SUMMARY.md` +