chore(01-01): archive experimental scripts

- Move midi_to_audio.py to archive/ - Move musicgen_melody.py to archive/
2026-04-11 02:10:59 -05:00 · 2026-04-11 02:10:59 -05:00 · 262ee6f7d1
commit 262ee6f7d1
parent b9cc6121f3
2 changed files with 306 additions and 0 deletions
--- a/archive/midi_to_audio.py
+++ b/archive/midi_to_audio.py
@ -0,0 +1,142 @@
+"""
+MIDI to Audio Synthesizer
+=========================
+Renders MIDI files to clean WAV audio using sine-wave synthesis with pitch bend support.
+Produces a clean melody signal suitable for MusicGen melody conditioning.
+
+Usage:
+  python midi_to_audio.py input.mid                    # outputs input_synth.wav
+  python midi_to_audio.py input.mid -o output.wav      # custom output path
+  python midi_to_audio.py input.mid --sample-rate 32000 # match MusicGen's 32kHz
+"""
+
+import argparse
+import os
+
+import mido
+import numpy as np
+import scipy.io.wavfile as wavfile
+
+
+def midi_note_to_freq(note: int) -> float:
+    """Return the equal-tempered frequency in Hz for a MIDI note number.
+
+    Note 69 (A4) is the 440 Hz reference and every 12 notes span one octave.
+    The arithmetic also works for fractional note numbers.
+    """
+    octaves_from_a4 = (note - 69) / 12.0
+    return 440.0 * (2.0 ** octaves_from_a4)
+
+
+def _tempo_segments(mid):
+    """Build a file-wide tempo map as a list of ``(tick, seconds, tempo)``.
+
+    Each entry says: from absolute tick ``tick`` (which falls at ``seconds``
+    from the start) the tempo is ``tempo`` microseconds per beat. The first
+    entry is always the 120 BPM default at tick 0.
+    """
+    changes = []
+    for track in mid.tracks:
+        tick = 0
+        for msg in track:
+            tick += msg.time
+            if msg.type == 'set_tempo':
+                changes.append((tick, msg.tempo))
+    # list.sort is stable, so changes on the same tick keep file order and
+    # the last one ends up as the effective tempo.
+    changes.sort(key=lambda change: change[0])
+
+    segments = [(0, 0.0, 500000)]  # default 120 BPM
+    for tick, tempo in changes:
+        prev_tick, prev_seconds, prev_tempo = segments[-1]
+        seconds = prev_seconds + mido.tick2second(
+            tick - prev_tick, mid.ticks_per_beat, prev_tempo)
+        segments.append((tick, seconds, tempo))
+    return segments
+
+
+def _mix_note(audio, sample_rate, note, bend_semitones, velocity,
+              start_sample, end_sample):
+    """Synthesize one enveloped sine note and add it into ``audio`` in place."""
+    n_samples = end_sample - start_sample
+    # Nothing to render for zero-length notes or notes starting past the buffer.
+    if n_samples <= 0 or start_sample >= len(audio):
+        return
+
+    freq = midi_note_to_freq(note + bend_semitones)
+    t = np.arange(n_samples) / sample_rate
+    tone = np.sin(2 * np.pi * freq * t)
+
+    # Attack/release envelope. The ramps are multiplied in rather than
+    # assigned so that on very short notes the release ramp does not erase
+    # the attack ramp (which would make the note start at full level).
+    envelope = np.ones(n_samples)
+    attack = min(int(0.01 * sample_rate), n_samples)  # 10ms attack
+    release = min(int(0.05 * sample_rate), n_samples)  # 50ms release
+    if attack > 0:
+        envelope[:attack] *= np.linspace(0, 1, attack)
+    if release > 0:
+        envelope[-release:] *= np.linspace(1, 0, release)
+
+    # Scale by velocity
+    amplitude = velocity / 127.0 * 0.5
+    tone *= envelope * amplitude
+
+    # Mix into output, clipping the note to the end of the buffer
+    end_idx = min(end_sample, len(audio))
+    audio[start_sample:end_idx] += tone[:end_idx - start_sample]
+
+
+def render_midi(midi_path: str, sample_rate: int = 32000) -> np.ndarray:
+    """Render a MIDI file to audio using sine synthesis with pitch bend.
+
+    Tempo changes from every track are combined into one tempo map that is
+    applied to all tracks, so files that keep ``set_tempo`` events in a
+    separate track are timed correctly. Pitch bend and sounding notes are
+    tracked per MIDI channel. Each note uses the bend value in effect at its
+    note-on for its whole length (a simplification).
+
+    Args:
+        midi_path: Path of the MIDI file to read.
+        sample_rate: Output sample rate in Hz.
+
+    Returns:
+        Mono float32 samples, peak-normalized to 0.9, with trailing silence
+        trimmed to at most about one second.
+
+    NOTE(review): the buffer length comes from ``mid.length``, which assumes
+    a file whose tracks play simultaneously; asynchronous (type 2) files are
+    presumably rejected there - confirm against the mido documentation.
+    """
+    mid = mido.MidiFile(midi_path)
+
+    # Calculate total duration
+    total_seconds = mid.length
+    total_samples = int(total_seconds * sample_rate) + sample_rate  # +1s padding
+    audio = np.zeros(total_samples, dtype=np.float64)
+
+    segments = _tempo_segments(mid)
+    pitch_bend_range = 2  # semitones (standard GM default)
+
+    # Process each track against the shared tempo map
+    for track in mid.tracks:
+        abs_tick = 0
+        seg_idx = 0
+        current_time = 0.0  # in seconds
+        active_notes = {}  # (channel, note) -> (start_sample, velocity, bend at note-on)
+        channel_bend = {}  # channel -> latest pitch wheel value (-8192 to 8191)
+
+        for msg in track:
+            # Advance time: ticks only grow within a track, so the tempo
+            # segment index only ever moves forward.
+            abs_tick += msg.time
+            while seg_idx + 1 < len(segments) and segments[seg_idx + 1][0] <= abs_tick:
+                seg_idx += 1
+            seg_tick, seg_seconds, seg_tempo = segments[seg_idx]
+            current_time = seg_seconds + mido.tick2second(
+                abs_tick - seg_tick, mid.ticks_per_beat, seg_tempo)
+
+            if msg.type == 'pitchwheel':
+                channel_bend[msg.channel] = msg.pitch
+
+            elif msg.type == 'note_on' and msg.velocity > 0:
+                sample_pos = int(current_time * sample_rate)
+                active_notes[(msg.channel, msg.note)] = (
+                    sample_pos, msg.velocity, channel_bend.get(msg.channel, 0))
+
+            elif msg.type == 'note_off' or msg.type == 'note_on':
+                # A note_on reaching this branch has velocity 0, i.e. note-off.
+                started = active_notes.pop((msg.channel, msg.note), None)
+                if started is not None:
+                    start_sample, velocity, start_bend = started
+                    bend_semitones = (start_bend / 8192.0) * pitch_bend_range
+                    _mix_note(audio, sample_rate, msg.note, bend_semitones, velocity,
+                              start_sample, int(current_time * sample_rate))
+
+        # Notes that never received a note-off end where the track ends
+        # instead of being dropped.
+        track_end_sample = int(current_time * sample_rate)
+        for (_, note), (start_sample, velocity, start_bend) in active_notes.items():
+            bend_semitones = (start_bend / 8192.0) * pitch_bend_range
+            _mix_note(audio, sample_rate, note, bend_semitones, velocity,
+                      start_sample, track_end_sample)
+
+    # Normalize
+    peak = np.max(np.abs(audio))
+    if peak > 0:
+        audio = audio / peak * 0.9
+
+    # Trim trailing silence
+    nonzero = np.nonzero(np.abs(audio) > 0.001)[0]
+    if len(nonzero) > 0:
+        end = min(nonzero[-1] + sample_rate, len(audio))  # +1s tail
+        audio = audio[:end]
+
+    return audio.astype(np.float32)
+
+
+def main():
+    """Command-line entry point: summarize a MIDI file, render it, save a WAV.
+
+    When no output path is given, the WAV is written next to the input as
+    ``<input stem>_synth.wav``.
+    """
+    cli = argparse.ArgumentParser(description="Render MIDI to clean audio")
+    cli.add_argument("input", help="Input MIDI file")
+    cli.add_argument("-o", "--output", help="Output WAV path")
+    cli.add_argument("--sample-rate", "-sr", type=int, default=32000,
+                     help="Sample rate (default: 32000, matches MusicGen)")
+    opts = cli.parse_args()
+
+    # Print a short summary of the input before rendering.
+    print(f"Reading MIDI: {opts.input}")
+    midi_file = mido.MidiFile(opts.input)
+    print(f"  Duration: {midi_file.length:.1f}s")
+    print(f"  Tracks: {len(midi_file.tracks)}")
+
+    # Only note-ons with a non-zero velocity count as notes.
+    note_count = 0
+    for track in midi_file.tracks:
+        for msg in track:
+            if msg.type == 'note_on' and msg.velocity > 0:
+                note_count += 1
+    print(f"  Notes: {note_count}")
+
+    print(f"Rendering at {opts.sample_rate}Hz...")
+    rendered = render_midi(opts.input, opts.sample_rate)
+    print(f"  Output duration: {len(rendered) / opts.sample_rate:.1f}s")
+
+    output_path = opts.output
+    if not output_path:
+        stem = os.path.splitext(opts.input)[0]
+        output_path = f"{stem}_synth.wav"
+
+    # A bare filename has an empty dirname; fall back to the current directory.
+    target_dir = os.path.dirname(output_path) or "."
+    os.makedirs(target_dir, exist_ok=True)
+    wavfile.write(output_path, opts.sample_rate, rendered)
+    print(f"Saved: {output_path}")
+
+
+if __name__ == "__main__":
+    main()
--- a/archive/musicgen_melody.py
+++ b/archive/musicgen_melody.py
@ -0,0 +1,164 @@
+"""
+MusicGen Melody Large - Hum to Instrument
+==========================================
+Feed it a WAV/MP3 of you humming + a text prompt describing the instrument,
+and it outputs that melody played on the described instrument.
+
+Usage:
+  python musicgen_melody.py --input hum.wav --prompt "solo acoustic piano, gentle and warm"
+  python musicgen_melody.py --input hum.wav --prompt "solo electric guitar, jazz improvisation" --duration 20
+  python musicgen_melody.py --input hum.wav --prompt "solo saxophone, smooth jazz" --output sax_output.wav
+
+Without --input, generates from text prompt only (no melody conditioning).
+"""
+
+import argparse
+import os
+import sys
+import time
+
+import numpy as np
+import torch
+import torchaudio
+from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration
+
+
+def load_audio(path: str, target_sr: int = 32000):
+    """Load an audio file as a single-channel waveform at ``target_sr``.
+
+    The first dimension of the loaded tensor is treated as the channel axis:
+    when it holds more than one channel, the channels are averaged into one
+    (keeping the dimension). The waveform is resampled only when the file's
+    sample rate differs from ``target_sr``.
+
+    Returns:
+        A ``(waveform, target_sr)`` tuple.
+    """
+    audio, source_sr = torchaudio.load(path)
+
+    # Down-mix multi-channel audio by averaging across channels.
+    if audio.shape[0] > 1:
+        audio = audio.mean(dim=0, keepdim=True)
+
+    if source_sr == target_sr:
+        return audio, target_sr
+
+    to_target_rate = torchaudio.transforms.Resample(source_sr, target_sr)
+    return to_target_rate(audio), target_sr
+
+
+def main():
+    """Command-line entry point: generate audio with MusicGen Melody Large.
+
+    Parses the CLI options, loads the model, optionally conditions generation
+    on an input melody recording, samples at most 30 seconds of audio and
+    saves the first generated channel as a WAV file.
+
+    Without ``--output`` the file is written to ``output/musicgen/`` next to
+    this script, named from a timestamp and the start of the prompt.
+    """
+    parser = argparse.ArgumentParser(description="MusicGen Melody - Hum to Instrument")
+    parser.add_argument("--input", "-i", type=str, default=None,
+                        help="Path to input audio (WAV/MP3) of humming/melody (8-30 seconds)")
+    parser.add_argument("--prompt", "-p", type=str, required=True,
+                        help="Text prompt describing desired instrument and style")
+    parser.add_argument("--duration", "-d", type=int, default=None,
+                        help="Output duration in seconds (default: match input length, max 30)")
+    parser.add_argument("--output", "-o", type=str, default=None,
+                        help="Output WAV path (default: auto-generated in output/musicgen/)")
+    parser.add_argument("--guidance", "-g", type=float, default=3.0,
+                        help="Classifier-free guidance scale (default: 3.0, range 1-5)")
+    parser.add_argument("--top-k", type=int, default=250,
+                        help="Top-k sampling (default: 250)")
+    parser.add_argument("--top-p", type=float, default=0.0,
+                        help="Top-p nucleus sampling (default: 0.0 = disabled)")
+    parser.add_argument("--temperature", "-t", type=float, default=1.0,
+                        help="Sampling temperature (default: 1.0)")
+    parser.add_argument("--seed", "-s", type=int, default=-1,
+                        help="Random seed (-1 = random)")
+    args = parser.parse_args()
+
+    # Fail fast, before the expensive model load: a negative duration would
+    # become a negative max_new_tokens, and 0 used to silently mean "default".
+    if args.duration is not None and args.duration <= 0:
+        parser.error("--duration must be a positive number of seconds")
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Device: {device}")
+    if device == "cuda":
+        print(f"GPU: {torch.cuda.get_device_name(0)}")
+
+    # Load model
+    print("Loading MusicGen Melody Large...")
+    t0 = time.time()
+    processor = AutoProcessor.from_pretrained("facebook/musicgen-melody-large")
+    model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody-large")
+    model = model.to(device)
+    print(f"Model loaded in {time.time() - t0:.1f}s")
+    if device == "cuda":
+        # Only query CUDA memory when the model actually lives on a GPU.
+        print(f"VRAM used: {torch.cuda.memory_allocated() / 1e9:.1f} GB")
+
+    # Determine duration
+    sample_rate = model.config.audio_encoder.sampling_rate  # 32000
+    max_duration = 30
+    requested = args.duration if args.duration is not None else max_duration
+    duration = min(requested, max_duration)
+
+    # Prepare inputs
+    if args.input:
+        print(f"Loading input audio: {args.input}")
+        waveform, sr = load_audio(args.input, target_sr=sample_rate)
+        input_duration = waveform.shape[1] / sr
+        print(f"Input duration: {input_duration:.1f}s")
+
+        if input_duration > max_duration:
+            print(f"Warning: Input longer than {max_duration}s, truncating.")
+            waveform = waveform[:, :max_duration * sr]
+            input_duration = max_duration
+
+        if args.duration is None:
+            duration = min(int(input_duration) + 2, max_duration)  # slight padding
+            print(f"Auto duration: {duration}s (input + 2s padding)")
+
+        audio_np = waveform.squeeze(0).numpy().astype(np.float32)
+        inputs = processor(
+            audio=audio_np,
+            sampling_rate=sample_rate,
+            text=[args.prompt],
+            padding=True,
+            return_tensors="pt",
+        ).to(device)
+        print(f"Melody conditioning active: input_features shape = {inputs['input_features'].shape}")
+    else:
+        print("No input audio - generating from text prompt only.")
+        inputs = processor(
+            text=[args.prompt],
+            padding=True,
+            return_tensors="pt",
+        ).to(device)
+
+    # Calculate max new tokens from duration
+    # MusicGen generates at ~50 tokens per second of audio
+    max_new_tokens = int(duration * 50)
+
+    # Seed right before generation (after model loading) so conditioning is not overridden
+    if args.seed >= 0:
+        torch.manual_seed(args.seed)
+        if device == "cuda":
+            torch.cuda.manual_seed(args.seed)
+        print(f"Seed: {args.seed}")
+
+    print(f"\nGenerating {duration}s of audio...")
+    print(f"  Prompt: {args.prompt}")
+    print(f"  Guidance: {args.guidance}")
+    print(f"  Temperature: {args.temperature}")
+    print(f"  Top-k: {args.top_k}")
+
+    t0 = time.time()
+    with torch.inference_mode():
+        audio_values = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            guidance_scale=args.guidance,
+            do_sample=True,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p if args.top_p > 0 else None,
+        )
+    gen_time = time.time() - t0
+    print(f"Generated in {gen_time:.1f}s (RTF: {gen_time/duration:.2f}x)")
+
+    # Save output
+    audio = audio_values[0, 0].cpu()  # [samples]
+
+    if args.output:
+        output_path = args.output
+    else:
+        output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "output", "musicgen")
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        prompt_part = args.prompt[:40].replace(" ", "_").replace(",", "")
+        # Path separators, colons, quotes, etc. from the prompt must not end
+        # up in the filename; ordinary prompts come out unchanged.
+        safe_prompt = "".join(
+            ch if ch.isalnum() or ch in "_-." else "_" for ch in prompt_part)
+        output_path = os.path.join(output_dir, f"{timestamp}_{safe_prompt}.wav")
+
+    # Create the directory of the file actually being written (also for
+    # --output), instead of always creating the default directory.
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+    torchaudio.save(output_path, audio.unsqueeze(0), sample_rate)
+    print(f"\nSaved: {output_path}")
+    print(f"Duration: {audio.shape[0] / sample_rate:.1f}s @ {sample_rate}Hz")
+
+
+if __name__ == "__main__":
+    main()