diff --git a/archive/midi_to_audio.py b/archive/midi_to_audio.py
new file mode 100644
index 0000000..e2fa850
--- /dev/null
+++ b/archive/midi_to_audio.py
@@ -0,0 +1,176 @@
+"""
+MIDI to Audio Synthesizer
+=========================
+Renders MIDI files to clean WAV audio using sine-wave synthesis with pitch bend support.
+Produces a clean melody signal suitable for MusicGen melody conditioning.
+
+Usage:
+    python midi_to_audio.py input.mid                      # outputs input_synth.wav
+    python midi_to_audio.py input.mid -o output.wav        # custom output path
+    python midi_to_audio.py input.mid --sample-rate 32000  # match MusicGen's 32kHz
+"""
+
+import argparse
+import os
+
+import mido
+import numpy as np
+import scipy.io.wavfile as wavfile
+
+PITCH_BEND_RANGE = 2.0  # semitones (standard GM default)
+
+
+def midi_note_to_freq(note: float) -> float:
+    """Convert MIDI note number to frequency in Hz.
+
+    Note 69 maps to 440 Hz. Fractional notes are accepted so a pitch-bend
+    offset in semitones can be added to the note number.
+    """
+    return 440.0 * (2.0 ** ((note - 69) / 12.0))
+
+
+def _mix_note(audio, start_sample, end_sample, note, velocity, bend, sample_rate):
+    """Add one enveloped sine tone for a finished note into ``audio`` in place."""
+    # Clamp to the buffer so a note can never write past its end.
+    end_sample = min(end_sample, len(audio))
+    n_samples = end_sample - start_sample
+    if n_samples <= 0:
+        return
+
+    # Pitch bend is sampled once, at note start (a simplification).
+    bend_semitones = (bend / 8192.0) * PITCH_BEND_RANGE
+    freq = midi_note_to_freq(note + bend_semitones)
+    t = np.arange(n_samples) / sample_rate
+    tone = np.sin(2 * np.pi * freq * t)
+
+    # 10 ms attack and 50 ms release. The ramps are multiplied in rather
+    # than assigned, so on notes shorter than the release the fade-out
+    # cannot overwrite the fade-in and leave a click at the onset.
+    envelope = np.ones(n_samples)
+    attack = min(int(0.01 * sample_rate), n_samples)
+    release = min(int(0.05 * sample_rate), n_samples)
+    if attack > 0:
+        envelope[:attack] *= np.linspace(0, 1, attack)
+    if release > 0:
+        envelope[-release:] *= np.linspace(1, 0, release)
+
+    # Scale by velocity
+    amplitude = velocity / 127.0 * 0.5
+    audio[start_sample:end_sample] += tone * envelope * amplitude
+
+
+def render_midi(midi_path: str, sample_rate: int = 32000) -> np.ndarray:
+    """Render a MIDI file to audio using sine synthesis with pitch bend.
+
+    Messages are consumed in merged playback order: iterating a
+    ``mido.MidiFile`` yields the messages of all tracks with ``time``
+    converted to delta seconds using the tempo map. A tempo change stored
+    in one track therefore applies to the notes of every track, and note
+    positions agree with ``mid.length``, which sizes the output buffer.
+
+    Args:
+        midi_path: Path of the MIDI file to render.
+        sample_rate: Output sample rate in Hz; must be positive.
+
+    Returns:
+        A 1-D float32 array, peak-normalised to 0.9, with trailing silence
+        trimmed to at most one second after the last audible sample.
+
+    Raises:
+        ValueError: If ``sample_rate`` is not positive.
+    """
+    if sample_rate <= 0:
+        raise ValueError(f"sample_rate must be positive, got {sample_rate}")
+
+    mid = mido.MidiFile(midi_path)
+
+    # Size the buffer from the file length plus one second of padding.
+    total_samples = int(mid.length * sample_rate) + sample_rate
+    audio = np.zeros(total_samples, dtype=np.float64)
+
+    current_time = 0.0  # seconds since the start of the file
+    active_notes = {}  # (channel, note) -> (start_sample, velocity, bend)
+    pitch_bend = {}  # channel -> latest pitchwheel value (-8192 to 8191)
+
+    for msg in mid:
+        current_time += msg.time
+
+        if msg.type == 'pitchwheel':
+            pitch_bend[msg.channel] = msg.pitch
+            continue
+        if msg.type not in ('note_on', 'note_off'):
+            continue
+
+        key = (msg.channel, msg.note)
+        sample_pos = int(current_time * sample_rate)
+
+        # Any note event ends a note already sounding on this key. That
+        # covers note_off, note_on with velocity 0, and a re-trigger.
+        if key in active_notes:
+            start_sample, velocity, bend = active_notes.pop(key)
+            _mix_note(audio, start_sample, sample_pos, msg.note,
+                      velocity, bend, sample_rate)
+
+        if msg.type == 'note_on' and msg.velocity > 0:
+            bend = pitch_bend.get(msg.channel, 0)
+            active_notes[key] = (sample_pos, msg.velocity, bend)
+
+    # Notes that were never released are held until the last message
+    # instead of being silently dropped.
+    last_sample = int(current_time * sample_rate)
+    for (_, note), (start_sample, velocity, bend) in active_notes.items():
+        _mix_note(audio, start_sample, last_sample, note,
+                  velocity, bend, sample_rate)
+
+    # Normalize
+    peak = np.max(np.abs(audio))
+    if peak > 0:
+        audio = audio / peak * 0.9
+
+    # Trim trailing silence
+    nonzero = np.nonzero(np.abs(audio) > 0.001)[0]
+    if len(nonzero) > 0:
+        end = min(nonzero[-1] + sample_rate, len(audio))  # +1s tail
+        audio = audio[:end]
+
+    return audio.astype(np.float32)
+
+
+def main():
+    """Parse command-line arguments, render the MIDI file, write the WAV."""
+    parser = argparse.ArgumentParser(description="Render MIDI to clean audio")
+    parser.add_argument("input", help="Input MIDI file")
+    parser.add_argument("-o", "--output", help="Output WAV path")
+    parser.add_argument("--sample-rate", "-sr", type=int, default=32000,
+                        help="Sample rate (default: 32000, matches MusicGen)")
+    args = parser.parse_args()
+    if args.sample_rate <= 0:
+        parser.error("--sample-rate must be a positive integer")
+
+    print(f"Reading MIDI: {args.input}")
+    mid = mido.MidiFile(args.input)
+    print(f" Duration: {mid.length:.1f}s")
+    print(f" Tracks: {len(mid.tracks)}")
+
+    # Count notes
+    note_count = sum(1 for track in mid.tracks for msg in track
+                     if msg.type == 'note_on' and msg.velocity > 0)
+    print(f" Notes: {note_count}")
+
+    print(f"Rendering at {args.sample_rate}Hz...")
+    audio = render_midi(args.input, args.sample_rate)
+    print(f" Output duration: {len(audio) / args.sample_rate:.1f}s")
+
+    if args.output:
+        output_path = args.output
+    else:
+        base = os.path.splitext(args.input)[0]
+        output_path = f"{base}_synth.wav"
+
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+    wavfile.write(output_path, args.sample_rate, audio)
+    print(f"Saved: {output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/archive/musicgen_melody.py b/archive/musicgen_melody.py
new file mode 100644
index 0000000..25f373e
--- /dev/null
+++ b/archive/musicgen_melody.py
@@ -0,0 +1,164 @@
+"""
+MusicGen Melody Large - Hum to Instrument
+==========================================
+Feed it a WAV/MP3 of you humming + a text prompt describing the instrument,
+and it outputs that melody played on the described instrument.
+
+Usage:
+    python musicgen_melody.py --input hum.wav --prompt "solo acoustic piano, gentle and warm"
+    python musicgen_melody.py --input hum.wav --prompt "solo electric guitar, jazz improvisation" --duration 20
+    python musicgen_melody.py --input hum.wav --prompt "solo saxophone, smooth jazz" --output sax_output.wav
+
+Without --input, generates from text prompt only (no melody conditioning).
+"""
+
+import argparse
+import os
+import sys
+import time
+
+import numpy as np
+import torch
+import torchaudio
+from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration
+
+
+def load_audio(path: str, target_sr: int = 32000):
+    """Load audio file, downmix to mono and resample to target sample rate."""
+    waveform, sr = torchaudio.load(path)
+    # Convert to mono if stereo
+    if waveform.shape[0] > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+    # Resample if needed
+    if sr != target_sr:
+        waveform = torchaudio.transforms.Resample(sr, target_sr)(waveform)
+    return waveform, target_sr
+
+
+def main():
+    """Run MusicGen Melody from the command line and save the result as WAV."""
+    parser = argparse.ArgumentParser(description="MusicGen Melody - Hum to Instrument")
+    parser.add_argument("--input", "-i", type=str, default=None,
+                        help="Path to input audio (WAV/MP3) of humming/melody (8-30 seconds)")
+    parser.add_argument("--prompt", "-p", type=str, required=True,
+                        help="Text prompt describing desired instrument and style")
+    parser.add_argument("--duration", "-d", type=int, default=None,
+                        help="Output duration in seconds (default: match input length, max 30)")
+    parser.add_argument("--output", "-o", type=str, default=None,
+                        help="Output WAV path (default: auto-generated in output/musicgen/)")
+    parser.add_argument("--guidance", "-g", type=float, default=3.0,
+                        help="Classifier-free guidance scale (default: 3.0, range 1-5)")
+    parser.add_argument("--top-k", type=int, default=250,
+                        help="Top-k sampling (default: 250)")
+    parser.add_argument("--top-p", type=float, default=0.0,
+                        help="Top-p nucleus sampling (default: 0.0 = disabled)")
+    parser.add_argument("--temperature", "-t", type=float, default=1.0,
+                        help="Sampling temperature (default: 1.0)")
+    parser.add_argument("--seed", "-s", type=int, default=-1,
+                        help="Random seed (-1 = random)")
+    args = parser.parse_args()
+    if args.duration is not None and args.duration <= 0:
+        parser.error("--duration must be a positive number of seconds")
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Device: {device}")
+    if device == "cuda":
+        print(f"GPU: {torch.cuda.get_device_name(0)}")
+
+    # Load model
+    print("Loading MusicGen Melody Large...")
+    t0 = time.time()
+    processor = AutoProcessor.from_pretrained("facebook/musicgen-melody-large")
+    model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody-large")
+    model = model.to(device)
+    print(f"Model loaded in {time.time() - t0:.1f}s")
+    if device == "cuda":  # memory_allocated() only reports GPU memory
+        print(f"VRAM used: {torch.cuda.memory_allocated() / 1e9:.1f} GB")
+
+    # Determine duration
+    sample_rate = model.config.audio_encoder.sampling_rate  # 32000
+    max_duration = 30
+    duration = min(args.duration or max_duration, max_duration)
+
+    # Prepare inputs
+    if args.input:
+        print(f"Loading input audio: {args.input}")
+        waveform, sr = load_audio(args.input, target_sr=sample_rate)
+        input_duration = waveform.shape[1] / sr
+        print(f"Input duration: {input_duration:.1f}s")
+
+        if input_duration > max_duration:
+            print(f"Warning: Input longer than {max_duration}s, truncating.")
+            waveform = waveform[:, :max_duration * sr]
+            input_duration = max_duration
+
+        if args.duration is None:
+            duration = min(int(input_duration) + 2, max_duration)  # slight padding
+            print(f"Auto duration: {duration}s (input + 2s padding)")
+
+        audio_np = waveform.squeeze(0).numpy().astype(np.float32)
+        inputs = processor(
+            audio=audio_np,
+            sampling_rate=sample_rate,
+            text=[args.prompt],
+            padding=True,
+            return_tensors="pt",
+        ).to(device)
+        print(f"Melody conditioning active: input_features shape = {inputs['input_features'].shape}")
+    else:
+        print("No input audio - generating from text prompt only.")
+        inputs = processor(
+            text=[args.prompt],
+            padding=True,
+            return_tensors="pt",
+        ).to(device)
+
+    # Calculate max new tokens: MusicGen generates at ~50 tokens per second of audio
+    max_new_tokens = int(duration * 50)
+
+    # Seed right before generation (after model loading) so conditioning is not overridden
+    if args.seed >= 0:
+        torch.manual_seed(args.seed)  # seeds the CPU and all CUDA generators
+        print(f"Seed: {args.seed}")
+
+    print(f"\nGenerating {duration}s of audio...")
+    print(f" Prompt: {args.prompt}")
+    print(f" Guidance: {args.guidance}")
+    print(f" Temperature: {args.temperature}")
+    print(f" Top-k: {args.top_k}")
+
+    t0 = time.time()
+    with torch.inference_mode():
+        audio_values = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            guidance_scale=args.guidance,
+            do_sample=True,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p if args.top_p > 0 else None,
+        )
+    gen_time = time.time() - t0
+    print(f"Generated in {gen_time:.1f}s (RTF: {gen_time/duration:.2f}x)")
+
+    # Save output
+    audio = audio_values[0, 0].cpu()  # [samples]
+
+    if args.output:
+        output_path = args.output
+    else:
+        output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "output", "musicgen")
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        safe_prompt = args.prompt[:40].replace(" ", "_").replace(",", "")
+        safe_prompt = "".join(c if c.isalnum() or c in "_-" else "_" for c in safe_prompt)
+        output_path = os.path.join(output_dir, f"{timestamp}_{safe_prompt}.wav")
+    # Create only the directory that will actually receive the file.
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+    torchaudio.save(output_path, audio.unsqueeze(0), sample_rate)
+    print(f"\nSaved: {output_path}")
+    print(f"Duration: {audio.shape[0] / sample_rate:.1f}s @ {sample_rate}Hz")
+
+
+if __name__ == "__main__":
+    main()