chore(01-01): archive experimental scripts

- Move midi_to_audio.py to archive/
- Move musicgen_melody.py to archive/
This commit is contained in:
John Lightner 2026-04-11 02:10:59 -05:00
parent b9cc6121f3
commit 262ee6f7d1
2 changed files with 306 additions and 0 deletions

142
archive/midi_to_audio.py Normal file
View file

@ -0,0 +1,142 @@
"""
MIDI to Audio Synthesizer
=========================
Renders MIDI files to clean WAV audio using sine-wave synthesis with pitch bend support.
Produces a clean melody signal suitable for MusicGen melody conditioning.
Usage:
python midi_to_audio.py input.mid # outputs input_synth.wav
python midi_to_audio.py input.mid -o output.wav # custom output path
python midi_to_audio.py input.mid --sample-rate 32000 # match MusicGen's 32kHz
"""
import argparse
import os
import mido
import numpy as np
import scipy.io.wavfile as wavfile
def midi_note_to_freq(note: int) -> float:
"""Convert MIDI note number to frequency in Hz."""
return 440.0 * (2.0 ** ((note - 69) / 12.0))
def render_midi(midi_path: str, sample_rate: int = 32000) -> np.ndarray:
"""Render a MIDI file to audio using sine synthesis with pitch bend."""
mid = mido.MidiFile(midi_path)
# Calculate total duration
total_seconds = mid.length
total_samples = int(total_seconds * sample_rate) + sample_rate # +1s padding
audio = np.zeros(total_samples, dtype=np.float64)
# Process each track
for track in mid.tracks:
current_time = 0.0 # in seconds
tempo = 500000 # default 120 BPM
active_notes = {} # note -> (start_sample, velocity)
current_pitch_bend = 0 # in MIDI pitch bend units (-8192 to 8191)
pitch_bend_range = 2 # semitones (standard GM default)
for msg in track:
# Advance time
if msg.time > 0:
delta_seconds = mido.tick2second(msg.time, mid.ticks_per_beat, tempo)
current_time += delta_seconds
if msg.type == 'set_tempo':
tempo = msg.tempo
elif msg.type == 'pitchwheel':
current_pitch_bend = msg.pitch
elif msg.type == 'note_on' and msg.velocity > 0:
sample_pos = int(current_time * sample_rate)
active_notes[msg.note] = (sample_pos, msg.velocity, current_pitch_bend)
elif msg.type == 'note_off' or (msg.type == 'note_on' and msg.velocity == 0):
if msg.note in active_notes:
start_sample, velocity, start_bend = active_notes.pop(msg.note)
end_sample = int(current_time * sample_rate)
if end_sample <= start_sample:
continue
# Average the pitch bend (simplification - uses start bend)
bend_semitones = (start_bend / 8192.0) * pitch_bend_range
freq = midi_note_to_freq(msg.note + bend_semitones)
# Generate tone
n_samples = end_sample - start_sample
t = np.arange(n_samples) / sample_rate
tone = np.sin(2 * np.pi * freq * t)
# Apply ADSR envelope
envelope = np.ones(n_samples)
attack = min(int(0.01 * sample_rate), n_samples) # 10ms attack
release = min(int(0.05 * sample_rate), n_samples) # 50ms release
if attack > 0:
envelope[:attack] = np.linspace(0, 1, attack)
if release > 0:
envelope[-release:] = np.linspace(1, 0, release)
# Scale by velocity
amplitude = velocity / 127.0 * 0.5
tone *= envelope * amplitude
# Mix into output
end_idx = min(start_sample + n_samples, len(audio))
actual_len = end_idx - start_sample
audio[start_sample:end_idx] += tone[:actual_len]
# Normalize
peak = np.max(np.abs(audio))
if peak > 0:
audio = audio / peak * 0.9
# Trim trailing silence
nonzero = np.nonzero(np.abs(audio) > 0.001)[0]
if len(nonzero) > 0:
end = min(nonzero[-1] + sample_rate, len(audio)) # +1s tail
audio = audio[:end]
return audio.astype(np.float32)
def main():
parser = argparse.ArgumentParser(description="Render MIDI to clean audio")
parser.add_argument("input", help="Input MIDI file")
parser.add_argument("-o", "--output", help="Output WAV path")
parser.add_argument("--sample-rate", "-sr", type=int, default=32000,
help="Sample rate (default: 32000, matches MusicGen)")
args = parser.parse_args()
print(f"Reading MIDI: {args.input}")
mid = mido.MidiFile(args.input)
print(f" Duration: {mid.length:.1f}s")
print(f" Tracks: {len(mid.tracks)}")
# Count notes
note_count = sum(1 for track in mid.tracks for msg in track
if msg.type == 'note_on' and msg.velocity > 0)
print(f" Notes: {note_count}")
print(f"Rendering at {args.sample_rate}Hz...")
audio = render_midi(args.input, args.sample_rate)
print(f" Output duration: {len(audio) / args.sample_rate:.1f}s")
if args.output:
output_path = args.output
else:
base = os.path.splitext(args.input)[0]
output_path = f"{base}_synth.wav"
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
wavfile.write(output_path, args.sample_rate, audio)
print(f"Saved: {output_path}")
if __name__ == "__main__":
main()

164
archive/musicgen_melody.py Normal file
View file

@ -0,0 +1,164 @@
"""
MusicGen Melody Large - Hum to Instrument
==========================================
Feed it a WAV/MP3 of you humming + a text prompt describing the instrument,
and it outputs that melody played on the described instrument.
Usage:
python musicgen_melody.py --input hum.wav --prompt "solo acoustic piano, gentle and warm"
python musicgen_melody.py --input hum.wav --prompt "solo electric guitar, jazz improvisation" --duration 20
python musicgen_melody.py --input hum.wav --prompt "solo saxophone, smooth jazz" --output sax_output.wav
Without --input, generates from text prompt only (no melody conditioning).
"""
import argparse
import os
import sys
import time
import numpy as np
import torch
import torchaudio
from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration
def load_audio(path: str, target_sr: int = 32000):
"""Load audio file and resample to target sample rate."""
waveform, sr = torchaudio.load(path)
# Convert to mono if stereo
if waveform.shape[0] > 1:
waveform = waveform.mean(dim=0, keepdim=True)
# Resample if needed
if sr != target_sr:
resampler = torchaudio.transforms.Resample(sr, target_sr)
waveform = resampler(waveform)
return waveform, target_sr
def main():
parser = argparse.ArgumentParser(description="MusicGen Melody - Hum to Instrument")
parser.add_argument("--input", "-i", type=str, default=None,
help="Path to input audio (WAV/MP3) of humming/melody (8-30 seconds)")
parser.add_argument("--prompt", "-p", type=str, required=True,
help="Text prompt describing desired instrument and style")
parser.add_argument("--duration", "-d", type=int, default=None,
help="Output duration in seconds (default: match input length, max 30)")
parser.add_argument("--output", "-o", type=str, default=None,
help="Output WAV path (default: auto-generated in output/musicgen/)")
parser.add_argument("--guidance", "-g", type=float, default=3.0,
help="Classifier-free guidance scale (default: 3.0, range 1-5)")
parser.add_argument("--top-k", type=int, default=250,
help="Top-k sampling (default: 250)")
parser.add_argument("--top-p", type=float, default=0.0,
help="Top-p nucleus sampling (default: 0.0 = disabled)")
parser.add_argument("--temperature", "-t", type=float, default=1.0,
help="Sampling temperature (default: 1.0)")
parser.add_argument("--seed", "-s", type=int, default=-1,
help="Random seed (-1 = random)")
args = parser.parse_args()
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
if device == "cuda":
print(f"GPU: {torch.cuda.get_device_name(0)}")
# Load model
print("Loading MusicGen Melody Large...")
t0 = time.time()
processor = AutoProcessor.from_pretrained("facebook/musicgen-melody-large")
model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody-large")
model = model.to(device)
print(f"Model loaded in {time.time() - t0:.1f}s")
print(f"VRAM used: {torch.cuda.memory_allocated() / 1e9:.1f} GB")
# Determine duration
sample_rate = model.config.audio_encoder.sampling_rate # 32000
max_duration = 30
duration = min(args.duration or max_duration, max_duration)
# Prepare inputs
if args.input:
print(f"Loading input audio: {args.input}")
waveform, sr = load_audio(args.input, target_sr=sample_rate)
input_duration = waveform.shape[1] / sr
print(f"Input duration: {input_duration:.1f}s")
if input_duration > max_duration:
print(f"Warning: Input longer than {max_duration}s, truncating.")
waveform = waveform[:, :max_duration * sr]
input_duration = max_duration
if args.duration is None:
duration = min(int(input_duration) + 2, max_duration) # slight padding
print(f"Auto duration: {duration}s (input + 2s padding)")
audio_np = waveform.squeeze(0).numpy().astype(np.float32)
inputs = processor(
audio=audio_np,
sampling_rate=sample_rate,
text=[args.prompt],
padding=True,
return_tensors="pt",
).to(device)
print(f"Melody conditioning active: input_features shape = {inputs['input_features'].shape}")
else:
print("No input audio - generating from text prompt only.")
inputs = processor(
text=[args.prompt],
padding=True,
return_tensors="pt",
).to(device)
# Calculate max new tokens from duration
# MusicGen generates at ~50 tokens per second of audio
max_new_tokens = int(duration * 50)
# Seed right before generation (after model loading) so conditioning is not overridden
if args.seed >= 0:
torch.manual_seed(args.seed)
if device == "cuda":
torch.cuda.manual_seed(args.seed)
print(f"Seed: {args.seed}")
print(f"\nGenerating {duration}s of audio...")
print(f" Prompt: {args.prompt}")
print(f" Guidance: {args.guidance}")
print(f" Temperature: {args.temperature}")
print(f" Top-k: {args.top_k}")
t0 = time.time()
with torch.inference_mode():
audio_values = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
guidance_scale=args.guidance,
do_sample=True,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p if args.top_p > 0 else None,
)
gen_time = time.time() - t0
print(f"Generated in {gen_time:.1f}s (RTF: {gen_time/duration:.2f}x)")
# Save output
audio = audio_values[0, 0].cpu() # [samples]
# Create output directory
output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "output", "musicgen")
os.makedirs(output_dir, exist_ok=True)
if args.output:
output_path = args.output
else:
timestamp = time.strftime("%Y%m%d_%H%M%S")
safe_prompt = args.prompt[:40].replace(" ", "_").replace(",", "")
output_path = os.path.join(output_dir, f"{timestamp}_{safe_prompt}.wav")
torchaudio.save(output_path, audio.unsqueeze(0), sample_rate)
print(f"\nSaved: {output_path}")
print(f"Duration: {audio.shape[0] / sample_rate:.1f}s @ {sample_rate}Hz")
if __name__ == "__main__":
main()