# sonosketch/archive/midi_to_audio.py

"""
MIDI to Audio Synthesizer
=========================
Renders MIDI files to clean WAV audio using sine-wave synthesis. Pitch bend is
supported per note: each note is tuned by the bend value in effect at its onset.
Produces a clean melody signal suitable for MusicGen melody conditioning.

Usage:
  python midi_to_audio.py input.mid                    # outputs input_synth.wav
  python midi_to_audio.py input.mid -o output.wav      # custom output path
  python midi_to_audio.py input.mid --sample-rate 32000 # match MusicGen's 32kHz
"""

import argparse
import os

import mido
import numpy as np
import scipy.io.wavfile as wavfile


def midi_note_to_freq(note: int) -> float:
    """Return the equal-tempered frequency in Hz for a MIDI note number.

    Note 69 maps to 440 Hz; every 12 notes doubles (or halves) the
    frequency. Fractional note numbers are accepted by the arithmetic and
    yield the correspondingly detuned frequency.
    """
    reference_hz = 440.0
    octaves_from_reference = (note - 69) / 12.0
    return reference_hz * (2.0 ** octaves_from_reference)


def _mix_note(audio: np.ndarray, start_sample: int, end_sample: int,
              freq: float, velocity: int, sample_rate: int) -> None:
    """Add one enveloped sine tone to ``audio`` in place.

    The tone spans ``[start_sample, end_sample)``, clamped to the buffer.
    It gets a 10 ms linear attack and a 50 ms linear release, and is scaled
    by ``velocity / 127 * 0.5``. Empty or inverted spans are ignored.
    """
    end_sample = min(end_sample, len(audio))
    n_samples = end_sample - start_sample
    if n_samples <= 0:
        return

    t = np.arange(n_samples) / sample_rate
    tone = np.sin(2 * np.pi * freq * t)

    envelope = np.ones(n_samples)
    attack = min(int(0.01 * sample_rate), n_samples)   # 10ms attack
    release = min(int(0.05 * sample_rate), n_samples)  # 50ms release
    # Multiply the ramps instead of assigning them: on notes shorter than
    # attack + release the two ramps overlap, and assigning would let the
    # release overwrite the attack so the note starts at full level (a click).
    if attack > 0:
        envelope[:attack] *= np.linspace(0, 1, attack)
    if release > 0:
        envelope[-release:] *= np.linspace(1, 0, release)

    amplitude = velocity / 127.0 * 0.5
    tone *= envelope * amplitude
    audio[start_sample:end_sample] += tone


def render_midi(midi_path: str, sample_rate: int = 32000,
                pitch_bend_range: float = 2.0) -> np.ndarray:
    """Render a MIDI file to mono audio using sine synthesis with pitch bend.

    All tracks are merged into a single time-ordered stream, so tempo
    changes apply to every track. Notes and pitch-bend state are tracked
    per channel. Each note is rendered as a fixed-frequency sine tuned by
    the pitch-bend value in effect at its onset; bends during a held note
    are not followed.

    Args:
        midi_path: Path of the MIDI file to read.
        sample_rate: Output sample rate in Hz.
        pitch_bend_range: Semitones corresponding to full pitch-wheel
            deflection (default 2).

    Returns:
        A float32 array, peak-normalized to 0.9, with trailing silence
        trimmed to at most one second after the last audible sample.

    NOTE(review): relies on ``mid.length`` and ``mido.merge_tracks``, which
    presumably reject type 2 (asynchronous) files - confirm against mido.
    """
    mid = mido.MidiFile(midi_path)

    # Size the output buffer from the file's total duration.
    total_seconds = mid.length
    total_samples = int(total_seconds * sample_rate) + sample_rate  # +1s padding
    audio = np.zeros(total_samples, dtype=np.float64)

    current_time = 0.0  # in seconds
    tempo = 500000  # default 120 BPM (microseconds per beat)
    active_notes = {}  # (channel, note) -> (start_sample, velocity, bend_at_onset)
    channel_bend = {}  # channel -> pitch wheel value (-8192 to 8191)

    def finish_note(key, end_sample):
        """Render the sounding note stored under ``key`` up to ``end_sample``."""
        start_sample, velocity, start_bend = active_notes.pop(key)
        bend_semitones = (start_bend / 8192.0) * pitch_bend_range
        freq = midi_note_to_freq(key[1] + bend_semitones)
        _mix_note(audio, start_sample, end_sample, freq, velocity, sample_rate)

    # Walk one merged stream rather than each track separately: tempo events
    # usually live in a single track, and timing every track against its own
    # default tempo would misplace the notes of all other tracks.
    for msg in mido.merge_tracks(mid.tracks):
        # Advance time using the tempo in effect before this message.
        if msg.time > 0:
            current_time += mido.tick2second(msg.time, mid.ticks_per_beat, tempo)

        if msg.type == 'set_tempo':
            tempo = msg.tempo

        elif msg.type == 'pitchwheel':
            channel_bend[msg.channel] = msg.pitch

        elif msg.type == 'note_on' and msg.velocity > 0:
            key = (msg.channel, msg.note)
            sample_pos = int(current_time * sample_rate)
            # A re-trigger ends the note already sounding instead of
            # silently discarding it.
            if key in active_notes:
                finish_note(key, sample_pos)
            active_notes[key] = (sample_pos, msg.velocity,
                                 channel_bend.get(msg.channel, 0))

        elif msg.type == 'note_off' or msg.type == 'note_on':
            # note_on with velocity 0 is treated as note_off.
            key = (msg.channel, msg.note)
            if key in active_notes:
                finish_note(key, int(current_time * sample_rate))

    # Notes that never received a note-off are held until the end of the file.
    final_sample = int(current_time * sample_rate)
    for key in list(active_notes):
        finish_note(key, final_sample)

    # Normalize to a 0.9 peak (skipped for an all-silent render).
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 0.9

    # Trim trailing silence, keeping up to one second of tail.
    nonzero = np.nonzero(np.abs(audio) > 0.001)[0]
    if len(nonzero) > 0:
        end = min(nonzero[-1] + sample_rate, len(audio))  # +1s tail
        audio = audio[:end]

    return audio.astype(np.float32)


def main():
    """Command-line entry point: render a MIDI file and save it as a WAV.

    Parses the input path, optional output path and sample rate, prints a
    short summary of the MIDI file (duration, track count, note count),
    renders it with ``render_midi`` and writes the result. When no output
    path is given, ``<input stem>_synth.wav`` is used.
    """
    cli = argparse.ArgumentParser(description="Render MIDI to clean audio")
    cli.add_argument("input", help="Input MIDI file")
    cli.add_argument("-o", "--output", help="Output WAV path")
    cli.add_argument(
        "--sample-rate", "-sr", type=int, default=32000,
        help="Sample rate (default: 32000, matches MusicGen)",
    )
    opts = cli.parse_args()
    midi_path = opts.input
    rate = opts.sample_rate

    # Summarize the input before rendering.
    print(f"Reading MIDI: {midi_path}")
    midi_file = mido.MidiFile(midi_path)
    print(f"  Duration: {midi_file.length:.1f}s")
    print(f"  Tracks: {len(midi_file.tracks)}")

    # Only note_on events with a positive velocity start a note.
    n_notes = 0
    for trk in midi_file.tracks:
        for event in trk:
            if event.type == 'note_on' and event.velocity > 0:
                n_notes += 1
    print(f"  Notes: {n_notes}")

    print(f"Rendering at {rate}Hz...")
    samples = render_midi(midi_path, rate)
    print(f"  Output duration: {len(samples) / rate:.1f}s")

    # Fall back to "<input stem>_synth.wav" when no output path was given.
    wav_path = opts.output
    if not wav_path:
        stem, _ext = os.path.splitext(midi_path)
        wav_path = f"{stem}_synth.wav"

    # Ensure the destination directory exists ("." for a bare filename).
    parent_dir = os.path.dirname(wav_path)
    if not parent_dir:
        parent_dir = "."
    os.makedirs(parent_dir, exist_ok=True)

    wavfile.write(wav_path, rate, samples)
    print(f"Saved: {wav_path}")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()