#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Batch-mode vocoder.

Extracts a spectral envelope from one sound, such as your voice, and
uses it to selectively amplify and attenuate the spectral envelope of
another sound, such as (the default) white noise.

Inputs are required to be stereo 44.1kHz .wav files, as is the output.
The output will be the same length as the modulator; the carrier will
be truncated or repeated as necessary to achieve this.
"""
import argparse
import struct   # NOTE(review): not used by any surviving code; presumably
                # used by the lost write_wav below — confirm before removing.
import wave

import numpy

# Development log kept as module data rather than comments; never read at
# runtime.
profiling_notes = """
This runs about four times slower than real-time on my laptop — four
seconds of CPU to produce each second of output. Initial profiling
suggests that currently its time (6.2 seconds modulating ks1-cd.wav
with what-hath-god-wrought.wav) is divided as follows:

- 25% in reading wav files;
- 56% in time-domain convolution, including:
  - 49% in upsampling,
  - 7% in moving_rms, and
  - 1% in low-pass filtering
  (numbers do not total due to rounding);
- 14% in writing wav files;
- 5% in other activities, including startup.

Upsampling is slow because it’s doing a time-domain convolution with a
1025-point sinc kernel, and also because it upsamples signals 88
times; reading and writing are slow because they’re done in loops per
sample in interpreted Python. It should be pretty easy to make these
all run a lot faster, but I need to read about fast and accurate
upsampling and about numpy byte-swapping. Once these bottlenecks are
removed, maybe I should work on using a smaller kernel for the
low-pass filtering.

Using numpy for writing cuts the time to 5.5 seconds. Using numpy
also for reading cuts it to 4.3 seconds. cProfile confirms that
read_wav now takes 0.05% of the run time and write_wav 0.07%, roughly
500 and 200 times faster, respectively. The new profile results are:

- 93% in convolve, of which roughly
  - 11% in moving_rms,
  - 81% in restore (upsampling), and
  - 2% in low_pass;
- 2% in things in main that aren’t convolve;
- 6% outside of main (startup).

`moving_rms` should be pretty easy to speed up using a sum table,
since it’s just box filtering. I still have to figure out the
upsampling story.

So I've switched to using the same low-pass filter for upsampling
interpolation that I’m using for band separation prior to
downsampling. The results still sound good, and now it’s down to 1.6
seconds, which is tantalizingly close to running in real time. The
new profile says:

- 17% outside of main (startup);
- 74% in convolve;
- 49% in decompose;
- 47% in low_pass, which is now being invoked 106 times since restore
  is using it;
- 42% in restore;
- 32% in moving_rms.

moving_rms probably accounts for almost half of the time in convolve.
Also, I think I can probably cut the low_pass computation time in
half for the upsampling case by using a polyphase filter. But I can
probably also use a narrower window on the kernel and it’ll probably
be fine, too.

So I fixed moving_rms to use a sum table to extract the envelope. It
now vocodes in 1.1 seconds, better than real time! On, uh, one core
of a 1.6GHz laptop. Profiling now says:

- 24% outside of main (in startup);
- 69% in decompose;
- 66% in low_pass, which is called 106 times;
- 61% in convolve (which is now being called just by low_pass);
- 60% in restore, which is called 88 times, accounting for 88 of the
  calls to low_pass.

So it’s still spending most of its time upsampling signals — 80% of
the time not in the startup code, in fact. But now it’s doing that
four times as fast as it was yesterday. It really doesn’t need to use
a 129th-order FIR filter to do zero-phase 1:2 upsampling
interpolation on a signal that was already low-pass filtered ahead of
time. It probably doesn’t need that for the band separation in the
first place, either, in fact.
"""


def read_wav(filename):
    "Returns a Numpy array of the left channel of a .wav file."
    with wave.open(filename) as wav:
        # Enforce the stereo/16-bit/44.1kHz contract stated in the module
        # docstring.
        assert wav.getnchannels() == 2, filename
        assert wav.getsampwidth() == 2, filename
        assert wav.getframerate() == 44100, filename
        nframes = wav.getparams().nframes
        samples = wav.readframes(nframes)
        # XXX note that the right channel is still stored in memory, just
        # inaccessible through this view
        #
        # NOTE(review): the file is corrupted from this point down to the
        # middle of decompose() below — everything between the "<" that
        # presumably began this dtype string (likely '<i2', the
        # little-endian 16-bit format the wave module yields — TODO
        # confirm) and some later ">" has been stripped, apparently by
        # HTML-tag sanitization.  The stripped span seems to have held
        # the rest of this return expression plus the definitions of
        # write_wav and white_noise (both called from main), low_pass
        # (named throughout profiling_notes), and the head of decompose
        # (called from vocode).  Recover the lost text from version
        # control; the fragments below are kept verbatim and do not
        # parse as-is.
        return numpy.ndarray(shape=(nframes*2,), dtype='
 2 else [low])
    # NOTE(review): orphaned tail of the lost decompose() definition;
    # from the shape of the code it appends a residual band (the input
    # minus the sum of the lower bands) and returns the band list —
    # verify once the missing head is restored.
    lower.append(signal - sum(lower))
    return lower


def box_filter(n, y):
    """Box-filter y over an n-sample window using a cumulative-sum table.

    This is the sum-table speedup of moving_rms described in
    profiling_notes: one cumsum and one subtraction instead of a
    per-window loop.  Returns an array of len(y) + n sums (see the
    retained assert below).
    """
    zeros = numpy.zeros(n, dtype=y.dtype)
    st = numpy.concatenate((zeros, y, zeros)).cumsum()
    result = st[n:] - st[:-n]
    #assert len(result) == len(y) + n, (len(result), len(y), n)
    return result


def moving_rms(n, y):
    """Moving RMS of y over an n-sample window, trimmed back to len(y)."""
    # XXX this has half a sample of phase error if n is odd!
    # Fortunately, in this one place in the program, we can tolerate
    # phase error.
    return numpy.sqrt(box_filter(n, y**2) / n)[n//2:][:len(y)]


def vocode(modulator, carrier):
    """Impose the modulator's per-band envelopes onto the carrier's bands.

    Both signals are split into bands with decompose(); each carrier
    band is scaled by the moving-RMS envelope of the corresponding
    modulator band, and the scaled bands are summed.  The envelope
    window is a power of two that doubles with each successive band
    (2**(12+i-nbands) samples for band i); which end of the spectrum
    comes first depends on decompose(), whose head is lost above —
    TODO confirm ordering once it is restored.
    """
    modulator_bands = decompose(modulator)
    envelopes = [moving_rms(2**(12+i-len(modulator_bands)), band)
                 for i, band in enumerate(modulator_bands)]
    return sum(envelope * each_carrier
               for envelope, each_carrier
               in zip(envelopes, decompose(carrier)))


def autoscale(samples):
    """Scale samples so the peak magnitude is 32767 (full-scale 16-bit)."""
    return samples * 32767 / abs(samples).max()


def adjust_length(samples, length):
    """Repeat samples end-to-end and truncate to exactly `length` samples."""
    return numpy.concatenate(tuple(samples
                                   for i in range(0, length,
                                                  len(samples))))[:length]


def main():
    """Parse the command line, vocode, and write the output .wav file."""
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('modulator',
                   help='the filename of a .wav file with a spectral envelope;'
                   + ' a recording of your voice is good')
    p.add_argument('carrier', default=None, nargs='?',
                   help='the filename of a .wav file to modulate with'
                   + ' a spectral envelope; defaults to white noise')
    p.add_argument('-o', '--output', default='tmp.wav',
                   help='the filename of a .wav file to write'
                   + ' (default %(default)r)')
    args = p.parse_args()
    modulator = read_wav(args.modulator)
    # The carrier is forced to the modulator's length, per the module
    # docstring; white_noise and write_wav are among the definitions
    # lost in the corrupted span above.
    carrier = (white_noise(len(modulator)) if args.carrier is None else
               adjust_length(read_wav(args.carrier), len(modulator)))
    write_wav(args.output,
              autoscale(vocode(modulator=modulator, carrier=carrier)))


if __name__ == '__main__':
    main()