#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Batch-mode vocoder.

Extracts a spectral envelope from one sound, such as your voice, and
uses it to selectively amplify and attenuate the spectral envelope of
another sound, such as (the default) white noise.

Inputs are required to be stereo 44.1kHz .wav files, as is the output.
The output will be the same length as the modulator; the carrier will
be truncated or repeated as necessary to achieve this.
"""
import argparse
import struct   # NOTE(review): not used by any surviving code; presumably
                # used by the lost write_wav below — confirm before removing.
import wave

import numpy

# Development log kept as module data rather than comments; never read at
# runtime.
profiling_notes = """
This runs about four times slower than real-time on my laptop — four
seconds of CPU to produce each second of output. Initial profiling
suggests that currently its time (6.2 seconds modulating ks1-cd.wav
with what-hath-god-wrought.wav) is divided as follows:

- 25% in reading wav files;
- 56% in time-domain convolution, including:
  - 49% in upsampling,
  - 7% in moving_rms, and
  - 1% in low-pass filtering
  (numbers do not total due to rounding);
- 14% in writing wav files;
- 5% in other activities, including startup.

Upsampling is slow because it’s doing a time-domain convolution with a
1025-point sinc kernel, and also because it upsamples signals 88
times; reading and writing are slow because they’re done in loops per
sample in interpreted Python. It should be pretty easy to make these
all run a lot faster, but I need to read about fast and accurate
upsampling and about numpy byte-swapping. Once these bottlenecks are
removed, maybe I should work on using a smaller kernel for the
low-pass filtering.

Using numpy for writing cuts the time to 5.5 seconds. Using numpy
also for reading cuts it to 4.3 seconds. cProfile confirms that
read_wav now takes 0.05% of the run time and write_wav 0.07%, roughly
500 and 200 times faster, respectively. The new profile results are:

- 93% in convolve, of which roughly
  - 11% in moving_rms,
  - 81% in restore (upsampling), and
  - 2% in low_pass;
- 2% in things in main that aren’t convolve;
- 6% outside of main (startup).

`moving_rms` should be pretty easy to speed up using a sum table,
since it’s just box filtering. I still have to figure out the
upsampling story.

So I've switched to using the same low-pass filter for upsampling
interpolation that I’m using for band separation prior to
downsampling. The results still sound good, and now it’s down to 1.6
seconds, which is tantalizingly close to running in real time. The
new profile says:

- 17% outside of main (startup);
- 74% in convolve;
- 49% in decompose;
- 47% in low_pass, which is now being invoked 106 times since restore
  is using it;
- 42% in restore;
- 32% in moving_rms.

moving_rms probably accounts for almost half of the time in convolve.
Also, I think I can probably cut the low_pass computation time in
half for the upsampling case by using a polyphase filter. But I can
probably also use a narrower window on the kernel and it’ll probably
be fine, too.

So I fixed moving_rms to use a sum table to extract the envelope. It
now vocodes in 1.1 seconds, better than real time! On, uh, one core
of a 1.6GHz laptop. Profiling now says:

- 24% outside of main (in startup);
- 69% in decompose;
- 66% in low_pass, which is called 106 times;
- 61% in convolve (which is now being called just by low_pass);
- 60% in restore, which is called 88 times, accounting for 88 of the
  calls to low_pass.

So it’s still spending most of its time upsampling signals — 80% of
the time not in the startup code, in fact. But now it’s doing that
four times as fast as it was yesterday. It really doesn’t need to use
a 129th-order FIR filter to do zero-phase 1:2 upsampling
interpolation on a signal that was already low-pass filtered ahead of
time. It probably doesn’t need that for the band separation in the
first place, either, in fact.
"""


def read_wav(filename):
    "Returns a Numpy array of the left channel of a .wav file."
    with wave.open(filename) as wav:
        # Enforce the stereo/16-bit/44.1kHz contract stated in the module
        # docstring.
        assert wav.getnchannels() == 2, filename
        assert wav.getsampwidth() == 2, filename
        assert wav.getframerate() == 44100, filename
        nframes = wav.getparams().nframes
        samples = wav.readframes(nframes)
        # XXX note that the right channel is still stored in memory, just
        # inaccessible through this view
        #
        # NOTE(review): the file is corrupted from this point down to the
        # middle of decompose() below — everything between the "<" that
        # presumably began this dtype string (likely '<i2', the
        # little-endian 16-bit format the wave module yields — TODO
        # confirm) and some later ">" has been stripped, apparently by
        # HTML-tag sanitization.  The stripped span seems to have held
        # the rest of this return expression plus the definitions of
        # write_wav and white_noise (both called from main), low_pass
        # (named throughout profiling_notes), and the head of decompose
        # (called from vocode).  Recover the lost text from version
        # control; the fragments below are kept verbatim and do not
        # parse as-is.
        return numpy.ndarray(shape=(nframes*2,), dtype='
 2 else [low])
    # NOTE(review): orphaned tail of the lost decompose() definition;
    # from the shape of the code it appends a residual band (the input
    # minus the sum of the lower bands) and returns the band list —
    # verify once the missing head is restored.
    lower.append(signal - sum(lower))
    return lower


def box_filter(n, y):
    """Box-filter y over an n-sample window using a cumulative-sum table.

    This is the sum-table speedup of moving_rms described in
    profiling_notes: one cumsum and one subtraction instead of a
    per-window loop.  Returns an array of len(y) + n sums (see the
    retained assert below).
    """
    zeros = numpy.zeros(n, dtype=y.dtype)
    st = numpy.concatenate((zeros, y, zeros)).cumsum()
    result = st[n:] - st[:-n]
    #assert len(result) == len(y) + n, (len(result), len(y), n)
    return result


def moving_rms(n, y):
    """Moving RMS of y over an n-sample window, trimmed back to len(y)."""
    # XXX this has half a sample of phase error if n is odd!
    # Fortunately, in this one place in the program, we can tolerate
    # phase error.
    return numpy.sqrt(box_filter(n, y**2) / n)[n//2:][:len(y)]


def vocode(modulator, carrier):
    """Impose the modulator's per-band envelopes onto the carrier's bands.

    Both signals are split into bands with decompose(); each carrier
    band is scaled by the moving-RMS envelope of the corresponding
    modulator band, and the scaled bands are summed.  The envelope
    window is a power of two that doubles with each successive band
    (2**(12+i-nbands) samples for band i); which end of the spectrum
    comes first depends on decompose(), whose head is lost above —
    TODO confirm ordering once it is restored.
    """
    modulator_bands = decompose(modulator)
    envelopes = [moving_rms(2**(12+i-len(modulator_bands)), band)
                 for i, band in enumerate(modulator_bands)]
    return sum(envelope * each_carrier
               for envelope, each_carrier
               in zip(envelopes, decompose(carrier)))


def autoscale(samples):
    """Scale samples so the peak magnitude is 32767 (full-scale 16-bit)."""
    return samples * 32767 / abs(samples).max()


def adjust_length(samples, length):
    """Repeat samples end-to-end and truncate to exactly `length` samples."""
    return numpy.concatenate(tuple(samples
                                   for i in range(0, length,
                                                  len(samples))))[:length]


def main():
    """Parse the command line, vocode, and write the output .wav file."""
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('modulator',
                   help='the filename of a .wav file with a spectral envelope;'
                   + ' a recording of your voice is good')
    p.add_argument('carrier', default=None, nargs='?',
                   help='the filename of a .wav file to modulate with'
                   + ' a spectral envelope; defaults to white noise')
    p.add_argument('-o', '--output', default='tmp.wav',
                   help='the filename of a .wav file to write'
                   + ' (default %(default)r)')
    args = p.parse_args()
    modulator = read_wav(args.modulator)
    # The carrier is forced to the modulator's length, per the module
    # docstring; white_noise and write_wav are among the definitions
    # lost in the corrupted span above.
    carrier = (white_noise(len(modulator)) if args.carrier is None else
               adjust_length(read_wav(args.carrier), len(modulator)))
    write_wav(args.output,
              autoscale(vocode(modulator=modulator, carrier=carrier)))


if __name__ == '__main__':
    main()