// Performance test of delay-line resonator tone detection.
//
// I've been noodling around with a tone detection algorithm which
// seems promising as a much more efficient way to detect tones. It's
// simple enough that it is surely well known, but I haven't found it
// in the literature yet. In Numpy it seems promising, taking about
// 20-50 CPU cycles per input sample, but in the past my experience
// has been that Numpy is typically about 5x slower than relatively
// straightforward C, so I'd like to see if I can get it to run a bit
// faster. 4-10 cycles would be a big improvement.
//
// Initial results: if I haven't fucked this up, the first version of
// the code on this 2.6 GHz Celeron N4120 chews through 4.8 GB on one
// core in 43.9 seconds. This is a disappointing 23.8 cycles per
// byte. Valgrind says it took 15,259,724 instructions to process 100
// 4800-sample blocks, 15,410,188 to process 101, and 361,119 to
// process one. (/ (- 15259724 361119) 100.0 4800) = 31.04
// instructions per sample, (/ (- 15410188 15259724) 4800.0) = 31.35
// instructions per marginal sample. kcachegrind suggests that two
// thirds of this is getc(), so maybe I can get down to 10
// instructions per byte.
//
// Refactoring it to read 30 bytes at a time from stdio instead of
// one speeds that up to 21.3 seconds for 4.8 GB, about 2.06 times as
// fast, and now callgrind reports 7,747,956 instructions for 480 kB:
// (/ 7747956 480000.0) = 16.1 instructions per sample and
// (/ (* 21.3 2.6e9) 4.8e9) = 11.5 CPU cycles per sample. At least
// now it's not *slower* than the Numpy prototype anymore.
// Kcachegrind says it's still spending 41% of its cycles in fread()
// and descendants.
//
// Fixing it to read 600 bytes at a time, and also not to forget the
// loop over the resonator differentiators, speeds that up to 12.9
// seconds for 4.8 GB, and now callgrind reports only 4,477,523
// instructions to process 480 kB: (/ 4477523 480000.0) = 9.3
// instructions per sample and (/ (* 12.9 2.6e9) 4.8e9) = 7.0 clock
// cycles per sample. And it's currently allocating a stack frame of
// 0x758 bytes (1880 bytes) plus the CPU's register set, which seems
// reasonably compact. The instructions emitted are using a lot of
// %xmm stuff I don't understand.
//
// Compiling with clang gets an additional speed boost: 10.2 seconds
// for 4.8 gigabytes, (/ (* 10.2 2.6e9) 4.8e9) = 5.5 clock cycles per
// sample. This uses only 3,261,759 instructions for 480 kB,
// (/ 3261759 480000.0) = 6.8 instructions per sample. It's only
// 3,292,063 instructions for 484,800 bytes, so it's 6.3 instructions
// per marginal sample.
//
// However, the output is nonsense, so there is some kind of bug.
//
// As an example we are going to read in a stream of 8-bit signed
// byte samples at, theoretically, 48000 samples per second. We are
// going to downsample it by a factor of d=20 with a third-order
// Hogenauer filter in 32-bit integer math. We are going to split it
// into m=30-sample chunks and run a second Hogenauer filter across
// the corresponding samples in these chunks to "average" them and
// reduce their rate by a factor of p=8. Finally we are going to take
// the dot product of these averaged chunks with precomputed sine and
// cosine waves that oscillate n=11 times over an m-sample interval
// to get I and Q signals at 1/(dmp) = 1/4800 of the original sample
// rate. These signals indicate the amplitude and phase of a sinusoid
// with period dm/n samples, in this case 54.545 samples or 880 Hz.
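//
// (For reference, the instruction counts above are the kind of
// numbers callgrind and kcachegrind produce; a minimal way to take
// that sort of measurement, assuming the binary is called "tonedet"
// and test.raw is a 480,000-byte capture, with both names being
// illustrative rather than the ones actually used, is roughly:
//
//     valgrind --tool=callgrind ./tonedet < test.raw > /dev/null
//     kcachegrind callgrind.out.*
// )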
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

enum {
  param_d = 20,
  param_m = 30,
  param_n = 11,
  param_p = 8,
  downsamp_order = 3,
  resonator_order = 3,
};

typedef uint32_t integ_t, resonator_t;

void debug_mat(resonator_t *buf)
{
  for (size_t i = 0; i < param_m; i++) {
    printf("%.2e%c", (float)(int32_t)buf[i], i == param_m - 1 ? '\n' : ' ');
  }
}

int main()
{
  integ_t integ[downsamp_order] = {0};
  integ_t diff[downsamp_order] = {0};
  resonator_t resonator[resonator_order][param_m] = {{0}};
  resonator_t rdiff[resonator_order][param_m] = {{0}};
  size_t p_counter = 0;
  double sins[param_m], coss[param_m];

  for (size_t i = 0; i < param_m; i++) {
    double theta = 2 * M_PI * i * param_n / param_m;
    sins[i] = sin(theta);
    coss[i] = cos(theta);
  }

  for (;;) {
    char buf[param_d * param_m];
    // This made it slower at 600 bytes:
    /* char *bptr = buf; */
    /* size_t left = sizeof buf; */
    /* while (left) { */
    /*   ssize_t n = read(0, bptr, left); */
    /*   if (n < 0) { */
    /*     perror("read from stdin"); */
    /*     return 1; */
    /*   } */
    /*   if (n == 0) return 0; /\* end of file *\/ */
    /*   bptr += n; */
    /*   left -= n; */
    /* } */
    size_t n = fread(buf, sizeof buf, 1, stdin);
    if (n != 1) return 0;       /* end of file */

    char *samp_p = buf;
    for (size_t k = 0; k < param_m; k++) {
      for (size_t j = 0; j < param_d; j++) {
        integ[0] += (signed char)*samp_p++;
        for (size_t i = 1; i < downsamp_order; i++) integ[i] += integ[i-1];
      }
      diff[0] = integ[downsamp_order - 1] - diff[0];
      for (size_t i = 1; i < downsamp_order; i++) {
        diff[i] = diff[i-1] - diff[i];
      }
      resonator[0][k] += diff[downsamp_order-1];
      for (size_t i = 1; i < resonator_order; i++) {
        resonator[i][k] += resonator[i-1][k];
        // It would be a little easier to have the indices in the
        // other order, which makes this look a lot like a polyphase
        // filter. But that makes it about 2% slower.
      }
    }

    // We don't have to do anything special at the end of an m-sized
    // chunk, just wrap around to the beginning of the next chunk.
    // Unless it's the chunk at the end of a window of p m-sized
    // chunks.
    if (++p_counter != param_p) continue;
    p_counter = 0;

    for (size_t k = 0; k < param_m; k++) {
      rdiff[0][k] = resonator[0][k] - rdiff[0][k];
      for (size_t i = 1; i < resonator_order; i++) {
        rdiff[i][k] = rdiff[i-1][k] - rdiff[i][k];
      }
    }
    debug_mat(rdiff[resonator_order - 1]);

    double I = 0, Q = 0;
    for (size_t i = 0; i < param_m; i++) {
      resonator_t w = (int32_t)rdiff[resonator_order - 1][i];
      I += sins[i] * w;
      Q += coss[i] * w;
    }
    printf("%f %f\n", I, Q);
  }
}
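
// To build and play with this, something along the following lines
// should work. This is a sketch, not the exact commands behind the
// timings above; the output name "tonedet" and the use of sox to
// synthesize a raw signed-8-bit 880 Hz test tone are assumptions:
//
//     cc -O3 -o tonedet tonedet.c -lm      # or clang -O3, as above
//     sox -n -r 48000 -c 1 -e signed -b 8 -t raw - synth 10 sine 880 | ./tonedet
//
// Throughput can be estimated by timing a run over a few gigabytes
// of arbitrary input (all zeros here, purely for timing), e.g.:
//
//     time head -c 4800000000 /dev/zero | ./tonedet > /dev/null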