// Performance test of delay-line resonator tone detection.
//
// I've been noodling around with a tone detection algorithm which
// seems promising as a much more efficient way to detect tones. It's
// simple enough that it is surely well known, but I haven't found it
// in the literature yet. In Numpy it seems promising, taking about
// 20-50 CPU cycles per input sample, but in the past my experience
// has been that Numpy is typically about 5x slower than relatively
// straightforward C, so I'd like to see if I can get it to run a bit
// faster. 4-10 cycles would be a big improvement.
//
// Initial results: if I haven't fucked this up, the first version of
// the code on this 2.6 GHz Celeron N4120 chews through 4.8 GB on one
// core in 43.9 seconds. This is a disappointing 23.8 cycles per
// byte. Valgrind says it took 15,259,724 instructions to process 100
// 4800-sample blocks, 15,410,188 to process 101, and 361,119 to
// process one. (/ (- 15259724 361119) 100.0 4800) = 31.04
// instructions per sample, (/ (- 15410188 15259724) 4800.0) = 31.35
// instructions per marginal sample. kcachegrind suggests that two
// thirds of this is getc(), so maybe I can get down to 10
// instructions per byte.
//
// Refactoring it to read 30 bytes at a time from stdio instead of
// one speeds that up to 21.3 seconds for 4.8 GB, about 2.06 times as
// fast, and now callgrind reports 7,747,956 instructions for 480 kB:
// (/ 7747956 480000.0) = 16.1 instructions per sample and
// (/ (* 21.3 2.6e9) 4.8e9) = 11.5 CPU cycles per sample. At least
// now it's not *slower* than the Numpy prototype anymore.
// Kcachegrind says it's still spending 41% of its cycles in fread()
// and descendants.
//
// Fixing it to read 600 bytes at a time, and also not to forget the
// loop over the resonator differentiators, speeds that up to 12.9
// seconds for 4.8 GB, and now callgrind reports only 4,477,523
// instructions to process 480 kB: (/ 4477523 480000.0) = 9.3
// instructions per sample and (/ (* 12.9 2.6e9) 4.8e9) = 7.0 clock
// cycles per sample. And it's currently allocating a stack frame of
// 0x758 bytes (1880 bytes) plus the CPU's register set, which seems
// reasonably compact. The instructions emitted are using a lot of
// %xmm stuff I don't understand.
//
// Compiling with clang gets an additional speed boost: 10.2 seconds
// for 4.8 gigabytes, (/ (* 10.2 2.6e9) 4.8e9) = 5.5 clock cycles per
// sample. This uses only 3,261,759 instructions for 480 kB,
// (/ 3261759 480000.0) = 6.8 instructions per sample. It's only
// 3,292,063 instructions for 484,800 bytes, so it's 6.3 instructions
// per marginal sample.
//
// However, the output is nonsense, so there is some kind of bug.
//
// As an example we are going to read in a stream of 8-bit signed
// byte samples at, theoretically, 48000 samples per second. We are
// going to downsample it by a factor of d=20 with a third-order
// Hogenauer filter in 32-bit integer math. We are going to split it
// into m=30-sample chunks and run a second Hogenauer filter across
// the corresponding samples in these chunks to "average" them and
// reduce their rate by a factor of p=8. Finally we are going to take
// the dot product of these averaged chunks with precomputed sine and
// cosine waves that oscillate n=11 times over an m-sample interval
// to get I and Q signals at 1/(dmp) = 1/4800 of the original sample
// rate. These signals indicate the amplitude and phase of a sinusoid
// with period dm/n samples, in this case 54.545 samples or 880 Hz.
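//
// (For reference, the instruction counts above are the kind of
// numbers callgrind and kcachegrind produce; a minimal way to take
// that sort of measurement, assuming the binary is called "tonedet"
// and test.raw is a 480,000-byte capture, with both names being
// illustrative rather than the ones actually used, is roughly:
//
//     valgrind --tool=callgrind ./tonedet < test.raw > /dev/null
//     kcachegrind callgrind.out.*
// )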
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

enum {
  param_d = 20,
  param_m = 30,
  param_n = 11,
  param_p = 8,
  downsamp_order = 3,
  resonator_order = 3,
};

typedef uint32_t integ_t, resonator_t;

void debug_mat(resonator_t *buf)
{
  for (size_t i = 0; i < param_m; i++) {
    printf("%.2e%c", (float)(int32_t)buf[i], i == param_m - 1 ? '\n' : ' ');
  }
}

int main()
{
  integ_t integ[downsamp_order] = {0};
  integ_t diff[downsamp_order] = {0};
  resonator_t resonator[resonator_order][param_m] = {{0}};
  resonator_t rdiff[resonator_order][param_m] = {{0}};
  size_t p_counter = 0;
  double sins[param_m], coss[param_m];

  for (size_t i = 0; i < param_m; i++) {
    double theta = 2 * M_PI * i * param_n / param_m;
    sins[i] = sin(theta);
    coss[i] = cos(theta);
  }

  for (;;) {
    char buf[param_d * param_m];
    // This made it slower at 600 bytes:
    /* char *bptr = buf; */
    /* size_t left = sizeof buf; */
    /* while (left) { */
    /*   ssize_t n = read(0, bptr, left); */
    /*   if (n < 0) { */
    /*     perror("read from stdin"); */
    /*     return 1; */
    /*   } */
    /*   if (n == 0) return 0; /\* end of file *\/ */
    /*   bptr += n; */
    /*   left -= n; */
    /* } */
    size_t n = fread(buf, sizeof buf, 1, stdin);
    if (n != 1) return 0;       /* end of file */

    char *samp_p = buf;
    for (size_t k = 0; k < param_m; k++) {
      for (size_t j = 0; j < param_d; j++) {
        integ[0] += (signed char)*samp_p++;
        for (size_t i = 1; i < downsamp_order; i++) integ[i] += integ[i-1];
      }
      diff[0] = integ[downsamp_order - 1] - diff[0];
      for (size_t i = 1; i < downsamp_order; i++) {
        diff[i] = diff[i-1] - diff[i];
      }
      resonator[0][k] += diff[downsamp_order-1];
      for (size_t i = 1; i < resonator_order; i++) {
        resonator[i][k] += resonator[i-1][k];
        // It would be a little easier to have the indices in the
        // other order, which makes this look a lot like a polyphase
        // filter. But that makes it about 2% slower.
      }
    }

    // We don't have to do anything special at the end of an m-sized
    // chunk, just wrap around to the beginning of the next chunk.
    // Unless it's the chunk at the end of a window of p m-sized
    // chunks.
    if (++p_counter != param_p) continue;
    p_counter = 0;

    for (size_t k = 0; k < param_m; k++) {
      rdiff[0][k] = resonator[0][k] - rdiff[0][k];
      for (size_t i = 1; i < resonator_order; i++) {
        rdiff[i][k] = rdiff[i-1][k] - rdiff[i][k];
      }
    }
    debug_mat(rdiff[resonator_order - 1]);

    double I = 0, Q = 0;
    for (size_t i = 0; i < param_m; i++) {
      resonator_t w = (int32_t)rdiff[resonator_order - 1][i];
      I += sins[i] * w;
      Q += coss[i] * w;
    }
    printf("%f %f\n", I, Q);
  }
}
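
// To build and play with this, something along the following lines
// should work. This is a sketch, not the exact commands behind the
// timings above; the output name "tonedet" and the use of sox to
// synthesize a raw signed-8-bit 880 Hz test tone are assumptions:
//
//     cc -O3 -o tonedet tonedet.c -lm      # or clang -O3, as above
//     sox -n -r 48000 -c 1 -e signed -b 8 -t raw - synth 10 sine 880 | ./tonedet
//
// Throughput can be estimated by timing a run over a few gigabytes
// of arbitrary input (all zeros here, purely for timing), e.g.:
//
//     time head -c 4800000000 /dev/zero | ./tonedet > /dev/null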