// See diter_sum.S and file `iterator-pattern-matching.md` in
// pavnotes2.  The results of this program are that it takes
// 1.600–1.614 seconds to sum the million items 1100 times and
// 0.153–0.178 seconds to do so only 100 times, for a difference of
// 1.422–1.461 seconds and thus 1.4 milliseconds per time and 1.4
// nanoseconds per iteration on this Ryzen 5 3500U.  The corresponding
// numbers for the old version of sum_without_diter are 70–91ms for
// 100 iterations and 604–624ms for 1100, so 513–554ms and thus
// 513–554ps per iteration.  It’s about a 2.7× slowdown.

// The sum_boundschecked version that indexes the array with an index
// and bounds-checks the index before each reference is 711–752ms for
// 1100 iterations and 80–104ms for 100 iterations, giving us
// 607–672ms for 1000 iterations and thus 607–672ps per iteration,
// which is 53–159ps more than the baseline.  This ≈20% cost for
// bounds checking (≈1.2× slowdown) is surprisingly small.  It isn’t
// even really bounds checking that is the cost; what adds this cost
// is indexing the array with base+index*8 instead of with pointer
// arithmetic, because commenting out the bounds-checking instructions
// makes absolutely no difference.  It’s just that calculating that
// array index is a necessary prerequisite to bounds-checking it.

// Since the above, I’ve changed the diter calling convention to
// return 0 in R9 instead of leaving the zero flag set, on the theory
// that macro-op fusion might make the test/jnz pair actually faster
// than testing the returned zero flag.  This makes it take
// 1.597–1.613 seconds to do 1100 in three tests instead of
// 1.600–1.614.  Although I haven’t done a χ² or anything, intuitively
// this doesn’t seem like a significant change one way or the other;
// if there was a difference, it was much smaller than my ±1%
// measurement error.  It seems like a less error-prone calling
// convention and one that can work on architectures without flags,
// such as RISC-V.

#include <stdio.h>

typedef struct {
  void *x[3];
} diter;

typedef struct intlist {
  long car;
  struct intlist *cdr;
} intlist;

// These five functions are defined in diter_sum.S.
extern diter make_diter_array(long *begin, long *end);
extern long sum_without_diter(long *beginl, long *end);
extern long sum_boundschecked(long *beginl, long length);
extern long sum_diter(diter nums);
extern diter make_diter_intlist(intlist *list);

void print_diter(diter d)
{
  printf("%lx %lx %lx\n", (long)d.x[0], (long)d.x[1], (long)d.x[2]);
}

enum { n_nums = 1000 * 1000 };

long nums[n_nums];

int main()
{
  for (long i = 0; i != n_nums; i++) nums[i] = i;

  diter d = make_diter_array(nums, nums+n_nums);

  print_diter(d);

  long total = -53;
  for (int j = 0; j != 1100; j++) {
    total = sum_diter(d);
    // total = sum_without_diter(nums, nums+n_nums);
    // total = sum_boundschecked(nums, n_nums);
  }
  printf("The sum is %ld\n", total);

  intlist a = {53}, b = {9000, &a};
  printf("The other sum is %ld\n", sum_diter(make_diter_intlist(&b)));
  return 0;
}