/* Multiply without using multiply instructions, using a lookup table
 * of perfect squares.  This algorithm is only marginally useful, if
 * at all.
 *
 * The basic idea is that (a + b)² - (a - b)² =
 * (a² + 2ab - b²) - (a² - 2ab + b²) = 4ab, and if we solve that
 * for ab, we get ab = ((a + b)² - (a - b)²)/4.  Calculating this
 * requires one add, two subtracts, two table lookups, and a two-bit
 * right shift.
 *
 * This might be more
 * compelling as a circuit rather than a program, although either
 * you’d need three copies of the lookup table or three sequential
 * clock cycles for the lookups.
 *
 * The first version of this program had the bug that while it
 * correctly thought 30 * 225 = 6750, it thought 30 * 226 = 39548;
 * this was because I was using 16-bit numbers for the items in the
 * squares table, which ceases to be able to correctly represent
 * squares of the numbers over 255, so any two 8-bit numbers that
 * summed to 256 or more would read an incorrect entry in it and get
 * the wrong answer.  We need 18 bits at most for the entries in the
 * table, but stdint.h gives us 8, 16, 32, 64, or max, so 32 it is.
 *
 * Note that the nearly-two-kibibyte size of this table means that
 * this algorithm will only rarely be useful on tiny microcontrollers
 * that lack hardware multipliers.  You could conceivably use a
 * sparser square table (like every fourth square) and interpolate
 * using a few iterations of the loop in initialize_square_table, but
 * that is probably going to be slower than just doing the standard
 * multiplication algorithm loop.
 *
 * Other related algebraic facts which could be used to provide
 * another version of the algorithm:
 *
 * a² + 2ab + b² = (a + b)²; if we solve that for ab, we get
 * ab = ((a + b)² - a² - b²) / 2.  This involves three table lookups
 * instead of two.
 *
 * (c + d)(c - d) = c² - d²; if a = c + d and b = c - d, solving for c and d,
 * we get c = (a + b)/2, d = c - b.  (Are there signedness issues there?)
 * (One add, two subtracts, right shift one, and two lookups.)
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "squaretable.h"

typedef size_t I;               /* loop counter type */

static uint32_t squares[511] = {0};
#define LEN(x) (sizeof(x)/sizeof((x)[0]))
void
initialize_square_table()
{
  /*
   * Because it would not be stylish to use multiplications to
   * initialize the table we’re going to use to avoid multiplications,
   * this does it with only addition.
   */
  uint32_t isq = 0;
  for (I i = 0; i < LEN(squares); i++) {
    squares[i] = isq;
    isq += i + i + 1;
  }
}

/* This function is the actual multiply-by-square-lookup algorithm.
   Its body reduces to seven instructions on amd64 — an addition, two
   subtractions, a couple of sign-extensions, two table lookups,
   another subtraction (fused with one of the table lookups), and a
   right shift.

  38:	8d 04 37             	lea    (%rdi,%rsi,1),%eax
  3b:	29 f7                	sub    %esi,%edi
  3d:	48 63 ff             	movslq %edi,%rdi
  40:	48 98                	cltq   
  42:	8b 04 85 00 00 00 00 	mov    0x0(,%rax,4),%eax
  49:	2b 04 bd 00 00 00 00 	sub    0x0(,%rdi,4),%eax
  50:	c1 e8 02             	shr    $0x2,%eax

   On i386, it’s just the expected five:

  2b:	8d 0c 02             	lea    (%edx,%eax,1),%ecx
  2e:	29 c2                	sub    %eax,%edx
  30:	8b 04 8d 00 00 00 00 	mov    0x0(,%ecx,4),%eax
  37:	2b 04 95 00 00 00 00 	sub    0x0(,%edx,4),%eax
  3e:	c1 e8 02             	shr    $0x2,%eax

   It is required that a >= b, because the squares table only has
   squares for positive numbers.  There might be an easy way to fix
   this without adding a conditional or having an absolute-subtract
   instruction.

 */
uint16_t
multiply_8_o(uint8_t a, uint8_t b)
{
  /*
   * Note that even though squares contains 32-bit values (really 18
   * bits wide), the final result is guaranteed to be 16-bit.
   */
  return (squares[a + b] - squares[a - b]) >> 2;
}

/* Ensures that the arguments are in the right order. */
uint16_t
multiply_8(uint8_t a, uint8_t b)
{
  return a > b ? multiply_8_o(a, b) : multiply_8_o(b, a);
}

/* This function and the one that follows use the 8-bit multiply
 * function above to do 16-bit and 32-bit multiplies.  This requires
 * four 8-bit multiplies for a 16-bit multiply, and 16 8-bit
 * multiplies for a 32-bit multiply, although Karatsuba could reduce
 * those numbers to 3 and 9, respectively.  The body of this function
 * is 39 instructions, mostly because of parameter passing.
 */
uint32_t
multiply_16(uint16_t a, uint16_t b)
{
  /*     A a
   *   × B b
   *   -----
   *     c d
   * + e f 
   * -------
   */

  uint8_t
    a_lo = a & 0xff, a_hi = a >> 8,
    b_lo = b & 0xff, b_hi = b >> 8;
  uint32_t                      /* to prevent << 16 from truncating */
    c = multiply_8(a_hi, b_lo),
    d = multiply_8(a_lo, b_lo),
    e = multiply_8(a_hi, b_hi),
    f = multiply_8(a_lo, b_hi);

  return (e << 16) + ((c + f) << 8) + d;
}

/* See comment for multiply_16. */
uint64_t
multiply_32(uint32_t a, uint32_t b)
{
  uint16_t
    a_lo = a & 0xffff, a_hi = a >> 16,
    b_lo = b & 0xffff, b_hi = b >> 16;
  uint64_t
    c = multiply_16(a_hi, b_lo),
    d = multiply_16(a_lo, b_lo),
    e = multiply_16(a_hi, b_hi),
    f = multiply_16(a_lo, b_hi);

  return (e << 32) + ((c + f) << 16) + d;
}

/* Unit testing: verify that a result is correct. */
static void
check(uint32_t a, uint32_t b)
{
  uint64_t expected_result = (uint64_t)a * b;
  uint64_t got = multiply_32(a, b);
  if (expected_result != got) {
    fprintf(stderr, "%ju * %ju = %ju, not %ju\n",
            (uintmax_t)a,
            (uintmax_t)b,
            (uintmax_t)expected_result,
            (uintmax_t)got);
    abort();
  }
}

/* This weak “main” allows this C file to either be compiled as a
 * program by itself, for testing, or used as a library in another
 * program that has its own “main”.  One potential drawback of this
 * technique is that if for whatever reason you don’t provide a strong
 * “main” in the final link step, you may end up getting this “main”
 * by surprise.  Another is that you end up with this code (and
 * “check”) in your final executable even if you’re not using it.
 * XXX maybe make this a weak alias for better combatibility?
 */
#ifndef __GNUC__
#define __attribute__(x)
#endif

int __attribute__((weak))
main(int argc, char **argv)
{
  initialize_square_table();
  check(3, 4);
  check(200, 200);
  check(0xabad, 0x1dea);
  check(0xabad1dea, 0xdeadfeed);

  if (argc > 2) {
    int a = atoi(argv[1]), b = atoi(argv[2]);
    printf("checking %d * %d = %ju\n", a, b, (uintmax_t)a * b);
    check((uint32_t)a, (uint32_t)b);
  }

  return 0;
}