/* Multiply without using multiply instructions, using a lookup table * of perfect squares. This algorithm is only marginally useful, if * at all. * * The basic idea is that (a + b)² - (a - b)² = * (a² + 2ab - b²) - (a² - 2ab + b²) = 4ab, and if we solve that * for ab, we get ab = ((a + b)² - (a - b)²)/4. Calculating this * requires one add, two subtracts, two table lookups, and a two-bit * right shift. * * This might be more * compelling as a circuit rather than a program, although either * you’d need three copies of the lookup table or three sequential * clock cycles for the lookups. * * The first version of this program had the bug that while it * correctly thought 30 * 225 = 6750, it thought 30 * 226 = 39548; * this was because I was using 16-bit numbers for the items in the * squares table, which ceases to be able to correctly represent * squares of the numbers over 255, so any two 8-bit numbers that * summed to 256 or more would read an incorrect entry in it and get * the wrong answer. We need 18 bits at most for the entries in the * table, but stdint.h gives us 8, 16, 32, 64, or max, so 32 it is. * * Note that the nearly-two-kibibyte size of this table means that * this algorithm will only rarely be useful on tiny microcontrollers * that lack hardware multipliers. You could conceivably use a * sparser square table (like every fourth square) and interpolate * using a few iterations of the loop in initialize_square_table, but * that is probably going to be slower than just doing the standard * multiplication algorithm loop. * * Other related algebraic facts which could be used to provide * another version of the algorithm: * * a² + 2ab + b² = (a + b)²; if we solve that for ab, we get * ab = ((a + b)² - a² - b²) / 2. This involves three table lookups * instead of two. * * (c + d)(c - d) = c² - d²; if a = c + d and b = c - d, solving for c and d, * we get c = (a + b)/2, d = c - b. (Are there signedness issues there?) * (One add, two subtracts, right shift one, and two lookups.) */ #include #include #include #include "squaretable.h" typedef size_t I; /* loop counter type */ static uint32_t squares[511] = {0}; #define LEN(x) (sizeof(x)/sizeof((x)[0])) void initialize_square_table() { /* * Because it would not be stylish to use multiplications to * initialize the table we’re going to use to avoid multiplications, * this does it with only addition. */ uint32_t isq = 0; for (I i = 0; i < LEN(squares); i++) { squares[i] = isq; isq += i + i + 1; } } /* This function is the actual multiply-by-square-lookup algorithm. Its body reduces to seven instructions on amd64 — an addition, two subtractions, a couple of sign-extensions, two table lookups, another subtraction (fused with one of the table lookups), and a right shift. 38: 8d 04 37 lea (%rdi,%rsi,1),%eax 3b: 29 f7 sub %esi,%edi 3d: 48 63 ff movslq %edi,%rdi 40: 48 98 cltq 42: 8b 04 85 00 00 00 00 mov 0x0(,%rax,4),%eax 49: 2b 04 bd 00 00 00 00 sub 0x0(,%rdi,4),%eax 50: c1 e8 02 shr $0x2,%eax On i386, it’s just the expected five: 2b: 8d 0c 02 lea (%edx,%eax,1),%ecx 2e: 29 c2 sub %eax,%edx 30: 8b 04 8d 00 00 00 00 mov 0x0(,%ecx,4),%eax 37: 2b 04 95 00 00 00 00 sub 0x0(,%edx,4),%eax 3e: c1 e8 02 shr $0x2,%eax It is required that a >= b, because the squares table only has squares for positive numbers. There might be an easy way to fix this without adding a conditional or having an absolute-subtract instruction. */ uint16_t multiply_8_o(uint8_t a, uint8_t b) { /* * Note that even though squares contains 32-bit values (really 18 * bits wide), the final result is guaranteed to be 16-bit. */ return (squares[a + b] - squares[a - b]) >> 2; } /* Ensures that the arguments are in the right order. */ uint16_t multiply_8(uint8_t a, uint8_t b) { return a > b ? multiply_8_o(a, b) : multiply_8_o(b, a); } /* This function and the one that follows use the 8-bit multiply * function above to do 16-bit and 32-bit multiplies. This requires * four 8-bit multiplies for a 16-bit multiply, and 16 8-bit * multiplies for a 32-bit multiply, although Karatsuba could reduce * those numbers to 3 and 9, respectively. The body of this function * is 39 instructions, mostly because of parameter passing. */ uint32_t multiply_16(uint16_t a, uint16_t b) { /* A a * × B b * ----- * c d * + e f * ------- */ uint8_t a_lo = a & 0xff, a_hi = a >> 8, b_lo = b & 0xff, b_hi = b >> 8; uint32_t /* to prevent << 16 from truncating */ c = multiply_8(a_hi, b_lo), d = multiply_8(a_lo, b_lo), e = multiply_8(a_hi, b_hi), f = multiply_8(a_lo, b_hi); return (e << 16) + ((c + f) << 8) + d; } /* See comment for multiply_16. */ uint64_t multiply_32(uint32_t a, uint32_t b) { uint16_t a_lo = a & 0xffff, a_hi = a >> 16, b_lo = b & 0xffff, b_hi = b >> 16; uint64_t c = multiply_16(a_hi, b_lo), d = multiply_16(a_lo, b_lo), e = multiply_16(a_hi, b_hi), f = multiply_16(a_lo, b_hi); return (e << 32) + ((c + f) << 16) + d; } /* Unit testing: verify that a result is correct. */ static void check(uint32_t a, uint32_t b) { uint64_t expected_result = (uint64_t)a * b; uint64_t got = multiply_32(a, b); if (expected_result != got) { fprintf(stderr, "%ju * %ju = %ju, not %ju\n", (uintmax_t)a, (uintmax_t)b, (uintmax_t)expected_result, (uintmax_t)got); abort(); } } /* This weak “main” allows this C file to either be compiled as a * program by itself, for testing, or used as a library in another * program that has its own “main”. One potential drawback of this * technique is that if for whatever reason you don’t provide a strong * “main” in the final link step, you may end up getting this “main” * by surprise. Another is that you end up with this code (and * “check”) in your final executable even if you’re not using it. * XXX maybe make this a weak alias for better combatibility? */ #ifndef __GNUC__ #define __attribute__(x) #endif int __attribute__((weak)) main(int argc, char **argv) { initialize_square_table(); check(3, 4); check(200, 200); check(0xabad, 0x1dea); check(0xabad1dea, 0xdeadfeed); if (argc > 2) { int a = atoi(argv[1]), b = atoi(argv[2]); printf("checking %d * %d = %ju\n", a, b, (uintmax_t)a * b); check((uint32_t)a, (uint32_t)b); } return 0; }