/* Multiply without using multiply instructions, using a lookup table
* of perfect squares. This algorithm is only marginally useful, if
* at all.
*
* The basic idea is that (a + b)² - (a - b)² =
* (a² + 2ab - b²) - (a² - 2ab + b²) = 4ab, and if we solve that
* for ab, we get ab = ((a + b)² - (a - b)²)/4. Calculating this
* requires one add, two subtracts, two table lookups, and a two-bit
* right shift.
*
* This might be more
* compelling as a circuit rather than a program, although either
* you’d need three copies of the lookup table or three sequential
* clock cycles for the lookups.
*
* The first version of this program had the bug that while it
* correctly thought 30 * 225 = 6750, it thought 30 * 226 = 39548;
* this was because I was using 16-bit numbers for the items in the
* squares table, which ceases to be able to correctly represent
* squares of the numbers over 255, so any two 8-bit numbers that
* summed to 256 or more would read an incorrect entry in it and get
* the wrong answer. We need 18 bits at most for the entries in the
* table, but stdint.h gives us 8, 16, 32, 64, or max, so 32 it is.
*
* Note that the nearly-two-kibibyte size of this table means that
* this algorithm will only rarely be useful on tiny microcontrollers
* that lack hardware multipliers. You could conceivably use a
* sparser square table (like every fourth square) and interpolate
* using a few iterations of the loop in initialize_square_table, but
* that is probably going to be slower than just doing the standard
* multiplication algorithm loop.
*
* Other related algebraic facts which could be used to provide
* another version of the algorithm:
*
* a² + 2ab + b² = (a + b)²; if we solve that for ab, we get
* ab = ((a + b)² - a² - b²) / 2. This involves three table lookups
* instead of two.
*
* (c + d)(c - d) = c² - d²; if a = c + d and b = c - d, solving for c and d,
* we get c = (a + b)/2, d = c - b. (Are there signedness issues there?)
* (One add, two subtracts, right shift one, and two lookups.)
*/
#include
#include
#include
#include "squaretable.h"
typedef size_t I; /* loop counter type */
static uint32_t squares[511] = {0};
#define LEN(x) (sizeof(x)/sizeof((x)[0]))
void
initialize_square_table()
{
/*
* Because it would not be stylish to use multiplications to
* initialize the table we’re going to use to avoid multiplications,
* this does it with only addition.
*/
uint32_t isq = 0;
for (I i = 0; i < LEN(squares); i++) {
squares[i] = isq;
isq += i + i + 1;
}
}
/* This function is the actual multiply-by-square-lookup algorithm.
Its body reduces to seven instructions on amd64 — an addition, two
subtractions, a couple of sign-extensions, two table lookups,
another subtraction (fused with one of the table lookups), and a
right shift.
38: 8d 04 37 lea (%rdi,%rsi,1),%eax
3b: 29 f7 sub %esi,%edi
3d: 48 63 ff movslq %edi,%rdi
40: 48 98 cltq
42: 8b 04 85 00 00 00 00 mov 0x0(,%rax,4),%eax
49: 2b 04 bd 00 00 00 00 sub 0x0(,%rdi,4),%eax
50: c1 e8 02 shr $0x2,%eax
On i386, it’s just the expected five:
2b: 8d 0c 02 lea (%edx,%eax,1),%ecx
2e: 29 c2 sub %eax,%edx
30: 8b 04 8d 00 00 00 00 mov 0x0(,%ecx,4),%eax
37: 2b 04 95 00 00 00 00 sub 0x0(,%edx,4),%eax
3e: c1 e8 02 shr $0x2,%eax
It is required that a >= b, because the squares table only has
squares for positive numbers. There might be an easy way to fix
this without adding a conditional or having an absolute-subtract
instruction.
*/
uint16_t
multiply_8_o(uint8_t a, uint8_t b)
{
/*
* Note that even though squares contains 32-bit values (really 18
* bits wide), the final result is guaranteed to be 16-bit.
*/
return (squares[a + b] - squares[a - b]) >> 2;
}
/* Ensures that the arguments are in the right order. */
uint16_t
multiply_8(uint8_t a, uint8_t b)
{
return a > b ? multiply_8_o(a, b) : multiply_8_o(b, a);
}
/* This function and the one that follows use the 8-bit multiply
* function above to do 16-bit and 32-bit multiplies. This requires
* four 8-bit multiplies for a 16-bit multiply, and 16 8-bit
* multiplies for a 32-bit multiply, although Karatsuba could reduce
* those numbers to 3 and 9, respectively. The body of this function
* is 39 instructions, mostly because of parameter passing.
*/
uint32_t
multiply_16(uint16_t a, uint16_t b)
{
/* A a
* × B b
* -----
* c d
* + e f
* -------
*/
uint8_t
a_lo = a & 0xff, a_hi = a >> 8,
b_lo = b & 0xff, b_hi = b >> 8;
uint32_t /* to prevent << 16 from truncating */
c = multiply_8(a_hi, b_lo),
d = multiply_8(a_lo, b_lo),
e = multiply_8(a_hi, b_hi),
f = multiply_8(a_lo, b_hi);
return (e << 16) + ((c + f) << 8) + d;
}
/* See comment for multiply_16. */
uint64_t
multiply_32(uint32_t a, uint32_t b)
{
uint16_t
a_lo = a & 0xffff, a_hi = a >> 16,
b_lo = b & 0xffff, b_hi = b >> 16;
uint64_t
c = multiply_16(a_hi, b_lo),
d = multiply_16(a_lo, b_lo),
e = multiply_16(a_hi, b_hi),
f = multiply_16(a_lo, b_hi);
return (e << 32) + ((c + f) << 16) + d;
}
/* Unit testing: verify that a result is correct. */
static void
check(uint32_t a, uint32_t b)
{
uint64_t expected_result = (uint64_t)a * b;
uint64_t got = multiply_32(a, b);
if (expected_result != got) {
fprintf(stderr, "%ju * %ju = %ju, not %ju\n",
(uintmax_t)a,
(uintmax_t)b,
(uintmax_t)expected_result,
(uintmax_t)got);
abort();
}
}
/* This weak “main” allows this C file to either be compiled as a
* program by itself, for testing, or used as a library in another
* program that has its own “main”. One potential drawback of this
* technique is that if for whatever reason you don’t provide a strong
* “main” in the final link step, you may end up getting this “main”
* by surprise. Another is that you end up with this code (and
* “check”) in your final executable even if you’re not using it.
* XXX maybe make this a weak alias for better combatibility?
*/
#ifndef __GNUC__
#define __attribute__(x)
#endif
int __attribute__((weak))
main(int argc, char **argv)
{
initialize_square_table();
check(3, 4);
check(200, 200);
check(0xabad, 0x1dea);
check(0xabad1dea, 0xdeadfeed);
if (argc > 2) {
int a = atoi(argv[1]), b = atoi(argv[2]);
printf("checking %d * %d = %ju\n", a, b, (uintmax_t)a * b);
check((uint32_t)a, (uint32_t)b);
}
return 0;
}