#!/usr/bin/luajit --[[ Sinless reimplementation of hadamardsin.js in Lua. This takes about 150–170ms to do a 262144-element Hadamard transform on my Ryzen 5 3500U, almost as fast as the 91ms of the C version in hadamardsin.c. The code in hadamardsin.js instead takes 310–341ms. Possibly this could be optimized further. ]] local newtable = require("table.new") -- to presize arrays; see https://luajit.org/extensions.html -- optimization notes (on MicroPC, so different timings): -- table.new was a huge win! -- before removing local n = #xs: .290 .290 .294 .296 -- after: .300 .301 .302 -- after adding it back: .293 .295 .295 -- after removing `local` from all functions: .298 .296 .299 -- after adding it back:.294 .290 .293 -- after using table.insert to append all results: .542 .543 -- after making table.insert local: .538 .542 .543 -- after taking it back out: .293 .292 .292 -- after trying to use unpack and giving up: .297 .297 .296 .290 .293 -- after inlining concat table.move: .297 .296 .296 -- after switching to pairs() iteration: .443 .450 .444 -- after taking it back out: .294 .297 .294 -- after replacing table.move in slice() with an explicit loop: .286 .290 .288 -- after commenting that back out: .309 .293 .292 -- after switching back to the explicit loop: .288 .292 .291 -- after also switching concat to use explicit loops: .282 .278 .275 -- after replacing add and sub with closures generated by a higher-order function: .290 .292 .294 -- after taking them back out: .277 .284 .280 -- I tried unrolling the loop in slice 2x but got weird bugs. local function slice(xs, first, last) local result = newtable(last - first + 1, 0) for i = 1, last - first + 1 do result[i] = xs[first + i - 1] end return result end local function add(xs, ys) local n = #xs local result = newtable(n, 0) for i = 1, n do result[i] = xs[i] + ys[i] end return result end local function sub(xs, ys) local n = #xs local result = newtable(n, 0) for i = 1, n do result[i] = xs[i] - ys[i] end return result end local function dyadicmap(op) local function f(xs, ys) local n = #xs local result = newtable(n, 0) for i = 1, n do result[i] = op(xs[i], ys[i]) end return result end return f end -- This makes the program 3% slower: -- add = dyadicmap(function(x, y) return x + y end) -- sub = dyadicmap(function(x, y) return x - y end) -- table.move -- -- is slower than these explicit loops. {unpack(xs), unpack(ys)} -- doesn’t work, dropping all but the first of xs. even {unpack(xs)} -- doesn’t work; it says there are “too many results to unpack”. local function concat(xs, ys) local xn, yn = #xs, #ys local result = newtable(xn+yn, 0) for i = 1, xn do result[i] = xs[i] end for i = 1, yn do result[xn+i] = ys[i] end return result end local function hada(xs) local len = #xs if len < 2 then return xs end local half = len / 2 local left = hada(slice(xs, 1, half)) local right = hada(slice(xs, half+1, len)) return concat(add(left, right), sub(left, right)) end local function zeros(n) local result = newtable(n, 0) for i = 1, n do result[i] = 0 end return result end local function sum(xs) local result = 0 for i = 1, #xs do result = result + xs[i] end return result end local b = zeros(1024*256) b[1] = 1 b[8] = 3 print(sum(b), #b) --print(unpack(hada(b))) print(sum(hada(b)))