## Compute the Mth triangular number, N times over, in the naïve dumb
## way.  This is for comparison with a dumb JIT compiler.  I’m doing
## this in assembly because I don’t trust my C compiler not to try to
## optimize this into something that’s useless for comparison
## purposes.  See trinarm.S for the ARM version.
##
## Syntax: GNU as, AT&T operand order (source first, destination last).
## ABI:    System V AMD64 (first argument in %rdi, result in %rax).
## Full-line comments start with ## and end-of-line comments with #,
## as in the rest of this file.

        .equ    N, 100*1000             # how many times main calls tri
        .equ    M, 10*1000 # *1000*1000 (larger multiplier, currently disabled)

## main: call tri(M) N times, then print the result of the last call.
## Out:  %eax = 0 (process exit status).
## %rbx holds the repetition counter because it must survive the calls.
        .globl  main
main:
        push    %rbx                    # callee-saved; also 16-byte-aligns %rsp for the calls
        mov     $N, %rbx                # %rbx = repetitions remaining
.Lmain_repeat:
        mov     $M, %rdi
        call    tri
        dec     %rbx
        jnz     .Lmain_repeat

        mov     %rax, %rsi              # return value to print (full 64 bits)
        leaq    format(%rip), %rdi
        xor     %eax, %eax              # %al = 0: no vector-register args to variadic printf
        call    printf
        xor     %eax, %eax              # return 0
        pop     %rbx
        ret

## %ld, not %d: the sum handed to printf is a 64-bit register value.
format: .asciz  "%ld\n"

## tri: sum %rdi + (%rdi - 1) + ... + 1 by repeated addition.
## In:       %rdi = number to calculate the triangular number of
## Out:      %rax = the sum (0 when %rdi is 0)
## Clobbers: %rdi, flags
tri:
        xor     %eax, %eax              # total in %rax
        test    %rdi, %rdi              # zero iterations?
        jz      .Ltri_done              # skip the loop
.Ltri_loop:
        add     %rdi, %rax
        dec     %rdi
        jnz     .Ltri_loop
.Ltri_done:
        ret

## triov: version of tri with an overflow check, just as fast.
## In:       %rdi = number to calculate the triangular number of
## Out:      %rax = the sum (0 when %rdi is 0)
## Clobbers: %rdi, flags
## If an addition sets the overflow flag, control leaves the loop and
## abort is called.  The idea here is that the overflow check in the
## loop is something Schemes would need to do to fall back to a bignum
## version.
triov:
        xor     %eax, %eax              # total in %rax
        test    %rdi, %rdi              # zero iterations?
        jz      .Ltriov_done            # skip the loop
.Ltriov_loop:
        add     %rdi, %rax
        jo      .Ltriov_overflow        # signed overflow in the running total
        dec     %rdi
        jnz     .Ltriov_loop
.Ltriov_done:
        ret
.Ltriov_overflow:
        sub     $8, %rsp                # %rsp is 8 off at function entry; realign to 16 for the call
        call    abort

## triopt: “optimized” version with an unrolled loop, not in the ARM
## version.  Of course actual optimization would just use the
## closed-form solution N(N+1) divided by 2.
## In:       %rdi = number to calculate the triangular number of
## Out:      %rax = the sum (0 when %rdi is 0)
## Clobbers: %rdi, %rsi, flags
triopt:
        xor     %eax, %eax              # total in %rax
## Non-unrolled loop first: add one term at a time until the low
## three bits of %rdi are clear, i.e. %rdi is a multiple of 8.
.Ltriopt_peel:
        test    $7, %rdi                # any low-order bits set?
        jz      .Ltriopt_aligned
        add     %rdi, %rax
        dec     %rdi
        jmp     .Ltriopt_peel
.Ltriopt_aligned:
        test    %rdi, %rdi              # any high-order bits set?
        jz      .Ltriopt_done
## In the unrolled loop, we are doing a multiple-of-8 step:
## %rdi + %rdi-1 + %rdi-2 + ... + %rdi-7.  This works out to
## 8·%rdi - T7, where T7 is the 7th triangular number,
## 0+1+...+7 = 7(7+1) divided by 2 = 28.  This is about 3× faster
## for large N.
.Ltriopt_by8:
        lea     (,%rdi,8), %rsi         # %rsi = 8 * %rdi
        add     %rsi, %rax
        sub     $28, %rax               # minus T7
        sub     $8, %rdi                # eight terms consumed
        jnz     .Ltriopt_by8
.Ltriopt_done:
        ret