## Compute the Mth triangular number N times, the naïve dumb way.  This
        ## is for comparison with a dumb JIT compiler.  I’m doing this
        ## in assembly because I don’t trust my C compiler not to try
        ## to optimize this into something that’s useless for
        ## comparison purposes.  See trinarm.S for the ARM version.
        .equ N, 100*1000
        .equ M, 10*1000#*1000*1000
        .globl main
## main: benchmark driver (SysV AMD64).  Calls tri with the argument M a
## total of N times, prints the result of the last call through printf,
## and returns 0.
##   %rbx (callee-saved) counts the calls still to make; pushing it on
##   entry also leaves %rsp 16-byte aligned at every call below.
main:   push    %rbx
        mov     $N, %rbx                # calls remaining

.Lmain_repeat:
        mov     $M, %rdi                # argument for tri
        call    tri
        dec     %rbx
        jnz     .Lmain_repeat

        mov     %rax, %rsi              # 64-bit result of the last call, to print
        leaq    format(%rip), %rdi
        xor     %eax, %eax              # variadic call: %al = number of vector registers used (none)
        call    printf

        xor     %eax, %eax              # exit status 0
        pop     %rbx
        ret

        ## %ld rather than %d: the total handed over in %rsi is a full
        ## 64-bit value, and %d would print only its low 32 bits.
format: .asciz "%ld\n"


        ## tri: add up %rdi + (%rdi-1) + ... + 1 by plain repeated
        ## addition, one term per loop iteration.
        ##   In:       %rdi = number of terms (0 yields 0)
        ##   Out:      %rax = the sum
        ##   Clobbers: %rdi (counted down to zero), flags
tri:    xor     %eax, %eax              # running total = 0 (32-bit write clears all of %rax)
        test    %rdi, %rdi
        jz      .Ltri_done              # no terms: return 0 without entering the loop

.Ltri_next:
        add     %rdi, %rax              # total += current term
        dec     %rdi
        jnz     .Ltri_next              # repeat until the term reaches zero

.Ltri_done:
        ret


        ## triov: repeated-addition triangular number with a signed
        ## overflow check after every add.  Noted by the author as just
        ## as fast as the unchecked loop.
        ##   In:       %rdi = number of terms (0 yields 0)
        ##   Out:      %rax = %rdi + (%rdi-1) + ... + 1
        ##   Clobbers: %rdi, flags
        ##   Overflow: calls abort instead of returning
triov:  xor     %eax, %eax              # running total = 0 (clears all of %rax)
        test    %rdi, %rdi
        jz      .Ltriov_done            # no terms: return 0

.Ltriov_next:
        add     %rdi, %rax              # total += current term
        jo      .Ltriov_overflow        # signed overflow: leave the fast path
        dec     %rdi
        jnz     .Ltriov_next

.Ltriov_done:
        ret

        ## The idea here is that the overflow check in the loop is
        ## something Schemes would need to do to fall back to a bignum
        ## version.  Nothing has been pushed since entry, so %rsp is
        ## still 8 mod 16 here; drop it by 8 so the call is made with
        ## the 16-byte alignment the SysV ABI requires.
.Ltriov_overflow:
        sub     $8, %rsp
        call    abort


        ## triopt: “optimized” triangular number using an 8× unrolled
        ## step; there is no counterpart in the ARM version.  Real
        ## optimization would of course just use the closed form
        ## ½N(N+1).
        ##   In:       %rdi = number of terms (0 yields 0)
        ##   Out:      %rax = %rdi + (%rdi-1) + ... + 1
        ##   Clobbers: %rdi, %rsi, flags
triopt: xor     %eax, %eax              # running total = 0 (clears all of %rax)

        ## Phase 1: add one term at a time until the remaining count is
        ## a multiple of 8 (possibly zero).
.Ltriopt_peel:
        test    $7, %rdi                # any of the low three bits set?
        jz      .Ltriopt_aligned
        add     %rdi, %rax
        dec     %rdi
        jmp     .Ltriopt_peel

.Ltriopt_aligned:
        test    %rdi, %rdi              # anything left for the unrolled loop?
        jz      .Ltriopt_done

        ## Phase 2: each pass accounts for eight consecutive terms,
        ## %rdi + (%rdi-1) + ... + (%rdi-7) = 8·%rdi - T₇, where
        ## T₇ = ½·7·(7+1) = 28 is the 7th triangular number.  The
        ## original note reports this as about 3× faster for large
        ## inputs.
.Ltriopt_by8:
        lea     (,%rdi,8), %rsi         # %rsi = 8·%rdi
        add     %rsi, %rax
        sub     $28, %rax               # minus T₇
        sub     $8, %rdi                # eight terms consumed
        jnz     .Ltriopt_by8
.Ltriopt_done:
        ret