## Dumb Fibonacci assembly-language microbenchmark.

        ## I like to use the stupid Fibonacci function,
        ## fib(n) = 1 if n <= 2 else fib(n-1) + fib(n-2), as
        ## a microbenchmark to get a rough gauge of how much
        ## performance I’m sacrificing to different language
        ## implementations.  However, nowadays GCC is too
        ## smart to get a useful baseline if you write it in
        ## C; with optimization on, you’re really just
        ## testing how deeply it decides to unroll the
        ## recursive loop, and with optimization off, you're
        ## just testing how wasteful its non-optimized code
        ## generator is.

        ## So here’s a version of it in assembly, which
        ## prints `fib(40) = 102334155` in 683–686
        ## milliseconds on this Celeron N4120, having made
        ## 102,334,155 leaf calls.  Dividing, we get 6.7
        ## nanoseconds per leaf call.

        ## Arguably the kind of single-threaded scalar
        ## integer performance this measures is increasingly
        ## irrelevant, since if you really care about
        ## performance you’ll be using SIMD, multithreading,
        ## the GPU, and in most cases floating-point.  But
        ## it’s still kind of the paradigm for a lot of
        ## scripting languages.

        ## See also:
        ## - fib.c: C (GCC has gotten so good that the performance
        ##   of this version is no longer meaningful)
        ## - dumbfib.ml: OCaml (6× slower in ocamlc, 1.06× in ocamlopt)
        ## - fib.go: Golang (1.21× slower)
        ## - dumbfib.lua: Lua (21× slower in PUC Lua, 1.25× in LuaJIT)
        ## - dumbfib.scm: Scheme (6× slower in Guile, 1.75× in Chez)
        ## - dumbfib.py: Python (40× slower in CPython, 1.7× in PyPy)
        ## - dumbfib.fs: Forth (6× slower in Gforth, 3× in Bigforth)
        ## - dumbfib.js: JS (3.2× slower in Node)
        ## - dumbfib.lisp: Common Lisp (5.5× slower in SBCL)
        ## - dumbfib.plx: Perl (70× slower)
        ## - dumbfib.tcl: Tcl (1030× slower)
        ## - dumbfib.sh: bash (8400× slower)

        .intel_syntax noprefix
        .globl main
main:   push rbp
        mov ebp, 40
        mov rdi, rbp
        call fib
        ## I always forget this, but “The standard efficient
        ## way to put a static address into a register is a
        ## RIP-relative LEA.”  movabsq would also work but is
        ## reputedly less efficient (certainly less dense).
        ## <https://stackoverflow.com/questions/57212012/how-to-load-address-of-function-or-label-into-register>
        ## If you just try the obvious `mov rdi,
        ## format_string` you get `relocation R_X86_64_32S
        ## against `.rodata' can not be used when making a
        ## PIE object; recompile with -fPIE`, which is an
        ## error message that suggests an incorrect solution.
        ## Linking with `-no-pie` is the other option:
        ## <https://stackoverflow.com/questions/43367427/32-bit-absolute-addresses-no-longer-allowed-in-x86-64-linux>
        lea rdi, [rip+format_string]
        .pushsection .rodata
format_string:
        .asciz "fib(%d) = %d\n"
        .popsection
        mov rsi, rbp
        mov rdx, rax
        call printf
        xor eax, eax
        pop rbp
        ret

        ## If this is misaligned by 11 bytes (say by
        ## inserting 11 nops), the program takes 13% longer.
        ## I was trying to figure out how on Earth changing
        ## my instruction encodings in main above was having
        ## such an effect...
        .align 16
fib:    cmp rdi, 2
        jg 1f
        mov eax, 1
        ret
1:      push rbp
        dec rdi
        mov rbp, rdi
        call fib
        ## Here’s a thing I hadn’t appreciated about the
        ## amd64 ABI's good choice to have the first argument
        ## register live at the same time as the return value
        ## register save you a register inside the callee
        ## many times, it also sometimes saves you a register
        ## inside the caller.  Like, almost, in this case.
        ## We have our saved argument (minus one) in rbp, and
        ## the return value from our first recursive call in
        ## rax, and we want to swap them.  On amd64 we could
        ## use xchg, but RISC architectures generally don’t
        ## have xchg or actually any two-output instructions.
        ## But even on a RISC architecture we can do this:
        mov rdi, rbp
        mov rbp, rax
        dec rdi
        call fib
        ## Unfortunately, all the RISC ABIs I know of put the
        ## return value in the first argument register, which
        ## is the worst possible place for it in that sense.
        add rax, rbp
        pop rbp
        ret

        ## On -O2, GCC 9.4.0 generated the following code for
        ## me from fib.c (which, note, uses a different
        ## definition for the Fibonacci index!), transforming
        ## the second recursive call into a loop by taking
        ## advantage of the commutativity and associativity
        ## of integer addition.  It runs *significantly*
        ## faster by virtue of doing half as many calls and
        ## returns.  But that’s what we’re trying to
        ## benchmark!  And sometimes I’ve seen GCC do much
        ## more aggressive optimization than that.
        .align 16
fancyfib:
        endbr64
        cmp edi, 1              # (above function uses 2 instead)
        jle 1f                  # Base case for recursion, return 1
        push rbp                # Allocate accumulator register.
        xor ebp, ebp            # Zero accumulator.
        push rbx                # Allocate space to preserve argument.
        mov ebx, edi            # Save argument.
        sub rsp, 8              # Useless stack frame allocation.
2:      lea edi, [rbx - 1]      # Decrement and move in one instruction.
        sub ebx, 2              # Compute argument for second “recursion”.
        call fancyfib           # Get fib(n-1).
        add ebp, eax            # Add it to the accumulator.
        cmp ebx, 1              # Check for loop termination.
        jg 2b                   # Loop instead of recursing.
        add rsp, 8              # Deallocate useless stack frame.
        lea eax, [rbp + 1]      # Add final +1 base case to return value.
        pop rbx
        pop rbp
        ret
        .align 16
1:      mov eax, 1              # Base case.
        ret