## See sumnums.c and file `iterator-pattern-matching.md` in
        ## pavnotes2.  The results are that the dynamically dispatched
        ## loop takes about 1.4 nanoseconds per iteration on this
        ## Ryzen 5 3500U, the baseline loop with no dispatching takes
        ## about 0.53 nanoseconds per iteration, and the
        ## bounds-checked loop takes about 0.63 nanoseconds per
        ## iteration.
        .intel_syntax noprefix
        .globl make_diter_array, sum_diter, diter_array_iterator
        .globl make_diter_intlist, diter_intlist_iterator, sum_without_diter
        .globl sum_boundschecked
make_diter_array:
        ## Build a diter (three qwords) that walks a contiguous array
        ## and store it through the hidden struct-return pointer.
        ##
        ## ABI:   System V AMD64, assuming the C prototype returns the
        ##        24-byte diter by value (so it is returned in memory).
        ## In:    rdi = address of the caller-provided result slot
        ##        rsi = address of the first element (`begin`)
        ##        rdx = address one past the last element (`end`)
        ## Out:   [rdi]    = begin
        ##        [rdi+8]  = end
        ##        [rdi+16] = address of diter_array_iterator
        ##        rax      = rdi (the ABI requires a memory-returning
        ##                   function to hand the result address back)
        ## Clobb: rsi
        mov     [rdi], rsi
        mov     [rdi+8], rdx
        lea     rsi, [rip + diter_array_iterator]
        mov     [rdi+16], rsi
        mov     rax, rdi                # return the result-slot address
        ret

make_diter_intlist:
        ## Build a diter (three qwords) that walks a linked list and
        ## store it through the hidden struct-return pointer.
        ##
        ## ABI:   System V AMD64, assuming the C prototype returns the
        ##        24-byte diter by value (so it is returned in memory).
        ## In:    rdi = address of the caller-provided result slot
        ##        rsi = address of the first list node (0 for an empty
        ##              list, as tested by diter_intlist_iterator)
        ## Out:   [rdi]    = first node
        ##        [rdi+16] = address of diter_intlist_iterator
        ##        rax      = rdi (the ABI requires a memory-returning
        ##                   function to hand the result address back)
        ##        [rdi+8] is deliberately left unwritten: the list
        ##        iterator never reads its second state word.
        ## Clobb: rsi
        mov     [rdi], rsi
        lea     rsi, [rip + diter_intlist_iterator]
        mov     [rdi+16], rsi
        mov     rax, rdi                # return the result-slot address
        ret

        ## Reference loop for the benchmark: add up the qwords in the
        ## half-open range [rdi, rsi) with no iterator call and no
        ## bounds check.
        ##
        ## In:    rdi = address of the first qword
        ##        rsi = address one past the last qword
        ## Out:   rax = sum of the qwords (wraps modulo 2^64)
        ## Clobb: rdi, flags
        ## Note:  the exit test is pointer equality, so rsi must be
        ##        reachable from rdi in whole 8-byte steps.
sum_without_diter:
        xor     eax, eax                # rax = 0 (32-bit write clears the top half)
        jmp     .Lplain_sum_check       # test first, so an empty range yields 0
        .align 8
.Lplain_sum_next:
        add     rax, [rdi]              # total += *p
        add     rdi, 8                  # p++
.Lplain_sum_check:
        cmp     rdi, rsi
        jne     .Lplain_sum_next        # keep going until p == end
        ret

        ## Baseline for scalar summing performance with a (redundant)
        ## per-element bounds check, to measure what the check costs.
        ##
        ## ABI:   System V AMD64
        ## In:    rdi = array base address (array of qwords)
        ##        rsi = number of elements
        ## Out:   rax = sum of the elements (wraps modulo 2^64)
        ## Clobb: rcx, flags
        ## A failed bounds check calls abort and does not return.
sum_boundschecked:
        xor     eax, eax                # rax = running total = 0
        xor     ecx, ecx                # rcx = loop counter = 0
        jmp     .Lchecked_sum_test      # test first, so a zero count yields 0
        .align 8
.Lchecked_sum_body:
        ## The redundant bounds checking we want to measure the
        ## performance of:
        cmp     rcx, rsi
        jnb     .Lchecked_sum_violation # unsigned index >= size: out of bounds
        add     rax, [rdi + rcx * 8]
        inc     rcx
.Lchecked_sum_test:
        cmp     rcx, rsi
        jb      .Lchecked_sum_body
        ret
.Lchecked_sum_violation:
        ## Placeholder for bounds violation handling.  We are a leaf
        ## with nothing pushed, so rsp is 8 modulo 16 here; drop it by
        ## 8 so the stack is 16-byte aligned at the call, as the ABI
        ## requires.
        sub     rsp, 8
        call    abort
        ud2                             # abort must not return; never fall into the next routine

        ## The version of the sum loop that uses dynamic dispatch to
        ## get each new item.
        ##
        ## ABI:   System V AMD64 towards our caller.
        ## In:    a three-qword diter passed by value on the stack,
        ##        directly above the return address:
        ##          word 0 -> r12  (iterator state)
        ##          word 1 -> r13  (iterator state)
        ##          word 2 -> r14  (address of the get-next routine)
        ## Out:   rax = sum of the qwords the iterator yields (wraps
        ##        modulo 2^64)
        ## Clobb: r9, flags, plus whatever volatile registers the
        ##        iterator itself clobbers.
        ##
        ## The get-next routine is called with its state in r12/r13,
        ## may update them, and returns in r9 the address of the next
        ## item, or 0 when there are no more.
sum_diter:
        push    r12                     # Save callee-saved r12-r14.
        push    r13
        push    r14
        push    rbx                     # We're going to build our total here.
        ## On entry rsp is 8 modulo 16, and four pushes leave it
        ## there.  Pad by one more qword so the stack is 16-byte
        ## aligned at every `call r14`, in case an iterator ever
        ## relies on normal stack alignment.
        sub     rsp, 8
        xor     ebx, ebx                # Zero the (callee-saved) total.
        ## The diter sits right above our return address.  We have
        ## moved rsp down by five qwords (four pushes plus padding),
        ## so the return address is at [rsp + 40] and the diter starts
        ## at [rsp + 48].
        mov     r12, [rsp + 48]
        mov     r13, [rsp + 56]
        mov     r14, [rsp + 64]
        ## Now we can jump into the middle of the loop and start
        ## summing numbers!
        jmp     .Lditer_sum_fetch
        .align 8
.Lditer_sum_add:
        add     rbx, [r9]               # It succeeded, so we have a number to add.
.Lditer_sum_fetch:
        call    r14                     # Invoke iterator's get-next subroutine.
        test    r9, r9
        jnz     .Lditer_sum_add         # Nonzero r9: got an item, go add it.
        mov     rax, rbx                # Otherwise, return the total...
        add     rsp, 8                  # ...drop the alignment padding...
        pop     rbx                     # ...and restore callee-saves.
        pop     r14
        pop     r13
        pop     r12
        ret

        ## Diter iterators use a nonstandard calling convention: they
        ## have permission to modify R12 and R13, because those are
        ## where their state variables live, and they return their
        ## result in R9.  Everything else is as normal, and in
        ## particular they are entitled to clobber non-call-preserved
        ## registers like RAX and RCX.
        .p2align 3
diter_array_iterator:
        ## Diter get-next routine for a contiguous run of qwords.
        ##
        ## In:    r12 = cursor (address of the next element to yield)
        ##        r13 = end address (read, never written)
        ## Out:   r9  = address of the yielded element, or 0 once the
        ##              cursor has reached the end
        ##        r12 = advanced by 8 when an element was yielded
        ## Clobb: flags
        cmp     r12, r13
        je      .Larray_iter_exhausted  # cursor == end: nothing left
        mov     r9, r12                 # yield the current element...
        add     r12, 8                  # ...and step to the next one
        ret
.Larray_iter_exhausted:
        xor     r9d, r9d                # r9 = 0 signals end of iteration
        ret

        .p2align 3
diter_intlist_iterator:
        ## Diter get-next routine for a linked list.  The node address
        ## doubles as the address of the node's value, and the link to
        ## the following node is the qword at offset 8.
        ##
        ## In:    r12 = address of the current node, or 0 at list end
        ## Out:   r9  = address of the yielded node (its value is at
        ##              offset 0), or 0 when the list is exhausted
        ##        r12 = the node's link field when a node was yielded
        ## Clobb: flags
        test    r12, r12
        jz      .Lintlist_iter_exhausted # null node: nothing left
        mov     r9, r12                 # yield this node; the value is its first qword
        mov     r12, [r12 + 8]          # follow the link to the next node
        ret
.Lintlist_iter_exhausted:
        xor     r9d, r9d                # r9 = 0 signals end of iteration
        ret