## See sumnums.c and file `iterator-pattern-matching.md` in
## pavnotes2.  The results are that the dynamically dispatched
## loop takes about 1.4 nanoseconds per iteration on this
## Ryzen 5 3500U, the baseline loop with no dispatching takes
## about 0.53 nanoseconds per iteration, and the
## bounds-checked loop takes about 0.63 nanoseconds per
## iteration.
##
## Syntax: GNU as, Intel operand order (`op dst, src`).
## ABI:    System V AMD64 for every exported function except the
##         two *_iterator routines, which use the private "diter"
##         convention described above diter_array_iterator.
##
## A "diter" is three consecutive qwords:
##   +0   first iterator state word  (lives in R12 while iterating)
##   +8   second iterator state word (lives in R13 while iterating)
##   +16  address of the get-next routine (lives in R14)
## Every element is read as a 64-bit qword.

        .intel_syntax noprefix

        ## Declare that this object does not need an executable
        ## stack.  pushsection/popsection leave the current
        ## section exactly as it was.
        .pushsection .note.GNU-stack, "", @progbits
        .popsection

        .equ    DITER_STATE0, 0         # first state word
        .equ    DITER_STATE1, 8         # second state word
        .equ    DITER_NEXT, 16          # get-next routine address
        .equ    INTLIST_NEXT, 8         # offset of a list node's next pointer
        .equ    SUM_DITER_ARG, 40       # 4 pushed qwords + return address

        .text
        .globl  make_diter_array, sum_diter, diter_array_iterator
        .globl  make_diter_intlist, diter_intlist_iterator, sum_without_diter
        .globl  sum_boundschecked

## make_diter_array: build a diter that walks the qwords in
## [begin, end).
## In:    rdi = address of the caller-provided result slot (the
##              hidden struct-return pointer)
##        rsi = begin, rdx = end (one past the last element)
## Out:   rax = rdi, as the ABI requires for results returned in
##              memory; the three diter words are stored at [rdi]
## Clobb: rax
make_diter_array:
        mov     [rdi + DITER_STATE0], rsi
        mov     [rdi + DITER_STATE1], rdx
        lea     rax, [rip + diter_array_iterator]
        mov     [rdi + DITER_NEXT], rax
        mov     rax, rdi                # hand the result address back
        ret

## make_diter_intlist: build a diter that walks a linked list
## whose nodes hold the value at +0 and the next pointer at +8.
## In:    rdi = address of the caller-provided result slot
##        rsi = address of the first node, or 0 for an empty list
## Out:   rax = rdi; words +0 and +16 of the diter are stored.
##        Word +8 is deliberately left unwritten because the list
##        iterator never looks at R13.
## Clobb: rax
make_diter_intlist:
        mov     [rdi + DITER_STATE0], rsi
        lea     rax, [rip + diter_intlist_iterator]
        mov     [rdi + DITER_NEXT], rax
        mov     rax, rdi                # hand the result address back
        ret

## sum_without_diter: baseline for scalar summing performance.
## In:    rdi = begin, rsi = end (one past the last element)
## Out:   rax = sum of the qwords in [begin, end)
## Clobb: rdi, flags
sum_without_diter:
        xor     eax, eax                # total = 0
        jmp     .Lswd_test
        .align  8
.Lswd_loop:
        add     rax, [rdi]              # total += *p
        add     rdi, 8                  # p++
.Lswd_test:
        cmp     rdi, rsi
        jne     .Lswd_loop
        ret

## sum_boundschecked: baseline for scalar summing performance
## with bounds checking.
## In:    rdi = array base address, rsi = number of elements
## Out:   rax = sum of the first rsi qwords at rdi
## Clobb: rcx, flags
sum_boundschecked:
        xor     eax, eax                # total = 0
        xor     ecx, ecx                # i = 0 (loop counter)
        jmp     .Lsbc_test
        .align  8
.Lsbc_loop:
        ## The redundant bounds checking we want to measure the
        ## performance of:
        cmp     rcx, rsi
        jnb     .Lsbc_violation
        add     rax, [rdi + rcx * 8]    # total += base[i]
        inc     rcx
.Lsbc_test:
        cmp     rcx, rsi
        jb      .Lsbc_loop              # unsigned: i < count
        ret
.Lsbc_violation:
        ## Placeholder for bounds violation handling.  RSP is 8 mod
        ## 16 here (only the return address has been pushed), so
        ## realign it before calling into libc.
        sub     rsp, 8
        call    abort@PLT
        ud2                             # abort does not return; never fall through

## sum_diter: the version of the sum loop that uses dynamic
## dispatch to get each new item.
## In:    the three qwords of a diter, passed by value on the
##        stack directly above the return address
## Out:   rax = sum of every qword the iterator yields
## Regs:  rbx = running total, r12/r13 = iterator state,
##        r14 = get-next routine; all four are saved and restored.
## Note:  RSP is 8 mod 16 at the indirect call.  That is fine for
##        the two leaf iterators in this file, which call nothing.
sum_diter:
        push    r12                     # Save callee-saved r12-r14.
        push    r13
        push    r14
        push    rbx                     # We build our total here.
        xor     ebx, ebx                # Zero the (callee-saved) total.
        ## The diter sits right above our return address.  We have
        ## pushed four more qwords, so it starts at [rsp + 40].
        mov     r12, [rsp + SUM_DITER_ARG + DITER_STATE0]
        mov     r13, [rsp + SUM_DITER_ARG + DITER_STATE1]
        mov     r14, [rsp + SUM_DITER_ARG + DITER_NEXT]
        ## Enter the loop at the fetch, so an empty sequence adds
        ## nothing.
        jmp     .Lsd_fetch
        .align  8
.Lsd_add:
        add     rbx, [r9]               # The fetch succeeded: add the number.
.Lsd_fetch:
        call    r14                     # Invoke the iterator's get-next routine.
        test    r9, r9
        jnz     .Lsd_add                # Non-null result: go around again.
        mov     rax, rbx                # Exhausted: return the total.
        pop     rbx
        pop     r14
        pop     r13
        pop     r12
        ret

## Diter iterators use a private calling convention:
##   In/Out: r12, r13 = iterator state, updated in place
##   Out:    r9 = address of the next element, or 0 when the
##           sequence is exhausted
## Everything else is as normal; in particular an iterator may
## clobber non-call-preserved registers such as rax and rcx.

## diter_array_iterator: r12 = current position, r13 = end.
        .align  8
diter_array_iterator:
        cmp     r12, r13
        je      .Ldai_done
        mov     r9, r12                 # yield the current element
        add     r12, 8                  # advance to the next one
        ret
.Ldai_done:
        xor     r9d, r9d                # r9 = 0: nothing left
        ret

## diter_intlist_iterator: r12 = current node, or 0 at the end of
## the list.  r13 is unused.
        .align  8
diter_intlist_iterator:
        test    r12, r12
        je      .Ldii_done
        mov     r9, r12                 # the value is the node's first word
        mov     r12, [r12 + INTLIST_NEXT] # node = node->next
        ret
.Ldii_done:
        xor     r9d, r9d                # r9 = 0: nothing left
        ret