# Attempting to understand what GCC emits for kmregion_example.c.

        call    km_start
        testq   %rax, %rax       # if (!p) abort();
        movq    %rax, %rbp       # so the return from `km_start`, the kmregion pointer, is in %rbp
        je      .L21
        xorl    %ebx, %ebx       # initialize j to 0
        xorl    %r12d, %r12d     # initialize pointer `list` to NULL
        jmp     .L6
.L23:                            # 0(%rbp) is r->p
        movq    %rdx, 0(%rbp)    # update memory to reflect pointer in %rdx: uintptr_t p = r->p - n; if (p >= r->lwm) { r->p = p;
        movq    %rdx, %rax       # now q, our list node pointer, is %rax
.L8:                             # and if we jump here from the `km_slow_path_allocate` call, it’s the return value from there
        movl    %ebx, (%rax)     # q->i = j;
        addq    $1, %rbx         # for (size_t j = 0; j < 5000; j++) {
        movq    %r12, 8(%rax)    # q->next = list;
        cmpq    $5000, %rbx      # loop exit test
        je      .L22
        movq    %rax, %r12       # list = q;
.L6:
        movq    0(%rbp), %rdx    # mostly maintain r->p in %rdx within the loop: uintptr_t p = r->p - n;
        subq    $16, %rdx        # actually allocate the list node
        cmpq    8(%rbp), %rdx    # if (p >= r->lwm) {
        jae     .L23
        movl    $16, %esi        # return km_slow_path_allocate(r, n); // otherwise
        movq    %rbp, %rdi
        call    km_slow_path_allocate
        jmp     .L8
.L22:

# So the actual allocation fast path is movq 0(%rbp), %rdx; subq $16,
# %rdx; cmpq 8(%rbp), %rdx; jae .L23; ... .L23: movq %rdx, 0(%rbp);
# movq %rdx, %rax: six instructions, and probably five or six
# micro-ops; the cmp/jae pair surely gets fused, but the load implicit
# in the cmp instruction probably gets split out as a separate
# micro-op, and I’m not sure if the final mov disappears in a cloud of
# register renaming.

# Then the rest of the inner loop is another six instructions to
# actually initialize the list node.
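# Pieced together from the comments above, the C being compiled is
# presumably something like the sketch below.  The struct layouts are
# inferred from the field offsets (r->p at 0, r->lwm at 8; q->i at 0,
# q->next at 8), but the name km_alloc, the exact types, and the
# prototypes of km_start and km_slow_path_allocate are my guesses, not
# necessarily the real kmregion interface:

    #include <stdint.h>
    #include <stdlib.h>

    struct kmregion { uintptr_t p; uintptr_t lwm; };  /* 0(%rbp), 8(%rbp) */
    struct node { int i; struct node *next; };        /* 16 bytes: (%rax), 8(%rax) */

    struct kmregion *km_start(void);                  /* guessed prototypes */
    void *km_slow_path_allocate(struct kmregion *r, size_t n);

    /* The allocation fast path: bump the pointer down and compare it
       against the low-water mark. */
    static inline void *km_alloc(struct kmregion *r, size_t n)
    {
      uintptr_t p = r->p - n;              /* movq 0(%rbp), %rdx; subq $16, %rdx */
      if (p >= r->lwm) {                   /* cmpq 8(%rbp), %rdx; jae .L23 */
        r->p = p;                          /* movq %rdx, 0(%rbp) */
        return (void *)p;                  /* movq %rdx, %rax */
      }
      return km_slow_path_allocate(r, n);  /* movl $16, %esi; movq %rbp, %rdi; call */
    }

    int main(void)
    {
      struct kmregion *r = km_start();
      if (!r) abort();                     /* testq %rax, %rax; je .L21 */
      struct node *list = NULL;            /* %r12 */
      for (size_t j = 0; j < 5000; j++) {  /* %rbx */
        struct node *q = km_alloc(r, sizeof *q);
        q->i = j;                          /* movl %ebx, (%rax) */
        q->next = list;                    /* movq %r12, 8(%rax) */
        list = q;                          /* movq %rax, %r12 */
      }
      /* ... whatever the rest of the example does with list is not in
         the excerpt ... */
      return 0;
    }

# Evidently GCC inlined the allocation fast path into the loop body,
# which is why the listing interleaves the bump-and-compare with the
# stores that initialize the node; only the slow path survives as an
# out-of-line call.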
# I rewrote the allocator to avoid an underflow problem and concerns
# about conversion between pointers and integers, and got this:

        call    km_start
        testq   %rax, %rax
        movq    %rax, %rbp       # %rbp = kmregion pointer
        je      .L21
        xorl    %ebx, %ebx       # j = 0
        xorl    %r13d, %r13d     # list = NULL
        jmp     .L6
.L23:
        movq    8(%rbp), %rax    # load base pointer
        movq    %rdx, 0(%rbp)    # store allocation counter
        addq    %rdx, %rax       # offset base pointer with allocation counter to get address
.L8:
        movl    %ebx, (%rax)     # store j in list node
        addq    $1, %rbx         # increment j
        movq    %r13, 8(%rax)    # store list pointer into list node
        cmpq    $5000, %rbx      # loop counter termination test
        je      .L22
        movq    %rax, %r13       # point list pointer at new list node
.L6:
        movq    0(%rbp), %rcx    # load allocation counter
        leaq    -16(%rcx), %rdx  # decrement it
        cmpq    %rdx, %rcx       # check against its old value
        jae     .L23             # if it decreased, it didn't underflow
        movl    $16, %esi
        movq    %rbp, %rdi
        call    km_slow_path_allocate
        jmp     .L8
.L22:

# I think this would be more readable as follows, but this involves an
# additional unconditional jump in the inner loop and probably an
# initial missed branch prediction on jae .L23 (though the compiler
# surely can’t know that):

        call    km_start
        testq   %rax, %rax
        movq    %rax, %rbp       # %rbp = kmregion pointer
        je      .L21
        xorl    %ebx, %ebx       # j = 0
        xorl    %r13d, %r13d     # list = NULL
.L6:
        movq    0(%rbp), %rcx    # load allocation counter
        leaq    -16(%rcx), %rdx  # decrement it
        cmpq    %rdx, %rcx       # check against its old value
        jae     .L23             # if it decreased, it didn't underflow
        movl    $16, %esi
        movq    %rbp, %rdi
        call    km_slow_path_allocate
        jmp     .L8
.L23:
        movq    8(%rbp), %rax    # load base pointer
        movq    %rdx, 0(%rbp)    # store allocation counter
        addq    %rdx, %rax       # offset base pointer with allocation counter to get address
.L8:
        movl    %ebx, (%rax)     # store j in list node
        addq    $1, %rbx         # increment j
        movq    %r13, 8(%rax)    # store list pointer into list node
        cmpq    $5000, %rbx      # loop counter termination test
        je      .L22
        movq    %rax, %r13       # point list pointer at new list node
        jmp     .L6
.L22:

# So now the fast-path allocation is 7 instructions instead of 6: movq
# 0(%rbp), %rcx; leaq -16(%rcx), %rdx; cmpq %rdx, %rcx; jae .L23; movq
# 8(%rbp), %rax; movq %rdx, 0(%rbp); addq %rdx, %rax.  But the code
# still runs at exactly the same speed on my laptop.
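# For comparison, a C sketch of what the rewritten fast path
# presumably looks like, reconstructed from the comments above: the
# word at 0(%rbp) now holds a count of bytes still available rather
# than a pointer, and 8(%rbp) holds a base pointer, so the bounds
# check reduces to detecting unsigned wraparound.  The field name
# `base`, the name km_alloc, and the types are again my guesses:

    #include <stddef.h>

    struct kmregion { size_t p; char *base; };   /* counter at 0(%rbp), base at 8(%rbp) */

    void *km_slow_path_allocate(struct kmregion *r, size_t n);

    /* Fast path: r->p counts the bytes still available above r->base,
       so allocating just subtracts and checks that nothing wrapped. */
    static inline void *km_alloc(struct kmregion *r, size_t n)
    {
      size_t p = r->p - n;                 /* movq 0(%rbp), %rcx; leaq -16(%rcx), %rdx */
      if (p <= r->p) {                     /* cmpq %rdx, %rcx; jae .L23 */
        r->p = p;                          /* movq %rdx, 0(%rbp) */
        return r->base + p;                /* movq 8(%rbp), %rax; addq %rdx, %rax */
      }
      return km_slow_path_allocate(r, n);
    }

# The extra instruction relative to the pointer version is the addq
# that rebases the offset: the old fast path could hand back the
# decremented pointer with a single mov, while this one has to load
# the base pointer and add the offset to it.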