@  -*- mode: asm; asm-comment-char: ?@ -*-
        @ Manually written simulated JIT compiled dumb Fibonacci.
        @ Manually translated from the bytecode in hadixmv.S.
        @ The idea is that to write a JIT compiler that generates this,
        @ it will be useful to first be able to disassemble it,
        @ for which I need to write and test a disassembler.
        .section .text,"ax",%progbits
        .syntax unified
        .thumb
        .fpu fpv4-sp-d16
        .cpu cortex-m4
        
        .thumb_func
        .globl dumbfib
dumbfib:
        @ Takes one argument from the operand stack, which, as in
        @ hadixmv.S, has top-of-stack in r5 and operand stack pointer
        @ in r4.  Pushes it onto the machine stack, along with the
        @ return address, thus keeping the machine stack 8-byte
        @ aligned, as required by the ARM EABI.
        push {r5, lr}
        @ ldmdb r4!, {r5}  Thumb-2 instruction I’m having trouble with in QEMU
        @ Thumb-1 equivalent sequence:
        @ If we were on Cortex-M0 or M0+ we would probably have to do
        @ something like this instead:
        subs.n r4, #4
        ldr.n  r5, [r4]  @ 6825 for #0, 68a5 for #8, 6be5 for #60,
                         @ 681d for r3 ptr, 6805 for r0 ptr,
                         @ 6820 for r0 dest, 6827 for r7 dest.
        @ So I think this is 0b01101 woffset5 Rn3 Rd3.  Should probably look that up...

        @ There are no other local variables.  We can access our
        @ argument by indexing off sp.
        stmia r4!, {r5}
        ldr   r5, [sp]
        @ This can be compiled by code something like the following:
        @ local_even: movt r5, #0x9d00          @ sp-rel ldr instruction
        @             and  r3, #0xf             @ low 4 bits of input bytecodes
        @             add  r5, r5, r3, lsr #16  @ add them into the instruction
        @             ldrb r3, [r6], #1         @ read next bytecode from odd table
        @             ldr  pc, [r7, r3, lsl #2] @ jump to it
        @ local_odd:  add  r5, #0x9d00          @ lower part of r5 this time
        @             and  r3, #0xf             @ low 4 bits (local var index)
        @             add  r5, r3
        @             str  r5, [r4], #4         @ append r5 to machine-code program
        @             movs r5, #0
        @             ldrb r3, [r6], #1
        @             ldr  pc, [r8, r3, lsl #2] @ read next bytecode from even table
        @ local_even: movt r5, #0xc420          @ stmia r4!, {r5}
        @             add  r5, #0x9d00          @ sp-rel ldr instruction
        @             and  r3, #0xf
        @             add  r5, r3
        @             str  r5, [r4], #4         @ append r5 to machine-code program
        @             movs r5, #0
        @             ldrb r3, [r6], #1
        @             ldr  pc, [r8, r3, lsl #2] @ read next bytecode from even table
        @             

        @ Then we push the literal 2.
        stmia r4!, {r5}
        movs  r5, #2

        @ Then we branch if greater than or equal.
        ldmdb r4!, {r1, r3}
        cmp r3, r5
        mov r5, r1
        bge 1f

        @ Then we push the literal 1.
        stmia r4!, {r5}
        movs  r5, #1

        @ i_ret: the subroutine epilogue is just one 16-bit
        @ instruction so duplicating it is fine.  It needs to discard
        @ our local variable as well as returning.
        pop {r1, pc}

1:      @ i_my 0
        stmia r4!, {r5}
        ldr   r5, [sp]

        @ i_lit4 -1
        stmia r4!, {r5}
        @ <https://developer.arm.com/documentation/dui0204/j/writing-arm-assembly-language/loading-constants-into-registers/direct-loading-with-mov-and-mvn?lang=en>
        @ "You cannot use the 16-bit MVN instruction to load a
        @ constant."  So this ends up as mov.w r5, #-1
        ldr   r5, =-1

        @ i_add.  Note that gas compiles this instruction to an
        @ ldr r1, [r4, #-4]!
        ldmdb r4!, {r1}
        add   r5, r1

        @ i_call 0
        bl    dumbfib           @ in JIT-land this will use a trampoline

        @ i_my 0
        stmia r4!, {r5}
        ldr   r5, [sp]

        @ i_lit4 -2
        stmia r4!, {r5}
        ldr   r5, =-2

        @ i_add
        ldmdb r4!, {r1}
        add   r5, r1

        @ i_call 0
        bl    dumbfib

        @ i_add
        ldmdb r4!, {r1}
        add   r5, r1

        @ i_ret
        pop {r1, pc}

        @ The above amounts to 30 instructions and 0x4e = 78 bytes.
        @ The original bytecode is 16 bytes plus a 4-byte header.
        @ That's almost exactly a 4× blowup, which is probably about
        @ typical.

        @ So, the above code only uses these 15 Thumb-2 instructions:
        @ 1. push with low registers and lr
        @ 2. ldr.w Rd, [r4, #-4]!
        @ 3. stmia.n r4!, {r5}
        @ 4. ldr r5, [sp, #imm]
        @ 5. movs r5, #imm
        @ 6. ldmdb.w r4!, {low regs}
        @ 7. cmp lowreg, lowreg
        @ 8. mov lowreg, lowreg
        @ 9. bge.n
        @ 10. b.n (oops, now removed, but it'll be back)
        @ 11. mov.w r5, #somekindofimmediate
        @ 12. add lowreg, lowreg
        @ 13. bl.w
        @ 14. mvn.w r5, #imm
        @ 15. pop {low regs, pc}
        @ 
        @ The things in these sequences that aren't simple constants are:
        @ - the registers pushed and popped in the prologue and epilogue
        @ - the offset from sp
        @ - the immediate constants being loaded into r5
        @ - the bge and b jump offsets
        @ - the offset in bl
        @ 
        @ In more detail.  The page numbers refer to the ARM ARM from
        @ 01996, in which the Thumb instruction set runs from
        @ pp. 213–276/322.
        @ 1. push with low registers and lr
        @    105b8:       b520            push    {r5, lr}
        @    10606:       b570            push    {r4, r5, r6, lr}
        @    1061c:       b510            push    {r4, lr}
        @    p. 258/322.  0b1011_010 Rb register_list_8.  Same as pop
        @    except the R bit controls lr rather than pc.
        @ 15. pop {low regs, pc}
        @    10604:       bd02            pop     {r1, pc}
        @    10612:       bd70            pop     {r4, r5, r6, pc}
        @    10650:       bd10            pop     {r4, pc}
        @    p. 256/322.  0b1011_110 Rb register_list_8.  The R bit is
        @    1 if pc is included.  Otherwise bit 0 of register_list_8
        @    controls popping r0, bit 1 controls r1, etc.
        @ 2. ldr.w Rd, [r4, #-4]! (Thumb-2 only)
        @    105ba:       f854 5d04       ldr.w   r5, [r4, #-4]!
        @    105e0:       f854 1d04       ldr.w   r1, [r4, #-4]!
        @    Thumb-1 has only ldr Rd, [Rn, #woffset5]: p. 236/322,
        @    0b01101 woffset5 Rn3 Rd3 and ldr Rd, [Rn, Rm].  woffset5
        @    is shifted left by 2 to index by word and zero-extended
        @    to only handle positive offsets.  Also there are ldrb.n
        @    and ldrh.n with similar limitations, plus ldrsb.n and
        @    ldrsh.n without the immediate offset option.
        @ 2.5. ldr Rd, [pc, #imm] (not in original code but needed.)
        @    10608:       4c14            ldr     r4, [pc, #80]
        @       36:       4d02            ldr     r5, [pc, #8]    ; (40 <trash+0x40>)
        @    p. 238/322.  0b01001 Rn3 woffset8; woffset8 is shifted
        @    left by 2 to index by word.  This is denser
        @ 6. ldmdb.w r4!, {low regs} (Thumb-2 only)
        @    105c6:       e934 000a       ldmdb   r4!, {r1, r3}
        @    Thumb-1 has ldmia Rn! but no ldmdb or non-updating ldmia.
        @ 3. stmia.n r4!, {r5}
        @    105be:       c420            stmia   r4!, {r5}
        @    105d0:       c420            stmia   r4!, {r5}
        @    105ea:       c420            stmia   r4!, {r5}
        @    p. 262/322.  0b1100_0 Rn3 register_list_8.  Same as
        @    push/pop except no R bit.
        @ 4. ldr r5, [sp, #imm]
        @    105c0:       9d00            ldr     r5, [sp, #0]
        @    105ec:       9d00            ldr     r5, [sp, #0]
        @       46:       9d01            ldr     r5, [sp, #4]
        @       4e:       f85d 5c04       ldr.w   r5, [sp, #-4]
        @       48:       9d02            ldr     r5, [sp, #8]
        @       4a:       9d03            ldr     r5, [sp, #12]
        @       4c:       9d04            ldr     r5, [sp, #16]
        @       4e:       9d05            ldr     r5, [sp, #20]
        @       50:       9d08            ldr     r5, [sp, #32]
        @       52:       9d10            ldr     r5, [sp, #64]   ; 0x40
        @       54:       9d20            ldr     r5, [sp, #128]  ; 0x80
        @       56:       9d40            ldr     r5, [sp, #256]  ; 0x100
        @       58:       9d80            ldr     r5, [sp, #512]  ; 0x200
        @       5a:       f8dd 5400       ldr.w   r5, [sp, #1024] ; 0x400
        @     0b10011 Rn3 woffset8, same as the pc-relative version
	@ 4.5. str r5, [sp, #imm] (not in original but needed)
        @       48:       9501            str     r5, [sp, #4]
        @       4c:       9502            str     r5, [sp, #8]
        @    p. 266/322, str(3).  0b10010 Rn3 woffset8.  Same as the
        @    ldr opcode.
        @ 5. movs r5, #imm
        @    105c4:       2502            movs    r5, #2
        @    105d2:       2501            movs    r5, #1
        @    106c2:       2435            movs    r4, #53 ; 0x35
        @       18:       25ff            movs    r5, #255        ; 0xff
        @    I'd think those are Thumb-1 MOV(1).  p. 250/322. 0b00101
        @    Rd3 imm8.  But instead of 2502 2501 2435 we'd have 2d02
        @    2d01 2c35.  What is this masked instruction?  The
        @    disassembler interprets 2c80 as cmp r4, #128 rather than
        @    a mov, and p. 230/322 confirms that 0b00101 Rn3 imm8 is
        @    indeed that.  So I think this is just an error in the
        @    ARM ARM, and it should be 0b00100.
        @       1a:       f04f 05ff       mov.w   r5, #255        ; 0xff
        @       1e:       f240 1500       movw    r5, #256        ; 0x100
        @       22:       f44f 7580       mov.w   r5, #256        ; 0x100
        @       2e:       f243 0539       movw    r5, #12345      ; 0x3039
        @       32:       f243 0539       movw    r5, #12345      ; 0x3039
        @       72:       f05f 35ff       movs.w  r5, #4294967295 ; 0xffffffff
	@    Bigger constants than 16 bits need either movw/movt or
        @    pc-relative constant pools.
        @ 11. mov.w r5, #somekindofimmediate
        @    105dc:       f04f 35ff       mov.w   r5, #0xffffffff
        @ 7. cmp lowreg, lowreg
        @       40:       4288            cmp     r0, r1
        @       42:       42a3            cmp     r3, r4
        @       44:       42a8            cmp     r0, r5
        @    105ca:       42ab            cmp     r3, r5
        @    p. 231/322.  0b0100_0010_10 Rm3 Rn3, cmp between two low
        @    registers.  As usual the operands are backwards: Rn - Rm.
        @    There's also a form of cmp which can handle high
        @    registers in Thumb-1.
        @ 9. bge.n
        @    105ce:       da02            bge.n   105d6 <dumbfib+0x1e>
        @    (02 here is I think 2 2-byte units from 0x105ce+4=0x105d2, "pc")
        @    10616:       da01            bge.n   1061c <dumbfib_hand+0x8>
	@    p. 224/322.  0b1101 cond4 hoffset8. hoffset8 is shifted
        @    left by 1 bit and added to pc, which is 4 more than the
        @    instruction's address.
        @ 8. mov lowreg, lowreg
        @    1060a:       4605            mov     r5, r0
        @    10698:       4607            mov     r7, r0
        @    105cc:       460d            mov     r5, r1
        @    10658:       4618            mov     r0, r3
        @    10626:       4621            mov     r1, r4
        @    10610:       4628            mov     r0, r5
        @    p. 251/322.  0b0100_0110 H1b H2b Rm3 Rd3, move high
        @    registers, analogous to add below.  In ARMv4T it's
        @    UNPREDICTABLE if they aren't high registers.
        @ 10. b.n (oops, now removed from the code, but it'll be back)
        @    106de:       e7ef            b.n     106c0 <hashic>  (offset is ...ffe2)
        @    106a4:       e7fa            b.n     1069c <itersz+0x8>  (here, ...fff8)
        @       46:       e00d            b.n     64 <trash+0x64> (and here, ...001e)
	@       50:       e004            b.n     5c <trash+0x5c> (and here, ...000c)
        @    So it looks like 2-byte units from $+4 again?  with an 11-bit offset.
        @    p. 225/322.  0b11100 hoffset11.  Confirmed.
        @ 12. add lowreg, lowreg
        @    105e4:       440d            add     r5, r1
        @    1064e:       4423            add     r3, r4
	@    p. 217/322.  This is the Thumb-1 two-operand add, which
        @    is 0b0100_0100 H1b H2b Rm3 Rd3.  Rm3 and Rd3 are 3-bit
        @    register operands; H1b and H2b are bits that indicate
        @    whether they are high: H1b is 1 if Rd is high, and H2b is
        @    1 if Rm is high, exactly backwards from what you'd
        @    expect.  In theory you're not supposed to use it if
        @    they're both 0 (UNPREDICTABLE) at least in ARMv4T.
        @    Thumb-1 also has an 0b0001_100 add that takes three 3-bit
        @    registers, and a 3-bit-immediate add with two 3-bit
        @    registers, and an 8-bit-immediate add with one 3-bit
        @    register.
        @ 13. bl.w (in Thumb-1)
        @    105e6:       f7ff ffe7       bl      105b8 <dumbfib>
        @    1060c:       f7ff ffd4       bl      105b8 <dumbfib>
        @    10622:       f7ff fff7       bl      10614 <dumbfib_hand>
        @        6:       f000 f82b       bl      60 <trash+0x60>
	@    Again, 2-byte units from $+4.  There are 22 bits of
        @    offset in these instructions, plus one bit of
        @    zero-padding to the right.  p. 227/322: 0b1111 Hb
        @    hoffset11.  If H = 1 it sign-extends, shifts left 12
        @    bits, adds to pc ($+4), and sticks the result in lr.  If
        @    H = 0 it shifts left 1 bit, adds to LR, and branches and
        @    links.  Later versions of the ARM ARM only document the
        @    combined effect rather than the two separate instructions.
        @ 14. mvn.w r5, #imm (no narrow way to load a negative constant)
        @    105f0:       f06f 0501       mvn.w   r5, #1
        @       72:       f06f 0500       mvn.w   r5, #0
        @       76:       f07f 0500       mvns.w  r5, #0
        @ Other miscellaneous probably relevant instructions:
        @    106a2:       47b0            blx     r6
        @    106be:       4770            bx      lr
        @    106e0:       e92d 4230       stmdb   sp!, {r4, r5, r9, lr}
        @    106e0:       e92d 4230       stmdb   sp!, {r4, r5, r9, lr}
        @    10694:       e92d 41c0       stmdb   sp!, {r6, r7, r8, lr}
        @    106a6:       e8bd 81c0       ldmia.w sp!, {r6, r7, r8, pc}
        @    106ae:       f20f 010b       addw    r1, pc, #11
        @    106c8:       f20f 0107       addw    r1, pc, #7
        @    106da:       f2af 0047       subw    r0, pc, #71     ; 0x47
        @    1069c:       f817 0b01       ldrb.w  r0, [r7], #1
        @    there's an ldrb.n: 0b01111 offset5 Rn3 Rd3.

        @ Here's an EABI-compliant FFI stub we can call from C.
        .globl dumbfib_eabi
        .thumb_func
dumbfib_eabi:
        push {r4, r5, r6, lr}   @ The code above uses r4 and r5
        ldr  r4, =operand_stack
        mov  r5, r0             @ pass EABI argument on top of stack
        bl   dumbfib
        mov  r0, r5             @ take EABI return value from top of stack
        pop  {r4, r5, r6, pc}

        @ Here's a hand-written EABI version for comparison:
        .globl dumbfib_hand
        .thumb_func
dumbfib_hand:
        cmp    r0, #2
        bge    1f
        movs.n r0, #1           @ base case, return 1
        bx     lr
1:      push   {r4, lr}         @ r4 is callee-saved in EABI
        mov    r4, r0           @ save argument in r0
        subs   r0, #1
        bl     dumbfib_hand
        mov    r1, r4           @ again EABI's arg0 being return value bites!
        mov    r4, r0           @ save return value from first recursive call
        subs.n r0, r1, #2       @ arg for second call
        bl     dumbfib_hand
        add    r0, r4
        pop    {r4, pc}
        
        @ That's 14 instructions and 32 bytes.  Interestingly, it runs
        @ only about 25% faster than the dumb-simulated-JIT-compiled
        @ version above, at least in QEMU simulation.  32 bytes is 60%
        @ more than the 20-byte bytecode version.  It would be great
        @ if the bytecode slowdown in practice turned out to be only
        @ 33% like that, but I suspect that dumbfib is sort of the
        @ best case for the bytecode, and that the slowdown will also
        @ be worse on real hardware.  Also, code that's actually
        @ JIT-compiled has to use a trampoline to call and return, to
        @ ensure that the destination of the control transfer exists.
        @ Still, as a preliminary result, it's heartening.

        @ If I calculate nominal instruction timings for dumbfib_hand
        @ and dumbfib for the Cortex-M4F, dumbfib_hand comes out to:

        @ - in leaf cases: 4 instructions + half a mispredicted branch
        @   = 5.5 clocks
        @ - in non-leaf cases: 12 instructions + 4 memory accesses +
        @   half a mispredicted branch = 17.5 clocks
        @ - total: 23 clocks per unit of Fibonacci output, e.g., for
        @   F₃₀ = 1'346'269 we have almost 30'964'187 clocks.

        @ And dumbfib comes out to:

        @ - in leaf cases: 12 instructions + 9 memory accesses + half
        @   a mispredicted branch = 22.5 clocks
        @ - in non-leaf cases: 27 instructions + 19 memory accesses +
        @   half a mispredicted branch = 47.5 clocks
        @ - total: 70 clocks per unit of Fibonacci output.

        @ So probably 3× slowdown is more realistic than 25% slowdown.

        @ My comment about the arg0=rv thing above is that in an ABI
        @ which defines the return value to be in r3 instead of r0, we
        @ could have saved an instruction and, in a sense, a temp
        @ register:
        .globl dumbfib_hand_noeabi
        .thumb_func
dumbfib_hand_noeabi:
        cmp    r0, #2
        bge    1f
        movs.n r3, #1
        bx     lr
1:      push   {r4, lr}
        mov    r4, r0
        subs   r0, #1
        bl     dumbfib_hand_noeabi
        subs.n r0, r4, #2       @ no need to move the return value first;
        mov    r4, r3           @ we can do it after subtracting the saved arg
        bl     dumbfib_hand_noeabi
        add    r3, r4
        pop    {r4, pc}

        @ To test that you can use this stub that adapts it to the EABI:
        .globl dumbfib_noeabi_stub
        .thumb_func
dumbfib_noeabi_stub:
        push {r4, lr}
        bl dumbfib_hand_noeabi
        mov r0, r3              @ put return value where EABI wants it
        pop {r4, pc}

        .bss
operand_stack:
        .fill 1024