/* in arm-mode, M-; at the beginning of the buffer signals an error. */
/* here M-; inserts a C-style comment, which I guess is okay.  But
   M-q inside it very much is not. */
        .syntax unified         @ hey, at least M-; does the right thing
        .cpu cortex-m4          @ and there's no electric semicolon
        .thumb                  @ in arm-mode.  M-RET to go to the
                                @ next line would be nice; M-q does
                                @ work.  Oh, but apparently enter on a
                                @ line aligns it with the previous
                                @ one, which is pretty undesirable in
                                @ a hanging comment like this, or in a
                                @ non-comment line following it.  Or
                                @ in the C-style comment above.
        .thumb_func
trash:  movs r5, #127
        ldr r5, =127
        bl 2f
        movs r4, #128
        movs r5, #128
        ldr r5, =128
        movs r8, #128
        movs r5, #200
        ldr r5, =200
        movs r5, #255
        ldr r5, =255
        movw r5, #256
        ldr r5, =256
        movw r5, #1000
        ldr r5, =1000
        movw r5, #12345
        ldr r5, =12345
        ldr r5, =123456
        @ Recommended way instead on ARMv7:
        @ Same time and space usage as ldr plus a constant-pool
        @ element, but apparently there's some advantage I forget
        @ (presumably that it avoids the data load from the literal
        @ pool, and the need to keep a pool within ldr range).
        movw r5, #:lower16:123456
        movt r5, #:upper16:123456
        cmp r0, r1
        cmp r3, r4
        cmp r0, r5
        b 1f
        ldr r5, [sp, #4]
        ldr r5, [sp, #-4]
        str r5, [sp, #4]
        ldr r5, [sp, #8]
        str r5, [sp, #8]
        b 2f
        ldr r5, [sp, #12]
        ldr r5, [sp, #16]
        ldr r5, [sp, #20]
        ldr r5, [sp, #32]
        ldr r5, [sp, #64]
2:      ldr r5, [sp, #128]
        ldr r5, [sp, #256]
        ldr r5, [sp, #512]
        ldr r5, [sp, #1024]
        ldr r5, [sp, #1028]
1:      movs r5, #-1
        ldr r5, =-1
        mvn r5, #0
        mvns r5, #0
        mvn r5, #128
        mvn r5, #255
        mvn r5, #256
        mvn r5, #512
        @ Thumb-1 version
        movs r5, #0
        mvns.n r5, r5
        bx lr
        .ltorg

outline_of_paged_access:
        and r5, r7              @ limit index range
        bic r5, #3              @ align for word access
        lsr r0, r5, #8          @ isolate page bits
        @ Can't do this in Thumb, "shift out of range":
        @ldr r0, [r8, r0, lsl #10] @ load page pointer
        cbz r0, tramp_fall_pag_53 @ handle page fault if NULL
        lsl r5, #24             @ isolate index into page
        @ Can't do this in Thumb either, "Thumb supports only LSL in...indexing"
        @ldr r5, [r0, r5, lsr #24] @ load word from page
        bx lr
tramp_fall_pag_53:

        @ But Thumb-2 added bitfield extraction instructions!  So we
        @ can do much better than old ARM could:
better_paged_access:
        and r5, r7              @ as before, limit index range
        ubfx r0, r5, #10, #22   @ unsigned bitfield extract of 22 bits
        ldr r0, [r8, r0, lsl #2] @ load page pointer
        cbz r0, tramp_fall_pag_54
        ubfx r5, r5, #2, #8     @ implicitly discard problematic low bits
        ldr r5, [r0, r5, lsl #2] @ load desired word
tramp_fall_pag_54:
        @ That's only 6 instructions plus 2 memory references (one to
        @ the page table so probably fast): 9 cycles.
        b.w .

read_word.rd0:
        ubfx r5, r5, #0, #10    @ keep only a 10-bit word offset
        ldr r5, [r10, r5]       @ and load relative to the base in r10

trampoline:
        @ Callsite in r0, callee number in r1, subroutine table in r9.
        push {r0, lr}
        ldr r2, [r9, r1, lsl #2] @ load callee address
        cbz r2, missing_callee
        blx r2
        pop {r0, r2}            @ use r2 instead of lr to avoid .w
        ubfx r1, r0, #4, #12    @ extract caller subroutine number
        cbz r1, missing_caller
        bx r2
missing_callee: b.w not_implemented
missing_caller: b.w not_implemented

        .equiv caller_id, 53
        .equiv callsite_number, 4
        .equiv callee_id, 5353
call_via_trampoline:
        movw r0, #(caller_id << 4 | callsite_number)
        movw r1, callee_id
        bl trampoline
per_callee_trampoline:
        movw r1, callee_id
        b trampoline

        @ Hand-compiled stack-machine code; the @ comments name the
        @ virtual-machine ops being compiled.  r5 caches the top of
        @ stack and r4 points to the spill stack: stmia r4! pushes,
        @ ldmdb r4! pops.
hand_compiled_no_states:
        stmia r4!, {r5}         @ my.4
        ldr r5, [sp, #16]
        tst r5, r5              @ bz.9f
        ldmdb r4!, {r5}
        beq 9f
        stmia r4!, {r5}         @ my.4
        ldr r5, [sp, #16]
        add r5, #12             @ field.3
        bl.w paged_load_word
        str r5, [sp, #12]       @ setmy.4
        ldmdb r4!, {r5}
        stmia r4!, {r5}         @ my.4
        ldr r5, [sp, #16]
        bl.w paged_load_word    @ field.0
        stmia r4!, {r5}         @ my.5 (note: shorter!)
        ldr r5, [sp, #20]
        ldmdb r4!, {r1, r3}     @ bne.4f
        cmp r3, r5
        mov r5, r1
        bne 4f

hand_compiled_with_states:
        ldr r3, [sp, #16]       @ my.4
        cbz r3, 9f              @ bz.9f
        ldr r3, [sp, #16]       @ my.4
        add r3, #12             @ field.3
        bl.w paged_load_word_grown
        str r3, [sp, #12]       @ setmy.4
        ldr r3, [sp, #16]       @ my.4
        bl.w paged_load_word_grown @ field.0
        stmia r4!, {r5}         @ my.5
        mov r5, r3
        ldr r3, [sp, #20]
        cmp r5, r3              @ bne.4f
        ldmdb r4!, {r5}
        bne 4f
9:
4:

        .data
tlb_base: .fill 2*16*8          @ 2 ways * 16 sets * 8-byte {key, value} entries
        .text

sketch_of_paging_scheme:
        ldr r0, =tlb_base
        ubfx r2, r5, #2, #4     @ set index from the key in r5
        ldr r1, [r0, r2, lsl #3]
        cmp r1, r5
        bne not_in_first_way
        adds r0, #4
        ldr r11, [r0, r2, lsl #3]
        bx lr
        .ltorg

not_in_first_way:
        adds r0, #128           @ second way starts 16 sets * 8 bytes later
        ldr r1, [r0, r2, lsl #3]
        cmp r1, r5
        bne not_in_tlb
        adds r0, #4
        ldr r11, [r0, r2, lsl #3]
        bx lr

not_in_tlb:
        eor r1, r5, r5, ror #17 @ cheap hash of the key
        eor r1, r1, r1, ror #5
        adds r0, #128           @ 512-entry hash table is immediately after
1:      ubfx r1, r1, #0, #9
        ldr r2, [r0, r1, lsl #3]
        cbz r2, not_in_hash
        cmp r2, r5
        bne 2f
        adds r0, #4
        ldr r11, [r0, r1, lsl #3]
        bx lr
2:      adds r1, #1             @ linear probing; the ubfx above wraps the index
        b 1b
not_in_hash: b .

slimmed_load_descriptor:
        eor r1, r5, r5, ror #17 @ just as fast as a mov, so...
        ldr r0, =resident_hash
1:      ubfx r1, r1, #0, #9
        ldr r2, [r0, r1, lsl #3]
        cbz r2, 3f
        cmp r2, r5
        bne 2f
        adds r0, #4
        ldr r11, [r0, r1, lsl #3]
        bx lr
        .ltorg
2:      adds r1, #1
        b 1b
3:
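
        @ resident_hash isn't defined above; presumably it's just a
        @ 512-entry table of {key, value} word pairs, 8 bytes per
        @ entry to match the lsl #3 indexing and the adds r0, #4
        @ second-word fetch, with 0 marking an empty slot for the
        @ cbz.  A sketch of that layout:
        .data
resident_hash:
        .fill 512*8             @ 512 entries * 8 bytes, zero-initialized
        .text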