/* in arm-mode, M-; at the beginning of the buffer signals an error. */
/* here M-; inserts a C-style comment, which I guess is okay.  But
   M-q inside it very much is not. */
        .syntax unified         @ hey, at least M-; does the right thing
        .cpu cortex-m4          @ and there's no electric semicolon
        .thumb                  @ in arm-mode.  M-RET to go to the
                                @ next line would be nice; M-q does
                                @ work.  Oh, but apparently enter on a
                                @ line aligns it with the previous
                                @ one, which is pretty undesirable in
                                @ a hanging comment like this, or in a
                                @ non-comment line following it.  Or
                                @ in the C-style comment above.
        .thumb_func
trash:  movs r5, #127
        ldr r5, =127
        bl 2f
        movs r4, #128
        movs r5, #128
        ldr r5, =128
        movs r8, #128
        movs r5, #200
        ldr r5, =200
        movs r5, #255
        ldr r5, =255
        movw r5, #256
        ldr r5, =256
        movw r5, #1000
        ldr r5, =1000
        movw r5, #12345
        ldr r5, =12345
        ldr r5, =123456
        @ Recommended way instead on ARMv7:
        @ Same time and space usage as ldr plus a constant-pool
        @ element, but apparently there's some advantage I forget
        @ (presumably that it avoids the data load from the literal
        @ pool, and the need to keep a pool within ldr range).
        movw r5, #:lower16:123456
        movt r5, #:upper16:123456
        cmp r0, r1
        cmp r3, r4
        cmp r0, r5
        b 1f
        ldr r5, [sp, #4]
        ldr r5, [sp, #-4]
        str r5, [sp, #4]
        ldr r5, [sp, #8]
        str r5, [sp, #8]
        b 2f
        ldr r5, [sp, #12]
        ldr r5, [sp, #16]
        ldr r5, [sp, #20]
        ldr r5, [sp, #32]
        ldr r5, [sp, #64]
2:      ldr r5, [sp, #128]
        ldr r5, [sp, #256]
        ldr r5, [sp, #512]
        ldr r5, [sp, #1024]
        ldr r5, [sp, #1028]
1:      movs r5, #-1
        ldr r5, =-1
        mvn r5, #0
        mvns r5, #0
        mvn r5, #128
        mvn r5, #255
        mvn r5, #256
        mvn r5, #512
        @ Thumb-1 version
        movs r5, #0
        mvns.n r5, r5
        bx lr
        .ltorg

outline_of_paged_access:
        and r5, r7              @ limit index range
        bic r5, #3              @ align for word access
        lsr r0, r5, #8          @ isolate page bits
        @ Can't do this in Thumb, "shift out of range":
        @ldr r0, [r8, r0, lsl #10] @ load page pointer
        cbz r0, tramp_fall_pag_53 @ handle page fault if NULL
        lsl r5, #24             @ isolate index into page
        @ Can't do this in Thumb either, "Thumb supports only LSL in...indexing"
        @ldr r5, [r0, r5, lsr #24] @ load word from page
        bx lr
tramp_fall_pag_53:

        @ But Thumb-2 added bitfield extraction instructions!  So we
        @ can do much better than old ARM could:
better_paged_access:
        and r5, r7              @ as before, limit index range
        ubfx r0, r5, #10, #22   @ unsigned bitfield extract of 22 bits
        ldr r0, [r8, r0, lsl #2] @ load page pointer
        cbz r0, tramp_fall_pag_54
        ubfx r5, r5, #2, #8     @ implicitly discard problematic low bits
        ldr r5, [r0, r5, lsl #2] @ load desired word
tramp_fall_pag_54:
        @ That's only 6 instructions plus 2 memory references (one to
        @ the page table so probably fast): 9 cycles.
        b.w .

read_word.rd0:
        ubfx r5, r5, #0, #10    @ keep only a 10-bit word offset
        ldr r5, [r10, r5]       @ and load relative to the base in r10

trampoline:
        @ Callsite in r0, callee number in r1, subroutine table in r9.
        push {r0, lr}
        ldr r2, [r9, r1, lsl #2] @ load callee address
        cbz r2, missing_callee
        blx r2
        pop {r0, r2}            @ use r2 instead of lr to avoid .w
        ubfx r1, r0, #4, #12    @ extract caller subroutine number
        cbz r1, missing_caller
        bx r2
missing_callee: b.w not_implemented
missing_caller: b.w not_implemented

        .equiv caller_id, 53
        .equiv callsite_number, 4
        .equiv callee_id, 5353
call_via_trampoline:
        movw r0, #(caller_id << 4 | callsite_number)
        movw r1, callee_id
        bl trampoline
per_callee_trampoline:
        movw r1, callee_id
        b trampoline

        @ Hand-compiled stack-machine code; the @ comments name the
        @ virtual-machine ops being compiled.  r5 caches the top of
        @ stack and r4 points to the spill stack: stmia r4! pushes,
        @ ldmdb r4! pops.
hand_compiled_no_states:
        stmia r4!, {r5}         @ my.4
        ldr r5, [sp, #16]
        tst r5, r5              @ bz.9f
        ldmdb r4!, {r5}
        beq 9f
        stmia r4!, {r5}         @ my.4
        ldr r5, [sp, #16]
        add r5, #12             @ field.3
        bl.w paged_load_word
        str r5, [sp, #12]       @ setmy.4
        ldmdb r4!, {r5}
        stmia r4!, {r5}         @ my.4
        ldr r5, [sp, #16]
        bl.w paged_load_word    @ field.0
        stmia r4!, {r5}         @ my.5 (note: shorter!)
        ldr r5, [sp, #20]
        ldmdb r4!, {r1, r3}     @ bne.4f
        cmp r3, r5
        mov r5, r1
        bne 4f

hand_compiled_with_states:
        ldr r3, [sp, #16]       @ my.4
        cbz r3, 9f              @ bz.9f
        ldr r3, [sp, #16]       @ my.4
        add r3, #12             @ field.3
        bl.w paged_load_word_grown
        str r3, [sp, #12]       @ setmy.4
        ldr r3, [sp, #16]       @ my.4
        bl.w paged_load_word_grown @ field.0
        stmia r4!, {r5}         @ my.5
        mov r5, r3
        ldr r3, [sp, #20]
        cmp r5, r3              @ bne.4f
        ldmdb r4!, {r5}
        bne 4f
9:
4:

        .data
tlb_base: .fill 2*16*8          @ 2 ways * 16 sets * 8-byte {key, value} entries
        .text

sketch_of_paging_scheme:
        ldr r0, =tlb_base
        ubfx r2, r5, #2, #4     @ set index from the key in r5
        ldr r1, [r0, r2, lsl #3]
        cmp r1, r5
        bne not_in_first_way
        adds r0, #4
        ldr r11, [r0, r2, lsl #3]
        bx lr
        .ltorg

not_in_first_way:
        adds r0, #128           @ second way starts 16 sets * 8 bytes later
        ldr r1, [r0, r2, lsl #3]
        cmp r1, r5
        bne not_in_tlb
        adds r0, #4
        ldr r11, [r0, r2, lsl #3]
        bx lr

not_in_tlb:
        eor r1, r5, r5, ror #17 @ cheap hash of the key
        eor r1, r1, r1, ror #5
        adds r0, #128           @ 512-entry hash table is immediately after
1:      ubfx r1, r1, #0, #9
        ldr r2, [r0, r1, lsl #3]
        cbz r2, not_in_hash
        cmp r2, r5
        bne 2f
        adds r0, #4
        ldr r11, [r0, r1, lsl #3]
        bx lr
2:      adds r1, #1             @ linear probing; the ubfx above wraps the index
        b 1b
not_in_hash: b .

slimmed_load_descriptor:
        eor r1, r5, r5, ror #17 @ just as fast as a mov, so...
        ldr r0, =resident_hash
1:      ubfx r1, r1, #0, #9
        ldr r2, [r0, r1, lsl #3]
        cbz r2, 3f
        cmp r2, r5
        bne 2f
        adds r0, #4
        ldr r11, [r0, r1, lsl #3]
        bx lr
        .ltorg
2:      adds r1, #1
        b 1b
3:
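
        @ resident_hash isn't defined above; presumably it's just a
        @ 512-entry table of {key, value} word pairs, 8 bytes per
        @ entry to match the lsl #3 indexing and the adds r0, #4
        @ second-word fetch, with 0 marking an empty slot for the
        @ cbz.  A sketch of that layout:
        .data
resident_hash:
        .fill 512*8             @ 512 entries * 8 bytes, zero-initialized
        .text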