/* Dumb JIT compiler for ARM Thumb from bytecode; forked from nullcomplr.c. Now it can compile conditional jumps, local variable reads and writes, small constant pushes, and returns. The big thing missing now, for example to be able to compile fib, is calls. [arm] is the 716-page ARMv7-M architecture reference manual, ARM DDI 0403C_errata_v3 (ID021910), issue C with errata marked, February 02010 (thank you, Prabal Dutta): */ #include #include #include #include #include int jit_eabi_stub(void *native_code, ...); /**** assembler *****/ void assemble_halfword(char **buf, unsigned short h) { *(*buf)++ = h & 0xff; *(*buf)++ = h >> 8; } void assemble_pushn(char **buf, int regs) { assemble_halfword(buf, 0xb400 + regs); } void assemble_popn(char **buf, int regs) { assemble_halfword(buf, 0xbc00 + regs); } void assemble_movs8(char **buf, int reg, int imm8) { assemble_halfword(buf, 0x2000 + (reg << 8) + imm8); } void assemble_stmia_thumb1(char **buf, int reg, int regs) { /* [arm96] p. 262/322. 0b1100_0 Rn3 register_list_8. Same as push/pop except no R bit. */ assemble_halfword(buf, 0xc000 + (reg << 8) + regs); } void assemble_subs_imm8(char **buf, int reg, int imm8) { assemble_halfword(buf, 0x3800 + (imm8 << 8) + reg); } void assemble_ldr_reg_offset(char **buf, int dest_reg, int base_reg, int woffset5) { assemble_halfword(buf, 0x6800 + (woffset5 << 6) + (base_reg << 3) + dest_reg); } int only_one_bit_set(int v) { /* This clears the least significant bit. If that leaves nothing set, then only one bit was set. (v ^ (v - 1)) + 1 would be a value with only that least significant bit set. */ return !(v & (v - 1)); } int count_trailing_zeroes(unsigned bitmask) { int result = 0; for (;;) { bitmask >>= 1; if (!bitmask) break; result++; } return result; } /* Assemble a base-register-updating “ldmdb” with only one destination register. gas assembles `ldmdb r4!, {r1}` to a non-ldmdb instruction: 7a: f854 1d04 ldr.w r1, [r4, #-4]! As it turns out, that’s actually mandatory, and when I wasn’t observing that restriction, QEMU was giving me SIGILL. says: > Encoding T1 does not support a list containing only one register. If an LDMDB instruction with just one register `` in the list is assembled to Thumb, it is assembled to the equivalent `LDR ,[,#-4]{!}` instruction. [arm] also says this! I just hadn’t read it carefully. [arm] p.226/716 (A6-88) encoding T4 for LDR (Immediate). 0b1111_1000_0101 Rn4 Rt4 1 P1 U1 W1 imm8. U is 1 to add the immediate offset rather than subtracting, so the offset is sort of sign-magnitude. Rt is the target. W is 1 if Rn should be updated with the effective address (pre- or postincrement). P is 0 if that update should not be applied to the actual index (postincrement). */ void assemble_ldmdb_1(char **buf, int reg, int regs) { int target = count_trailing_zeroes(regs); assemble_halfword(buf, 0xf850 + reg); int P = 1, U = 0, W = 1; assemble_halfword(buf, (target << 12) + (1 << 11) + (P << 10) + (U << 9) + (W << 8) + 4); // For a while I was using this 2-instruction Thumb-1 sequence (I // hadn’t looked up the Thumb-2 sequence) but it sets the flags, // which is a fatal difference in some contexts: //assemble_subs_imm8(buf, reg, 4); /* subs reg, #4. */ //assemble_ldr_reg_offset(buf, target, reg, 0); /* ldr target, [reg, #0] */ } void assemble_ldmdb(char **buf, int reg, int regs) { /* Thumb2. [arm] p. 126/716 (A5-20) and XXX (A6-86). 0b1110100 op2 0 W1 L1 Rn4 regs16. This encoding is illegal for just one destination register (see below), so QEMU is giving me SIGILL in that case, so I need to encode that with ldr.w. */ if (only_one_bit_set(regs)) { assemble_ldmdb_1(buf, reg, regs); return; } int op2 = 2; /* 0x10, xxxdb, not xxxia */ int L1 = 1; /* 1: ldmxx, not stmxx */ int W1 = 1; /* update base register (“wback”) */ assemble_halfword(buf, 0xe800 + (op2 << 7) + (L1 << 5) + (W1 << 4) + reg); assemble_halfword(buf, regs); } /* ldr r5, [sp, #(4*woffset)], if reg=5 */ void assemble_ldr_sp_rel(char **buf, int reg, int woffset) { assemble_halfword(buf, 0x9800 + (reg << 8) + woffset); } // str r5, [sp, #(4*woffset)], if reg=5. [arm] p. 360/716 (A6-222), // STR (immediate), encoding T2 void assemble_str_sp_rel(char **buf, int reg, int woffset) { assemble_halfword(buf, 0x9000 + (reg << 8) + woffset); } /* mov dest_reg, src_reg, if both are low regs */ void assemble_mov_rr_thumb1(char **buf, int dest_reg, int src_reg) { /* [arm] p. 288/716 (A6-150), encoding T1 from ARMv6: 0b0100_0110 D1 Rm3 Rd3; D is prepended to the source register number to permit high register access */ assemble_halfword(buf, 0x4600 + (src_reg << 3) + dest_reg); } /* add dest_reg, src_reg, if both are low regs */ void assemble_add_rr_low(char **buf, int dest_reg, int src_reg) { assemble_halfword(buf, 0x4400 + (src_reg << 3) + dest_reg); } /* cmp rn, rm, if both are low regs */ void assemble_cmp_loregs(char **buf, int rm, int rn) { // [arm96] p. 231/322. 0b0100_0010_10 Rm3 Rn3, cmp between // two low registers. As usual the operands are backwards: Rn // - Rm. There’s also a form of cmp which can handle high // registers in Thumb-1. assemble_halfword(buf, 0x4280 + (rm << 3) + rn); } // Add some number of words to sp, to remove that many items from the // stack in a single cycle. void assemble_add_sp_4(char **buf, int n_words) { // [arm] p. 164/716 (A6-26), ADD (SP plus immediate), encoding T2, // 0b1011_0000_0 imm7. assemble_halfword(buf, 0xb000 + n_words); } // Thumb-2 mvn.w, [arm] p. 300/716 (A6-162), 0b1111_0 i1 0b00_011 S1 // 0b1111 imm3 Rd3 imm8. i:imm3:imm8 go through the usual 12-bit // Thumb immediate expansion; for this simple case, as explained in // table A5-11 on p. 121/716 (A5-15), we want i and imm3 to be 0 so // the imm8 will just be the imm8. S1 = 1 if we want to update the // flags, which in this case we don’t care about, so I’m leaving it 0. void assemble_mvn8(char **buf, int reg, int imm8) { assemble_halfword(buf, 0xf06f); assemble_halfword(buf, (reg << 8) + imm8); } /**** bytecode compiler *****/ enum { tos_register = 5, /* r5 contains the top of the operand stack */ ops_register = 4, /* r4 points at the rest of the operand stack */ }; void whine(int unknown_bytecode, int where) { fprintf(stderr, "Surprising bytecode %02x at %d\n", unknown_bytecode, where); } void compile_prologue(char **buf, int n_args) { assert(n_args >= 0 && n_args <= 5); // We pop all but one of the args from the operand stack in memory. // The last one is r5, if there are any. int pop_args = n_args - 1; if (pop_args < 0) pop_args = 0; int pop_args_mask = (1 << pop_args) - 1; if (pop_args_mask) { assemble_ldmdb(buf, ops_register, pop_args_mask); } // Now we must push those args on the return stack, along with lr. // If necessary, we also push r6 to avoid misaligning the stack and // violating the EABI. This happens for even numbers of args. // Instead of r6 we could use any register between r5 and lr. int push_bitmask = 1 << 8 | pop_args_mask; /* lr */ if (n_args) push_bitmask |= 1 << tos_register; /* r5 */ if (!(n_args & 1)) push_bitmask |= 1 << 6; /* r6 */ assemble_pushn(buf, push_bitmask); // If we pushed r5, we also need to pop a new r5 value from the // operand stack. if (n_args) { assemble_ldmdb(buf, 4, 1 << tos_register); } } void compile_epilogue(char **buf, int n_args) { // It’s faster to just increment the SP past most of the args than // to re-pop them. We do have to re-pop the final one to avoid // having the stack temporarily misaligned. Here we always // increment the SP by an *even* number of words to avoid // misalignment; we might have pushed one extra on entry. // 1 -> 2, 2 -> 2, 3 -> 4, 4 -> 4 int pop_args = (1 | n_args) - 1; if (pop_args) { assemble_add_sp_4(buf, pop_args); } // In the return instruction we must pop one more arg to discard, // which we can safely just toss into r0. assemble_popn(buf, 1 << 8 | 1); /* only pop pc and r0 */ } int compile_bytecode(char *outbuf, const char *in, size_t len, int n_args) { char *native_ptr[len]; /* native code for in[i] starts at native_ptr[i] */ struct reloc { char *insn; int target; } jumps[len]; int n_jumps = 0; char *p = outbuf; compile_prologue(&p, n_args); for (size_t i = 0; i < len; i++) { native_ptr[i] = p; char b = in[i]; switch(b >> 4) { case 0: /* bytecodes without arguments */ switch(b) { case 1: /* bge */ ; int target = (signed char)in[i+1] + i + 2; i++; assemble_ldmdb(&p, ops_register, 1 << 3); /* ldmdb r4!, {r3} */ assemble_cmp_loregs(&p, 3, tos_register); /* cmp r5, r3 */ assemble_ldmdb(&p, ops_register, 1 << tos_register); /* ldmdb r4!, {r5} */ jumps[n_jumps].insn = p; jumps[n_jumps].target = target; n_jumps++; assemble_halfword(&p, 0xda00); /* bge.n with offset 0 */ break; case 2: /* ret */ compile_epilogue(&p, n_args); break; case 3: /* add */ assemble_ldmdb(&p, ops_register, 1 << 1); /* ldmdb r4!, {r1} */ assemble_add_rr_low(&p, tos_register, 1); /* add r5, r1 */ break; default: whine(b, i); break; } break; case 1: /* my: read a local */ assemble_stmia_thumb1(&p, ops_register, 1 << tos_register); /* stmia r4!, {r5} */ assemble_ldr_sp_rel(&p, tos_register, b & 0xf); /* ldr r5, [sp, #(4*n)] */ break; case 4: /* setmy: write a local */ assemble_str_sp_rel(&p, tos_register, b & 0xf); assemble_ldmdb(&p, ops_register, 1 << tos_register); break; case 2: /* lit4 */ /* stmia r4!, {r5} */ assemble_stmia_thumb1(&p, ops_register, 1 << tos_register); int val = b & 0xf; /* Arguably this conditional should be down in assemble_movs8. */ if (val & 8) { /* bit-invert nibble */ assemble_mvn8(&p, tos_register, val ^ 0xf); break; } assemble_movs8(&p, tos_register, val); /* set r5 to val */ break; default: whine(b, i); break; } } /* Link jumps to their destinations */ for (int i = 0; i < n_jumps; i++) { char *jump = jumps[i].insn; char *origin = jump + 4; /* Thumb offset figured from here */ int bytecode_target = jumps[i].target; assert(0 <= bytecode_target); assert(bytecode_target < len); int diff = native_ptr[bytecode_target] - origin; assert(diff % 2 == 0); assert(-256 <= diff); assert(diff < 254); *jump = (char)(diff/2); } int n = p - outbuf; assemble_halfword(&p, 0xdead); assemble_halfword(&p, 0xbeef); return n; } int debug_dump_bytecode(char *codebuf, size_t n) { /* Write it to a file so I can run a disassembler on it. As in: arm-linux-gnueabi-objdump -b binary -D -Mforce-thumb -mcortex-m4 dumbjit.out.bin */ char *outfile = "dumbjit.out.bin"; int fd = open(outfile, O_WRONLY | O_CREAT | O_TRUNC, 0666); if (fd < 0) { perror(outfile); return 2; } if (n != write(fd, codebuf, n)) { perror("write"); return 3; } if (0 != close(fd)) { perror("close"); return 4; } return 0; } /**** compiler tests *****/ /* Subroutine that just returns 3; our first objective is to compile this to working Thumb-2 machine code */ char return_3_bytecode[] = "\x23\x02"; /* This one returns -3. */ char return_neg3_bytecode[] = "\x2d\x02"; /* This is a one-argument subroutine that returns its argument (but can be compiled to take more arguments) */ char return_arg1_bytecode[] = "\x10\02"; /* This is a two-argument subroutine that returns its second argument */ char return_arg2_bytecode[] = "\x11\02"; /* This one adds its two arguments. */ char add_args_bytecode[] = "\x10\x11\x03\x02"; /* This one returns the greater of its two arguments. */ char max_bytecode[] = "\x10\x11\x01\x02\x10\x02\x11\x02"; /* Triangular number plus second argument. Loops when 1 >= arg0. */ char tri_bytecode[] = "\x10\x11\x03\x41\x10\x2f\x03\x40\x21\x10\x01\xf4\x11\x02"; /* Fibonacci subroutine from hadixmv.S; our second objective */ char fib_bytecode[] = "\x10\x22\x01\x02\x21\x02\x10\x2f\x03\x30\x10\x2e\x03\x30\x03\x02"; /* Regression test case definition */ typedef struct { char *name; /* name to display for the case */ char *code; /* pointer to bytecode to compile */ int size; /* size of bytecode to compile */ int n_args; /* number of args to compile for */ int args[3]; /* jit_eabi_stub supports up to 3 so far */ int expected_result; /* when called with those args */ } test_case; test_case tests[] = { #define make_case(name, varname, n_args, expected, args...) \ {#name, varname ## _bytecode, sizeof varname ## _bytecode - 1, n_args, {args}, expected} make_case(return 3, return_3, 0, 3), make_case(return 3 with arg, return_3, 1, 3, 53), make_case(return 3 with 2 args, return_3, 2, 3, 53, 33), make_case(return 3 with 3 args, return_3, 3, 3, 53, 33, 13), make_case(return -3, return_neg3, 0, -3), // These argument lists are in reverse order because jit_eabi_stub // doesn’t compensate for that. make_case(return arg1, return_arg1, 1, 53, 53), make_case(return arg1 2 args, return_arg1, 2, 33, 13, 33), make_case(return arg1 3 args, return_arg1, 3, 63, 23, 43, 63), make_case(return arg2 2 args, return_arg2, 2, 13, 13, 33), make_case(return arg2 3 args, return_arg2, 3, 43, 23, 43, 63), make_case(addition, add_args, 2, 7, 3, 4), make_case(addition II, add_args, 2,5353+53,53,5353), make_case(max, max, 2, 53, 53, 33), make_case(max II, max, 2, 53, 33, 53), make_case(tri 1, tri, 2, 1, 0, 1), make_case(tri 2, tri, 2, 3, 0, 2), make_case(tri 3, tri, 2, 6, 0, 3), make_case(tri 4, tri, 2, 10, 0, 4), #undef make_case }; int run_tests(char *codebuf) { int n_tests = sizeof(tests)/sizeof(tests[0]); printf("Running %d tests:\n", n_tests); for (size_t i = 0; i < n_tests; i++) { test_case *t = &tests[i]; printf(" %s:", t->name); fflush(stdout); int n = compile_bytecode(codebuf, t->code, t->size, t->n_args); for (int j = 0; j < n/2; j++) { if (!(j & 7)) printf("\n "); printf("%04x ", ((unsigned short*)codebuf)[j]); } printf("\n"); /* This will make the tests slower, but makes it much easier to disassemble the broken code when a test fails. */ int err = debug_dump_bytecode(codebuf, n); if (err) return err; // +1 for Thumb int result = jit_eabi_stub(codebuf+1, t->args[0], t->args[1], t->args[2]); // More commonly, compiler bugs and missing features cause a crash. if (result != t->expected_result) { printf("FAIL: got %d (0x%x), expected %d (0x%x)\n", result, result, t->expected_result, t->expected_result); return 1; } printf("ok (%d)\n", result); } printf("All OK!\n"); return 0; } int main() { char *codebuf = mmap(0, /* no requested address */ 4096, /* requested size */ PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, /* MAP_ANONYMOUS portability requires fd -1 */ 0 /* offset is meaningless with no fd */ ); printf("codebuf = %lx\n", (long)codebuf); if (!codebuf) return 1; int result = run_tests(codebuf), n; /* Seems like it takes about 15ns per triangular-number iteration via qemu-user on this Ryzen 5 3500U because these billion loops take 14-15 seconds, versus 450ms for the native amd64 assembly implementation in trin.S, so that’s a slowdown of about 30 between ARM emulation and the JIT-compiler dumbness. The inner loop being tested is 24 emulated ARM Thumb-2 instructions (vs. 3 for trin.S). trinarm.S is an ARM Thumb-2 version and takes 4.6 seconds, so dumbjit is responsible for a slowdown of about π, though QEMU’s performance characteristics may not be a good approximation of a Cortex-M4’s. In SBCL the slowdown vs. native amd64 assembly is about 7.5: * (time (loop for i from 1 to 100000 do (loop for j from 1 to 10000 summing j))) Evaluation took: 3.360 seconds of real time 3.362771 seconds of total run time (3.362771 user, 0.000000 system) 100.09% CPU 7,048,616,274 processor cycles 0 bytes consed In Racket it’s closer to 6.5: > (time (let ((n #f)) (for ([i 100000]) (set! n (for/sum ([j 10001]) j))) n)) cpu time: 2874 real time: 2875 gc time: 0 50005000 On a Rockchip RK3399 dumbjit takes 37 seconds on the LITTLE cores (Cortex-A53?) and 16 seconds on the big cores (Cortex-A72?), but I don’t know at what clock frequency in either case, probably between 400 MHz and 1.4 GHz in the second case. At “max performance” (a06-gearbox -s 6) it takes 9-10 seconds. Aha, at 1008MHz on the A53 cores it takes 30-31.5 seconds. At 1418MHz it’s 21.4-23.3 seconds on the LITTLE cores. My Ryzen laptop normally runs at 2.4 GHz when loaded, so 15ns is about 36 clock cycles. 10ns at 1.4 GHz is about 14 clock cycles. The Cortex-A72 is supposedly capable of dispatching five instructions per cycle, supposedly roughly comparable to the Ryzen; so this amounts to an emulation slowdown of about 2–3. Interestingly, this program is missing an essential cache-flushing step: */ /* for (size_t i = 0; i < 100000; i++) { */ /* n = jit_eabi_stub(codebuf+1, 0, 10000); */ /* } */ /* printf("%d\n", n); */ return result; }