@@ Semi-FP-persistent priority queue for recursive-subdivision SDF
@@ raymarching.  Totally untested.  172 bytes of code, 64
@@ instructions.  EABI-compliant?  See also monton.c.  It
@@ supports four operations once the queue is built:
@@
@@ - peek(): (priority, value), returns the element with the currently
@@   lowest priority (without removing it);
@@ - kick(priority): (), changes the priority of that lowest-priority
@@   element (usually to something higher);
@@ - save(): (), snapshots the state of the priority queue on
@@   a snapshot stack;
@@ - restore(): (), restores and removes the latest snapshot
@@   on the snapshot stack, thus undoing all the kick()
@@   operations since the corresponding save().
@@
@@ To provide them efficiently, this is a binary heap with
@@ explicit pointers for the child relationships, with its
@@ memory managed via an arena allocator.  There are two
@@ main record types, each four words in size, which are packed
@@ into this arena:
@@
@@     struct node { struct node *left, *right; int k; void *v; };
@@     struct snap { char *β; struct node *root; struct snap *parent, *x; };
@@
@@ save() appends a new snap to the arena, and then a new copy
@@ of the root node.  β is the current allocation pointer,
@@ which is incremented by 16 after every allocation.  Nodes
@@ to the right of the current snapshot (at higher addresses)
@@ are part of that snapshot and thus mutable (maybe “snapshot”
@@ is a bad term), while nodes to its left are immutable.  The
@@ handle to the whole heap is a struct snap**, but within
@@ kick() and its descendants, the pointer to the current snap
@@ is always kept in r10.
@@
@@ The current snapshot always contains its own copy of at
@@ least the root node.
@@
@@ Instead of null pointers, the child pointers at the bottom
@@ tier of the tree all point at a single sentinel node whose
@@ priority is bigger than that of any possible real node.
@@
@@ I say “semi-FP-persistent” because snapshotting is explicit
@@ rather than implicit, and you can only update the latest
@@ live snapshot rather than any snapshot, and the
@@ implementation takes advantage of this to update in place.
@@ Under the covers, though, those are mostly just questions
@@ of the allocation strategy.

        .syntax unified
        .thumb
        .cpu cortex-m4
        .text

@@ peek: Returns the lowest priority in the heap and its associated value.
@@ On entry: struct snap** in r0.
@@ On exit: priority in r0, associated value in r1.  (I think
@@ EABI passes a pointer for struct return in r0 instead.)

        .globl peek
        .thumb_func
peek:   ldr r0, [r0]            @ fetch snap pointer
        ldr r0, [r0, #4]        @ fetch root pointer
        ldr r1, [r0, #12]       @ fetch v from root of heap
        ldr r0, [r0, #8]        @ fetch k from root of heap
        bx lr

@@ That’s 5 instructions, 1 jump, and 4 memory accesses, so
@@ nominally like 12 cycles.

@@ restore: Restores the latest heap snapshot.
@@ On entry: struct snap** in r0.

        .globl restore
        .thumb_func
restore:
        ldr r1, [r0]            @ fetch snap pointer
        ldr r1, [r1, #8]        @ fetch parent pointer
        str r1, [r0]            @ overwrite old snap pointer with it
        bx lr

@@ That’s 4 instructions, 1 jump, and 3 memory accesses, so
@@ nominally like 10 cycles.
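@@ Before the assembly for save and its helper steal, here is
@@ roughly what the two of them are meant to do, written out as C.
@@ This is an untested sketch for exposition only: c_save and
@@ c_steal are made-up names (the real entry points are save and
@@ steal below), and β is spelled beta so it stays a portable
@@ identifier.
@@
@@     struct node { struct node *left, *right; int k; void *v; };
@@     struct snap { char *beta; struct node *root;
@@                   struct snap *parent, *x; };
@@
@@     static struct node *c_steal(struct node *n, struct snap *s)
@@     {
@@         struct node *copy = (struct node *)s->beta; /* bump-allocate 16 bytes */
@@         *copy = *n;                                 /* copy all four fields */
@@         s->beta += sizeof *copy;
@@         return copy;
@@     }
@@
@@     static void c_save(struct snap **handle)
@@     {
@@         struct snap *old = *handle;
@@         struct snap *fresh = (struct snap *)old->beta; /* new snap at old β */
@@         fresh->beta = old->beta + sizeof *fresh;    /* allocation resumes past it */
@@         fresh->parent = old;                        /* snapshot-stack link */
@@         fresh->root = c_steal(old->root, fresh);    /* private copy of the root */
@@         /* fresh->x never seems to be read; the asm stores whatever is in r6 */
@@         *handle = fresh;                            /* handle now names the new snap */
@@     }
@@
@@ The write back through the handle at the end matters: restore()
@@ and peek() both start from *handle, so save has to leave it
@@ pointing at the snap it just made.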
@@ save: Creates a new heap snapshot.
@@ On entry: struct snap** in r0.

        .globl save
        .thumb_func
save:   push {r4, r5, r10, lr}
        ldr r5, [r0]            @ fetch snap pointer into r5
        ldr r10, [r5]           @ fetch allocation pointer (steal gets r10)
        add r3, r10, #16        @ compute allocation pointer for new snap;
        mov r4, r3              @ root node will be there once we copy it.
        mov r2, r10             @ new snap goes at old β; copy r10, steal needs it
        stmia r2!, {r3-r6}      @ construct new snap (update r2 just bcz tiny)
        str r10, [r0]           @ point the handle at the new snap
        ldr r0, [r5, #4]        @ load pointer to root of old snap we will copy
        bl steal                @ make a “stolen” local copy of it
        pop {r4, r5, r10, pc}

@@ That’s 11 instructions, 2 jumps, 16 memory accesses, and a
@@ call to steal, so nominally like 57 cycles.

@@ steal: Copies a node into the current snapshot so we can
@@ start mutating it.  The caller is responsible for updating
@@ any pointers to that node.
@@ On entry: struct node* in r0, struct snap* in r10.
@@ On exit: copied struct node* in r0.

        .globl steal
        .thumb_func
steal:  push {r4, lr}           @ save r4
        ldmia r0, {r1-r4}       @ load node’s current fields into r1-r4
        ldr r0, [r10]           @ load allocation pointer from snap
        stmia r0!, {r1-r4}      @ copy node’s fields into new space
        str r0, [r10]           @ store updated allocation pointer in snap
        sub r0, r0, #16         @ back r0 up so it points at the copy
        pop {r4, pc}            @ restore r4 and return

@@ That’s 7 instructions, 1 jump, and 14 memory accesses, so
@@ nominally like 24 cycles.

@@ kick: Updates the priority of the lowest-priority node in the heap.
@@ On entry: struct snap** in r0, new priority in r1.

        .globl kick
        .thumb_func
kick:   push {r10, lr}          @ expropriate r10 (sl)
        ldr r10, [r0]           @ fetch snap pointer (in r10 for siftdown)
        ldr r0, [r10, #4]       @ load root node pointer
        str r1, [r0, #8]        @ update its priority with the new priority
        bl siftdown
        pop {r10, pc}           @ XXX can probably get rid of these two insns

@@ The entry-exit overhead there is 6 instructions, 7 memory
@@ accesses, and 2 jumps, which works out to nominally like 19
@@ cycles, but the real work is in the callee `siftdown`.

@@ siftdown: Sifts a node down until the heap property holds below it again.
@@ On entry: struct node* in r0, current snap pointer in r10.
@@ r0 must point to a *mutable* node, one in the current snap.

        .globl siftdown
        .thumb_func
siftdown:
        push {r4-r10, lr}       @ Save callee-saved r4-r10; r10 only for align
        mov r6, r0              @ Within main loop, r6 points at parent node
1:      ldmia r6, {r2-r5}       @ left r2, right r3, k r4, v r5
        ldr r7, [r2, #8]        @ left priority (left.k) in r7
        ldr r8, [r3, #8]        @ right priority (right.k) in r8
        cmp r4, r7              @ is parent.k < left.k? if not,
        bge 4f                  @ go figure out which kid we swap with;
        cmp r4, r8              @ is parent.k < right.k? if not,
        bge 2f                  @ we know right.k < left.k, so kid ← right;
        pop {r4-r10, pc}        @ or if neither kid is lower than k, we’re done.
4:      cmp r7, r8              @ Find out which kid is the lower of the two.
        bge 2f                  @ If right is lower, skip to the end.
        mov r9, r2              @ kid ← left
        cmp r2, r10             @ uh but can we mutate kid? is it mutable?
        bhi 5f                  @ If it’s part of a previous snapshot,
        mov r0, r2              @ steal it (invoke steal with left)
        bl steal
        str r0, [r6]            @ and store pointer to copy in parent.left,
        mov r9, r0              @ and set kid to that return value.
5:      ldr r0, [r9, #12]       @ Now load kid.v (we already have kid.k in r7)
        str r4, [r9, #8]        @ and overwrite kid.k with parent.k
        str r5, [r9, #12]       @ and kid.v with parent.v,
        str r7, [r6, #8]        @ then overwrite parent.k with kid.k
        str r0, [r6, #12]       @ and parent.v with kid.v,
        mov r6, r9              @ then parent ← kid and
        b 1b                    @ go back to the top of the main loop!
2:      mov r9, r3              @ This is the right-subtree case. kid ← right
        mov r7, r8              @ And also let’s put right.k into r7
        cmp r3, r10             @ Uh but can we mutate right? is it mutable?
        bhi 5b                  @ If it’s part of a previous snapshot,
        mov r0, r3              @ steal(right)
        bl steal
        str r0, [r6, #4]        @ store return value into parent.right and
        mov r9, r0              @ into kid
        b 5b                    @ before continuing as in the left-hand case.

@@ All in all that’s 35 instructions, but there are
@@ conditionals and a loop.
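@@ Here is the same routine as an untested C sketch, for whatever
@@ that is worth.  c_siftdown is a made-up name for this note,
@@ c_steal is the sketch from further up, and the structs are the
@@ beta-spelled ones from that sketch.  The address comparison
@@ against the snap is the same mutability test the assembly does
@@ with cmp/bhi against r10; it only makes sense because everything
@@ lives in one arena that grows upward.
@@
@@     static void c_siftdown(struct node *parent, struct snap *s)
@@     {
@@         for (;;) {
@@             struct node *kid;
@@             if (parent->k < parent->left->k && parent->k < parent->right->k)
@@                 return;                     /* heap property already holds */
@@             kid = parent->left->k < parent->right->k ? parent->left
@@                                                      : parent->right;
@@             if ((char *)kid < (char *)s) {  /* kid lives in an older snapshot */
@@                 struct node *copy = c_steal(kid, s);
@@                 if (kid == parent->left) parent->left = copy;
@@                 else                     parent->right = copy;
@@                 kid = copy;                 /* now it’s ours to scribble on */
@@             }
@@             {                               /* swap (k, v) with the kid */
@@                 int k = parent->k; void *v = parent->v;
@@                 parent->k = kid->k; parent->v = kid->v;
@@                 kid->k = k; kid->v = v;
@@             }
@@             parent = kid;                   /* descend and keep going */
@@         }
@@     }
@@
@@ Note that the stolen copy’s own child pointers still aim into
@@ older snapshots; they only get copied later, if and when a sift
@@ actually reaches them, which is what lets save() get away with
@@ copying just the root.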
@@ The entry/exit sequence there is 3 instructions, 16 memory
@@ accesses, and a jump.  The main loop starting and ending at
@@ label 1 has a lot of possible execution paths.
@@
@@ Consider the case where we traverse down to the left child.
@@ We run 5 instructions and jump to label 4.  In what we hope
@@ is the usual case, we can mutate the child in place, so
@@ after 5 more instructions, we jump to label 5, then run 7
@@ more instructions before going back to the start.  So this
@@ run of the main loop has run 17 instructions, including 11
@@ memory accesses and 3 jumps.  Nominally this is about 37
@@ cycles.
@@
@@ If we traverse down to the right child instead, there are
@@ two ways to get to label 2, which happen to involve the
@@ same number of instructions from the top of the loop (7
@@ instructions) but either 1 or 2 jumps.  Once we’re at label
@@ 2, the happy in-place path is 4 more instructions
@@ (including another jump), then the same last 7 instructions
@@ we’d use for the left child (and final jump).  So 18
@@ instructions, the same 11 memory accesses, and either 3 or
@@ 4 jumps.  Estimate, 38 or 41 cycles.
@@
@@ On either path, we may need to steal a child that’s
@@ currently immutable.  This costs an extra 4 instructions, 1
@@ memory access, the 24 cycles of steal, and in the
@@ right-hand case, an extra jump.  So 29 or 32 more cycles.
@@
@@ So we’re at something like 38 cycles per level on average
@@ when we’re not copying nodes, or 68 cycles on average when
@@ we are.  But how deep do we typically go?  This depends a
@@ lot on the application, but for the application I have in
@@ mind, I think it’s typically like an average of 1 level.

@@ Okay, how do you build the heap?  At the moment that’s
@@ sort of a less interesting question from my point of view,
@@ because the application I’m currently thinking about is
@@ raymarching, in which the heap gets built once per frame (or
@@ even less often) and then saved and restored a couple
@@ million times per frame and kicked tens of millions of
@@ times per frame, so it doesn’t really matter how efficient
@@ the initial heap-building is, as long as it isn’t something
@@ like five orders of magnitude worse.
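@@ For what it’s worth, here is one artless way to do that initial
@@ build, again as an untested C sketch with made-up names (c_build,
@@ plus the c_siftdown sketch from earlier): lay out the sentinel,
@@ an initial snap, and the n nodes in the arena, wire the nodes up
@@ as a complete binary tree whose missing children all point at
@@ the sentinel, and heapify bottom-up.  All the nodes land above
@@ the initial snap, so nothing ever gets stolen during the build
@@ (the sentinel sits below it, but it never gets written).
@@
@@     #include <limits.h>
@@
@@     static struct snap *c_build(char *arena, int n,   /* assumes n >= 1 */
@@                                 const int *k, void *const *v)
@@     {
@@         struct node *sentinel = (struct node *)arena;
@@         struct snap *s = (struct snap *)(sentinel + 1);
@@         struct node *nodes = (struct node *)(s + 1);
@@         int i;
@@
@@         sentinel->k = INT_MAX;            /* beats any real priority */
@@         sentinel->left = sentinel->right = sentinel;
@@
@@         for (i = 0; i < n; i++) {         /* complete tree, array-style indexing */
@@             nodes[i].k = k[i];
@@             nodes[i].v = v[i];
@@             nodes[i].left  = 2*i + 1 < n ? &nodes[2*i + 1] : sentinel;
@@             nodes[i].right = 2*i + 2 < n ? &nodes[2*i + 2] : sentinel;
@@         }
@@         s->root = &nodes[0];
@@         s->parent = 0;                    /* nothing to restore() to yet */
@@         s->beta = (char *)&nodes[n];      /* arena allocation resumes here */
@@
@@         for (i = n / 2 - 1; i >= 0; i--)  /* bottom-up heapify, O(n) */
@@             c_siftdown(&nodes[i], s);
@@         return s;                         /* caller stashes this behind a snap** */
@@     }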