@ -*- asm -*-
        @ Exploration of a calling convention supporting a variant of
	@ asymmetric coroutines.  Motivating examples include resource
	@ allocation and cleanup, user-defined internal iterators and
	@ other control constructs, and IMGUI libraries.  The
	@ mechanism permits a caller to pass zero or more code blocks
	@ to a callee which the callee then chooses to invoke zero or
	@ more times in the caller's context.

        @ The example here is deliberately impractical: it defines an
	@ internal iterator over the bytes of a nul-terminated string
	@ and applies it to hash the string.  This mechanism has low
	@ overhead, but not so low that this is competitive with the
	@ performance of the conventional approach, though I guess you
	@ could conceivably use it if you had several different
	@ possible sources of iteration over bytes and wanted to
	@ polymorphically hash any of them.  My estimate is that, on
	@ my microcontroller target, it adds about 3 clock cycles per
	@ iterated item, plus 19 clock cycles of setup.

        @ The example, in pseudocode:
        
        @ hashsz s = {
        @   h := 53u
        @   itersz s, b => {
        @     h = (h >> 27 | h << 5) ^ b
        @   }
        @   return h
        @ }

        @ r0, r1, r2, r3: temporary or argument registers as is
        @ conventional; in particular r0 carries the first argument
        @ of hashsz, itersz, and the code block, and also the return
        @ value of hashsz.  Neither blocks nor full subroutines need
        @ to preserve them.  Sometimes this is misleadingly called
        @ "caller-saved".

        @ r4, r5, r9: caller-context registers which the callee must
        @ restore to their original values both upon returning, as
        @ usual, and upon invoking caller-provided blocks.  You
        @ probably want at least two such registers in the Thumb
        @ low-register space in your calling convention.

        @ r6, r7, r8: call-preserved registers which the callee must
	@ restore to their original values upon returning, as usual,
	@ and which may have different callee-owned values inside of
	@ blocks, values which the blocks must restore before
	@ returning.  You probably also want at least two of these.

        .section .text,"ax",%progbits
        .syntax unified
        .thumb
        .thumb_func
        .fpu fpv4-sp-d16
        .cpu cortex-m4
itersz: push {r6, r7, r8, lr}   @ r6 and r7 are preserved by blocks
        mov  r7, r0             @ so put the string pointer in r7
        mov  r6, r1             @ and the block code pointer in r6
1:      ldrb r0, [r7], #1       @ loop over string bytes, post-indexing
        cbz  r0, 2f             @ bail out on NUL, but otherwise
        blx  r6                 @ invoke block argument with byte, then
        b    1b                 @ repeat loop
2:      pop  {r6, r7, r8, pc}   @ restore registers and return

        .thumb_func
        .globl hashsz
hashsz: push {r4, lr}           @ r4 is callee-preserved and passed to blocks
        movs r4, #53            @ initial value of hash
        adr  r1, 1f+1  @ load Thumbified pointer for block using gas local label
        bl   itersz             @ invoke iterator subroutine
        mov  r0, r4             @ load completed hash into return value
        pop  {r4, pc}           @ restore and return
1:      eor  r4, r0, r4, ror #27  @ block argument rotates & xors the byte in r0
        bx   lr                 @ then resumes the iterator

        @ Such block-based functions have an interesting degree of
	@ composability, even if less than full-fledged closures.
	@ hashsz is obviously silly, but if we define it to take an
	@ iterator function and a context parameter for that iterator:

        @ hashic iter, cx = {
        @   h := 53u
        @   iter cx, b => {
        @     h = (h >> 27 | h << 5) ^ b
        @   }
        @   return h
        @ }

	@ the only extra cost is two extra mov instructions:
        .thumb_func
hashic: push {r4, lr}
        movs r4, #53
	mov  r3, r0             @ save iterator function
        mov  r0, r1             @ put iterator context in r0
        adr  r1, 1f+1
        blx  r3
        mov  r0, r4
        pop  {r4, pc}
1:      eor  r4, r0, r4, ror #27
        bx   lr

        @ This can be used with itersz as before:
        @ hashiz s = hashic itersz, s
        .thumb_func
        .thumb
	.globl hashiz
hashiz: mov  r1, r0             @ move string pointer to second param
        adr  r0, itersz         @ no +1 because it's marked with .thumb_func
        b    hashic

        @ But it can also be used with other byte iterator functions.
        @ For example, we can define an ASCII-downcasing byte iterator
	@ function to get a case-insensitive hash.  This iterator
	@ invokes itersz to iterate over the string bytes, but
	@ downcases each one before passing it along to its own
	@ caller's block.  Because there aren't any registers left,
        @ it needs to use stack memory for this.
        @ iterlc s, &f = {   # &f declares a block argument f
        @   itersz s, b => {
        @     if 'A' <= b <= 'Z' { b += ('a' - 'A') }
        @     &f b           # &f invokes the block argument f
        @   }
        @ }
        .thumb_func
iterlc: push {r4, r5, r9, lr}
        mov  r4, sp             @ now r4 points to caller's r4, r5
        mov  r5, r1          @ put caller's block in r5 so our block can call it
        adr  r1, 1f+1           @ load thumbified pointer to our block
        bl   itersz
        pop  {r4, r5, r9, pc}
1:      cmp  r0, #'A            @ iterlc block argument has input byte in r0
        blt  1f                 @ no downcasing needed unless >= 'A'
        cmp  r0, #'Z            @ and <= 'Z'
        bgt  1f
        add  r0, #('a - 'A)
1:      mov  r1, r5             @ caller's block is in r5
        push {r4, r5, r9, lr}
        ldm  r4, {r4, r5}       @ pointer to caller's regs is in r4
        blx  r1
        pop  {r3}               @ must save any changes to caller's regs
        stm  r3, {r4, r5}
        mov  r4, r3             @ and not lose our own either
        pop  {r5, r9, pc}

        @ That's about 5 cycles of extra legitimate work going on in
	@ the downcasing, plus about 20 cycles of per-byte overhead in
	@ pushing and popping and calling, so this is not a very
	@ inspiring example of great efficiency for this method.
	@ Still, 20 cycles is not too bad as function-call overhead
	@ goes; it's comparable to a normal function call and return.

	@ Here's how we'd use that to compute a case-insensitive hash:
	.globl hashlc
	.thumb_func
hashlc: mov  r1, r0
        adr  r0, iterlc
        b    hashic

        @ But this is a bit silly.  What if we want to hash bytes from
	@ an iterator that takes two arguments instead of one?  A
	@ better solution would be to specify the iterator by giving
	@ the hashing function a block; then it can pass a block of
	@ its own to that block, like this:

        @ hashb &iter = {
        @   h := 53u
        @   &iter b => {
        @     h = (h >> 27 | h << 5) ^ b
        @   }
        @   return h
        @ }

        @ No, that is impossible.  A block can return a block, but it
	@ cannot take one as a parameter.