## Implementation of 8 parallel 15-bit LFSRs; see lfsr.c.
## Implements x¹⁵ + x¹⁴ + 1 with b[i] ^= b[(i+1) % 15].
## i is the first byte of the 16-byte buffer; b is the other
## 15.

## Because this doesn't access any callee-saved registers it
## doesn't need any stack.  I'm trying to use only the RVC
## registers to reduce code size, and aping GCC's apparent
## attempts to do instruction scheduling by avoiding data
## dependencies between adjacent instructions.
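## Since lfsr.c isn't included here, here's a rough C sketch of the
## update step as I read it from the layout above (the name lfsr_step
## is made up; each of the 8 bit positions of b is an independent
## 15-bit LFSR):
##
##     #include <stdint.h>
##
##     uint8_t lfsr_step(uint8_t *p)         /* p: the 16-byte buffer */
##     {
##         uint8_t i = p[0];                 /* old counter, 0..14 */
##         p[0] = (i + 1) % 15;              /* advance the counter */
##         p[1 + i] ^= p[1 + (i + 1) % 15];  /* b[i] ^= b[(i+1) % 15] */
##         return p[1 + i];                  /* new b[i], returned in a0 below */
##     }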
        .globl lfsr0
lfsr0:  lb a1, (a0)         # a0 points to beginning of 16-byte buffer
        addi a1, a1, 1      # increment counter i in a1, stored in first byte
        addi a4, zero, 15   # can't compare to an immediate constant, so load 15 into register a4
        mv a2, a1           # copy i+1 to a2 for (i+1) % 15 calculation
        bne a1, a4, 1f      # skip if i+1 != 15
        addi a2, a2, -15    # but if it was, subtract 15, result in a2
1:      add a3, a0, a1      # calculate &b[i] in a3
        addi a5, a2, 1      # calculate (i+1) % 15 + 1 in a5
        lb a1, (a3)         # load b[i] into a1
        add a5, a0, a5      # calculate &b[(i+1) % 15] in a5
        sb a2, (a0)         # store incremented value (i+1) % 15 in first byte
        lb a0, (a5)         # load b[(i+1) % 15] into a0
        xor a0, a0, a1      # calculate final XOR result in a0
        sb a0, (a3)         # store a0 at b[i] and implicitly return it
        jalr zero, ra       # return from subroutine

## Here's a disassembly of the above before renaming to lfsr0:

### 0000000000000000 <lfsr>:
###    0: 00050583  lb a1,0(a0)      there is no compressed form of lb, just C.LW, C.LD on RV64C, and C.LQ on RV128C
###    4: 0585      addi a1,a1,1     compression successful!
###    6: 00f00713  li a4,15         compression unsuccessful; I'd think C.LI ought to be able to do this? It has 6 bits!
###    a: 862e      mv a2,a1         compression successful!
###    c: 00e59363  bne a1,a4,12 <.L1^B1>  compression unsuccessful; only BEQZ and BNEZ have compressed forms
###   10: 1645      addi a2,a2,-15   compression successful!
###
### 0000000000000012 <.L1^B1>:
###   12: 00b506b3  add a3,a0,a1     can't compress because three operands (and we need to preserve both a0 and a1)
###   16: 00160793  addi a5,a2,1     similarly, need to preserve both a2 and a5
###   1a: 00068583  lb a1,0(a3)
###   1e: 97aa      add a5,a5,a0     compression successful! note swapped arguments, clever assembler
###   20: 00c50023  sb a2,0(a0)      there is no compressed form of sb either
###   24: 00078503  lb a0,0(a5)
###   28: 8d2d      xor a0,a0,a1     compression successful!
###   2a: 00a68023  sb a0,0(a3)
###   2e: 00008067  ret              why doesn't this use C.JR?

### In total this is 15 instructions and 50 bytes.  GCC's version
### is 15 instructions, 46 bytes, and branchless, because it uses
### the division instructions:

###  858: 00054783  lbu a5,0(a0)     *unsigned* byte load of i
###  85c: 463d      li a2,15         successfully compressed, unlike my attempt!
###  85e: 86aa      mv a3,a0         copy the pointer into a3 because we will needlessly overwrite it later
###  860: 2785      addiw a5,a5,1    increment it, specifically 32-bittishly
###  862: 02c7e63b  remw a2,a5,a2    calculate remainder in a2, dividing by the 15 in a2
###  866: 97aa      add a5,a5,a0     same approach to calculate address of first byte
###  868: 0007c503  lbu a0,0(a5)     load first byte, needlessly overwriting a0
###  86c: 0016059b  addiw a1,a2,1    same approach to calculate address of second byte
###  870: 95b6      add a1,a1,a3     calculate address of second byte using saved copy of a0 in a3
###  872: 0005c703  lbu a4,0(a1)     load second byte into a4
###  876: 8f29      xor a4,a4,a0     calculate xor
###  878: 0ff77513  andi a0,a4,255   needlessly (?) convert result to a byte; sb doesn't care about high bits, but the return value might
###  87c: 00a78023  sb a0,0(a5)      store xor result at b[i]
###  880: 00c68023  sb a2,0(a3)      store incremented i
###  884: 8082      ret              successfully compressed, unlike my attempt!

### If I can get the same compression in my li and ret instructions I
### could cut mine to 46 bytes.  And I think that if I subtract 15
### unconditionally I can use beqz to do the modulo instead of bne,
### and also maybe I can avoid the li instruction entirely.

### I'm pretty sure the andi 255 is unnecessary.  The lbu instructions
### zero-extend the bytes they load, and xor certainly doesn't
### introduce any high-order 1 bits, so there's nothing for it to mask
### away.

### This 13-instruction, 38-byte version also seems to work, though at
### first I had the sense of the branch test backwards.  I've marked
### with * the instructions with a data dependency on an immediately
### preceding instruction, which was a thing I was trying to avoid
### when I first wrote this code, aping GCC.

        .globl lfsr
lfsr:   lbu a1, (a0)        # load unsigned counter byte i at a0 into a1
        addi a1, a1, 1      #* increment it
        addi a2, a1, -15    #* compute (i+1) - 15
        beqz a2, 1f         #* but unless the result is zero (compressible jump!)
        mv a2, a1           # overwrite it with (i+1) (this mv is an SSA φ function)
1:      add a1, a1, a0      # so a2 is (i+1)%15.  Then compute &b[i] in a1 from a0 + (i+1)
        sb a2, (a0)         # store %15 result at a0
        lbu a3, (a1)        # load byte b[i] from a1 into a3
        add a2, a2, a0      # add (i+1) % 15 to a0 to get other byte address in a2 (offset by -1)
        lbu a0, 1(a2)       #* overwrite a0 with byte from address a2+1
        xor a0, a0, a3      #* xor bytes fetched from a1 (b[i]) and a2+1 (b[(i+1)%15])
        sb a0, (a1)         #* store xor result at address a1
        ret                 # oddly enough, the assembler can compress this

### This gives this disassembly:

### 0000000000000032 <lfsr>:
###   32: 00054583  lbu a1,0(a0)     lbu isn't any more compressible than lb is
###   36: 0585      addi a1,a1,1
###   38: ff158613  addi a2,a1,-15   uncompressible because the result register is different
###   3c: c211      beqz a2,40 <.L1^B2>  compressible this time!
###   3e: 862e      mv a2,a1
###
### 0000000000000040 <.L1^B2>:
###   40: 95aa      add a1,a1,a0     compressible because I stopped using a3 for the result (a1 was dead)
###   42: 00c50023  sb a2,0(a0)
###   46: 0005c683  lbu a3,0(a1)     (because we clobber a1 here)
###   4a: 962a      add a2,a2,a0
###   4c: 00164503  lbu a0,1(a2)     offset here saves us a register and an addi
###   50: 8d35      xor a0,a0,a3
###   52: 00a58023  sb a0,0(a1)
###   56: 8082      ret              compressible this time!

### By not using byte accesses I think we can do smaller.  This
### untested version is still 13 instructions but only 28 bytes.

        .globl lfsrw
lfsrw:  lw a1, (a0)
        addi a1, a1, 4
        addi a2, a1, -60    # this is the only remaining uncompressible instruction
        beqz a2, 1f
        mv a2, a1
1:      add a1, a1, a0
        sw a2, (a0)
        lw a3, (a1)
        add a2, a2, a0
        lw a0, 4(a2)
        xor a0, a0, a3
        sw a0, (a1)
        ret

### a1 and a2 still need to be computed separately, because sometimes
### they are at opposite ends of the buffer, but I wonder if there's a
### way to avoid that extra addi instruction at the beginning, since
### we get a constant offset for free in the load and store
### instructions, even in RV64C.  Like, +4-60 is -56, right?  And if
### that doesn't give us 0, we could conditionally do an add rather
### than a move?  Maybe a way to think of this is that 4(a0) is the
### base address of the buffer of bytes?
###
### I don't think that actually works, because we have to compute a1+4
### (or a1+1) anyway in order to update the counter in memory.
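### For reference, a similarly untested C model of the lfsrw above
### (the name lfsrw_step and the memcpy-based accesses are my own; I'm
### assuming a 64-byte buffer whose first word holds the counter,
### kept pre-scaled as a byte offset that steps by 4 and wraps from 60
### back to 0, followed by the 15 state words, so presumably each of
### the 32 bit positions is an independent LFSR):
###
###     #include <stdint.h>
###     #include <string.h>
###
###     uint32_t lfsrw_step(unsigned char *p)
###     {
###         uint32_t i, m, x, y;
###         memcpy(&i, p, 4);           /* old counter, a byte offset */
###         i += 4;                     /* addi a1, a1, 4 */
###         m = (i == 60) ? 0 : i;      /* the addi/beqz/mv dance */
###         memcpy(p, &m, 4);           /* sw a2, (a0): store new counter */
###         memcpy(&x, p + i, 4);       /* lw a3, (a1): the word w[i/4 - 1] */
###         memcpy(&y, p + m + 4, 4);   /* lw a0, 4(a2): the next word, wrapped */
###         x ^= y;
###         memcpy(p + i, &x, 4);       /* sw a0, (a1): store and return it */
###         return x;
###     }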