### Dumbsort and gnomesort, compared to see which one is simpler.
### Insertion sort is also included as a sort of control group.  See
### also <https://news.ycombinator.com/item?id=36054816> where I said
### dumbsort was probably 10 instructions.

        .intel_syntax noprefix

        ## Dumbsort: roughly the simplest sorting algorithm.  O(N³).
        ## A 64-bit integer array base pointer is in RDI.
        ## The number of integers is in RSI.
        ## Clobbers RAX and RDX.
        ## This corresponds to the standard AMD64 ABI.
        ## Sorts 2048 numbers in 580ms, so its time is about
        ## 67 picoseconds times N³ on this AMD Ryzen 5 3500U.
        ## You could almost certainly code this in less *bytes*, but
        ## I’m shooting for least *instructions* here.
        ## I’m not sure who came up with this sorting algorithm;
        ## I may have gotten it from someone else.
        .globl dumbsort
6:      mov rdx, [rdi + rax*8]      # Load array item.
        cmp rdx, [rdi + rax*8 - 8]  # Compare to previous.
        jge 4f                      # Already in order?  Thank u, next

        xchg rdx, [rdi + rax*8 - 8] # Swap
        mov [rdi + rax*8], rdx
        ## FALL THROUGH into the top of dumbsort, restarting from beginning.
dumbsort:
        xor rax, rax                # Initialize item index.

4:      inc rax                     # Increment item index.
        cmp rax, rsi                # Have we reached the last item?
        jb 6b                       # If not, do an iteration of the loop.

        ret                         # If so, return.

        ## Insertion sort: the simplest practical sorting algorithm.
        ## O(N²).  Same calling convention, but also clobbers RCX.  15
        ## instructions instead of 10.  Takes about 2.6 nanoseconds
        ## times N².
        .globl inssort
inssort:
        xor rax, rax                # Outer loop for i from 1 to N-1.

4:      inc rax
        cmp rax, rsi
        jnb 5f                      # Terminate outer loop and return.

        mov rcx, rax                # Inner loop for j from i-1 to 0.

2:      dec rcx
        mov rdx, [rdi + rcx*8 + 8]  # Load a[j+1]
        cmp rdx, [rdi + rcx*8]      # and compare to a[j].
        jge 4b                      # If already in order, end inner loop.

        xchg rdx, [rdi + rcx*8]     # Otherwise, swap.
        mov [rdi + rcx*8 + 8], rdx
        test rcx, rcx               # Any items left?
        jz 4b                       # If not, end inner loop.

        jmp 2b                      # If so, repeat it.

5:      ret


        ## Gnome sort (due to Hamid Sarbazi-Azad but popularized by
        ## Dick Grune as “the simplest sort algorithm”) is a slightly
        ## slower version of insertion sort.  (Dumbsort is *much*
        ## slower.)  On amd64 at least, it’s the same amount of code
        ## as insertion sort, 15 instructions instead of 10.
        ## I think this is sort of unavoidable on most architectures
        ## because it contains three conditionals, like insertion
        ## sort, rather than two like dumbsort; and six basic blocks
        ## to the five in dumbsort.  Insertion sort might be longer or
        ## shorter on some architectures, but dumbsort is probably
        ## always shorter.
        .globl gnomesort
gnomesort:
        xor rax, rax                # Initialize item index on function entry.
2:      cmp rax, rsi                # Have we reached the rightmost item?
        jae 1f                      # If so, exit.

        test rax, rax
        jz 3f                       # If we’ve moved to index 0, go right.

        mov rdx, [rdi + rax*8]      # Load current pair of items
        cmp rdx, [rdi + rax*8 - 8]  # and see if they’re already in order;
        jl 4f                       # if not, swap.

3:      inc rax                     # Go right, and
        jmp 2b                      # resume from top of loop.

4:      xchg rdx, [rdi + rax*8 - 8] # Swap,
        mov [rdi + rax*8], rdx
        dec rax                     # go left, and
        jmp 2b                      # resume from top of loop.

1:      ret