/* Program to use the GCC vector extensions, rather than Intel
 * intrinsics, to generate SSE alpha-blending code.  This implements
 * only the Porter–Duff "over" operation, in premultiplied-alpha form.
 *
 * This does seem to have been successful, in that the generated code
 * has a bunch of instructions with absurdly long mnemonics in it, and
 * no loops, but it's not as much faster as I'd hoped.  With -O
 * -msse4.2, the generated function is 28 instructions, one of which
 * is RET and 4 more of which are PC-relative loads (and thus
 * presumably constants?), so that's about 6 or 7 instructions per
 * pixel.  Crude benchmarking with the commented-out loop and the
 * `static inline` declaration suggests that it takes 5.9 ns to
 * alpha-blend four pixels, or about 1.46 ns per pixel.  While this is
 * a big improvement over the 30 instructions and 7.2 ns per pixel of
 * the naïve approach, I was sort of hoping for a bit better.  But I
 * don't have AVX256, and although SSE2 has `_mm_mulhi_epi16`
 * (pmulhw128) and `_mm_mulhi_epu16` (pmulhuw128), there's no
 * `_mm_mulhi_epu8`, which is what I'd really need.
 *
 * Compiled for ARM with arm-linux-gnueabihf-gcc-5 -O -mthumb
 * -mfpu=neon, it's 27 instructions (26 NEON instructions and a
 * `bx lr`).  This owns.
 *
 * Things I have tried that don't help:
 * - Computing the complement of the alpha (256 - α) in 8-bit land
 *   rather than in 16-bit land.  Simply negating the 8-bit vector of
 *   foreground alphas doesn't work, because then 0, which should map
 *   to 256, maps to 0.  But we can take the bitwise NOT, and then
 *   either add 1 to it in 16-bit land, or add an additional copy of
 *   bghi and bglo after the multiplication.  Neither variant is
 *   faster.  (A sketch of the first variant appears at the end of
 *   this comment.)
 * - Getting the alpha channel out of the foreground pixels with two
 *   shuffles (one using himask/lomask and the other using an
 *   alphamask) instead of one.
 *
 * I think actually maybe GCC sees through my ruses and generates the
 * same code in all of these cases.
 *
 * A thing that I think would actually work is to store the pixels BIL
 * fashion ("band-interleaved by line", as opposed to the usual BIP),
 * with all the reds together, all the greens together, etc.  Then
 * shuffling is only needed to convert between the 8-bit and 16-bit
 * formats, the 8-bit operations work on 16 pixels at a time rather
 * than 4, and the 16-bit operations work on 8 pixels at a time rather
 * than 2.  (A hedged sketch of this appears after main() at the
 * bottom of the file.)
 *
 * The output of the program is:
 * { 128, 194, 67, 255 },
 * { 0, 255, 127, 255 },
 * { 168, 227, 150, 127 },
 * { 64, 75, 87, 98 },
 * and I have verified that this is correct.
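 *
 * Here is the promised sketch of the add-1-in-16-bit-land variant of
 * the alpha complement, using the names from the code below.  This is
 * a reconstruction for illustration rather than the exact code that
 * was benchmarked:
 *
 *   vec16 nfga = ~shuffle(fgp, alphahimask);        // 255 - α in each byte
 *   vec8  rhi  = bghi * (((vec8)nfga & 0xff) + 1);  // bg * (256 - α)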
 */
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

typedef uint8_t u8;

typedef struct { u8 rr, gg, bb, aa; } pixel;

/* 16 unsigned bytes, and 8 signed 16-bit lanes, in one 128-bit register. */
typedef uint8_t vec16 __attribute__((vector_size(16)));
typedef int16_t vec8 __attribute__((vector_size(16)));

#define shuffle __builtin_shuffle

static const vec16
  /* lomask/himask widen the low/high 8 bytes into 16-bit lanes (each byte
     is duplicated; masking with 0xff afterwards recovers its value). */
  lomask      = { 0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7},
  himask      = { 8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15},
  /* gathermask picks the high byte of each 16-bit lane of rlo||rhi,
     i.e. it divides the products by 256 and narrows back to bytes. */
  gathermask  = { 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31},
  /* alpha{hi,lo}mask broadcast each pixel's alpha byte across its half. */
  alphahimask = {11, 11, 11, 11, 11, 11, 11, 11, 15, 15, 15, 15, 15, 15, 15, 15},
  alphalomask = { 3,  3,  3,  3,  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7};

/* Blend four premultiplied foreground pixels over four background pixels:
   bg = fg + bg * (256 - fg.alpha) / 256, per channel. */
/* static inline */ void alpha_blend_four(pixel *bg, pixel *fg) {
  vec16 fgp = *(vec16*)fg
      , bgp = *(vec16*)bg
      ;
  vec8 bghi  = (vec8)shuffle(bgp, himask)      & 0xff
     , fgahi = (vec8)shuffle(fgp, alphahimask) & 0xff
     , rhi   = bghi * (256 - fgahi)
     , bglo  = (vec8)shuffle(bgp, lomask)      & 0xff
     , fgalo = (vec8)shuffle(fgp, alphalomask) & 0xff
     , rlo   = bglo * (256 - fgalo)
     ;
  *(vec16*)bg = shuffle((vec16)rlo, (vec16)rhi, gathermask) + fgp;
}

int main(int argc, char **argv) {
  alignas(16) pixel bg[] = {
    { 127, 192, 64, 255 },
    { 102, 44, 55, 127 },
    { 82, 200, 47, 0 },
    { 55, 66, 77, 88 },
  }, fg[] = {
    { 1, 2, 3, 0 },
    { 0, 255, 127, 255 },
    { 127, 127, 127, 127 },
    { 13, 14, 15, 16 },
  };

  /* for (int i = 10; i < 1000*1000*1000; i++) { */
  alpha_blend_four(bg, fg);
  /* } */

  for (int i = 0; i < 4; i++) {
    printf("{ %d, %d, %d, %d },\n", bg[i].rr, bg[i].gg, bg[i].bb, bg[i].aa);
  }
  return 0;
}
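
/* Below is a hedged sketch of the BIL ("band-interleaved by line") idea
 * described in the header comment, reusing lomask, himask, and gathermask
 * from above.  The function name and the planar-argument layout are
 * hypothetical (they are not part of the original experiment), and the
 * function is not called from main(); it is here only to show the shape of
 * the idea.  Each call blends one 16-pixel channel plane: bgc and fgc hold
 * the background and (premultiplied) foreground bytes for one channel, and
 * fga holds the foreground alpha bytes.  A full blend is four such calls,
 * one per plane, with the alpha plane blended against itself the same way. */
void alpha_blend_bil_channel(vec16 *bgc, const vec16 *fgc, const vec16 *fga) {
  vec8 bghi = (vec8)shuffle(*bgc, himask) & 0xff   /* widen pixels 8..15 */
     , ahi  = (vec8)shuffle(*fga, himask) & 0xff
     , rhi  = bghi * (256 - ahi)
     , bglo = (vec8)shuffle(*bgc, lomask) & 0xff   /* widen pixels 0..7 */
     , alo  = (vec8)shuffle(*fga, lomask) & 0xff
     , rlo  = bglo * (256 - alo)
     ;
  /* Keep the high byte of each 16-bit product (divide by 256) and add the
     premultiplied foreground channel. */
  *bgc = shuffle((vec16)rlo, (vec16)rhi, gathermask) + *fgc;
}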