/* Program to use the GCC vector extensions, rather than Intel
 * intrinsics, to generate SSE alpha-blending code.  This implements
 * only the Porter–Duff "over" operation, in premultiplied-alpha form.
 *
 * This does seem to have been successful, in that the generated code
 * has a bunch of instructions with absurdly long mnemonics in it, and
 * no loops, but it's not as much faster as I'd hoped.  With -O
 * -msse4.2, the generated function is 28 instructions, one of which
 * is RET and 4 more of which are PC-relative loads (and thus
 * presumably constants?), so that's about 6 or 7 instructions per
 * pixel.  Crude benchmarking with the commented-out loop and the
 * `static inline` declaration suggests that it takes 5.9 ns to
 * alpha-blend four pixels, or about 1.46 ns per pixel.  While this is
 * a big improvement over the 30 instructions and 7.2 ns per pixel of
 * the naïve approach, I was sort of hoping for a bit better.  But I
 * don't have AVX256, and although SSE2 has `_mm_mulhi_epi16`
 * (pmulhw128) and `_mm_mulhi_epu16` (pmulhuw128), there's no
 * `_mm_mulhi_epu8`, which is what I'd really need.
 *
 * Compiled for ARM with arm-linux-gnueabihf-gcc-5 -O -mthumb
 * -mfpu=neon, it's 27 instructions (26 NEON instructions and a
 * `bx lr`).  This owns.
 *
 * Things I have tried that don't help:
 * - Computing the complement of the alpha (256 - α) in 8-bit land
 *   rather than in 16-bit land.  Simply negating the 8-bit vector of
 *   foreground alphas doesn't work, because then 0, which should map
 *   to 256, maps to 0.  But we can take the bitwise NOT, and then
 *   either add 1 to it in 16-bit land, or add an additional copy of
 *   bghi and bglo after the multiplication.  Neither variant is
 *   faster.  (A sketch of the first variant appears at the end of
 *   this comment.)
 * - Getting the alpha channel out of the foreground pixels with two
 *   shuffles (one using himask/lomask and the other using an
 *   alphamask) instead of one.
 *
 * I think actually maybe GCC sees through my ruses and generates the
 * same code in all of these cases.
 *
 * A thing that I think would actually work is to store the pixels BIL
 * fashion ("band-interleaved by line", as opposed to the usual BIP),
 * with all the reds together, all the greens together, etc.  Then
 * shuffling is only needed to convert between the 8-bit and 16-bit
 * formats, the 8-bit operations work on 16 pixels at a time rather
 * than 4, and the 16-bit operations work on 8 pixels at a time rather
 * than 2.  (A hedged sketch of this appears after main() at the
 * bottom of the file.)
 *
 * The output of the program is:
 * { 128, 194, 67, 255 },
 * { 0, 255, 127, 255 },
 * { 168, 227, 150, 127 },
 * { 64, 75, 87, 98 },
 * and I have verified that this is correct.
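 *
 * Here is the promised sketch of the add-1-in-16-bit-land variant of
 * the alpha complement, using the names from the code below.  This is
 * a reconstruction for illustration rather than the exact code that
 * was benchmarked:
 *
 *   vec16 nfga = ~shuffle(fgp, alphahimask);        // 255 - α in each byte
 *   vec8  rhi  = bghi * (((vec8)nfga & 0xff) + 1);  // bg * (256 - α)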
 */
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

typedef uint8_t u8;

typedef struct { u8 rr, gg, bb, aa; } pixel;

/* 16 unsigned bytes, and 8 signed 16-bit lanes, in one 128-bit register. */
typedef uint8_t vec16 __attribute__((vector_size(16)));
typedef int16_t vec8 __attribute__((vector_size(16)));

#define shuffle __builtin_shuffle

static const vec16
  /* lomask/himask widen the low/high 8 bytes into 16-bit lanes (each byte
     is duplicated; masking with 0xff afterwards recovers its value). */
  lomask      = { 0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7},
  himask      = { 8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15},
  /* gathermask picks the high byte of each 16-bit lane of rlo||rhi,
     i.e. it divides the products by 256 and narrows back to bytes. */
  gathermask  = { 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31},
  /* alpha{hi,lo}mask broadcast each pixel's alpha byte across its half. */
  alphahimask = {11, 11, 11, 11, 11, 11, 11, 11, 15, 15, 15, 15, 15, 15, 15, 15},
  alphalomask = { 3,  3,  3,  3,  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7};

/* Blend four premultiplied foreground pixels over four background pixels:
   bg = fg + bg * (256 - fg.alpha) / 256, per channel. */
/* static inline */ void alpha_blend_four(pixel *bg, pixel *fg) {
  vec16 fgp = *(vec16*)fg
      , bgp = *(vec16*)bg
      ;
  vec8 bghi  = (vec8)shuffle(bgp, himask)      & 0xff
     , fgahi = (vec8)shuffle(fgp, alphahimask) & 0xff
     , rhi   = bghi * (256 - fgahi)
     , bglo  = (vec8)shuffle(bgp, lomask)      & 0xff
     , fgalo = (vec8)shuffle(fgp, alphalomask) & 0xff
     , rlo   = bglo * (256 - fgalo)
     ;
  *(vec16*)bg = shuffle((vec16)rlo, (vec16)rhi, gathermask) + fgp;
}

int main(int argc, char **argv) {
  alignas(16) pixel bg[] = {
    { 127, 192, 64, 255 },
    { 102, 44, 55, 127 },
    { 82, 200, 47, 0 },
    { 55, 66, 77, 88 },
  }, fg[] = {
    { 1, 2, 3, 0 },
    { 0, 255, 127, 255 },
    { 127, 127, 127, 127 },
    { 13, 14, 15, 16 },
  };

  /* for (int i = 10; i < 1000*1000*1000; i++) { */
  alpha_blend_four(bg, fg);
  /* } */

  for (int i = 0; i < 4; i++) {
    printf("{ %d, %d, %d, %d },\n", bg[i].rr, bg[i].gg, bg[i].bb, bg[i].aa);
  }
  return 0;
}
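
/* Below is a hedged sketch of the BIL ("band-interleaved by line") idea
 * described in the header comment, reusing lomask, himask, and gathermask
 * from above.  The function name and the planar-argument layout are
 * hypothetical (they are not part of the original experiment), and the
 * function is not called from main(); it is here only to show the shape of
 * the idea.  Each call blends one 16-pixel channel plane: bgc and fgc hold
 * the background and (premultiplied) foreground bytes for one channel, and
 * fga holds the foreground alpha bytes.  A full blend is four such calls,
 * one per plane, with the alpha plane blended against itself the same way. */
void alpha_blend_bil_channel(vec16 *bgc, const vec16 *fgc, const vec16 *fga) {
  vec8 bghi = (vec8)shuffle(*bgc, himask) & 0xff   /* widen pixels 8..15 */
     , ahi  = (vec8)shuffle(*fga, himask) & 0xff
     , rhi  = bghi * (256 - ahi)
     , bglo = (vec8)shuffle(*bgc, lomask) & 0xff   /* widen pixels 0..7 */
     , alo  = (vec8)shuffle(*fga, lomask) & 0xff
     , rlo  = bglo * (256 - alo)
     ;
  /* Keep the high byte of each 16-bit product (divide by 256) and add the
     premultiplied foreground channel. */
  *bgc = shuffle((vec16)rlo, (vec16)rhi, gathermask) + *fgc;
}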