/* BGRAify: dumb microbenchmark of pixel format conversion.

Get a rough measure of the performance we can expect for converting
between pixel formats.  The background of this is that some
graphics APIs and protocols (the X11 X-Windows protocol, Xlib, VNC,
etc.) require producers of graphics to convert their pixel images
into the pixel format used by the actual graphics hardware, which
nowadays on laptops is invariably 32-bit BGRA, generally ignoring
the A.

This is sort of an optimization: if the interface required a
particular pixel format that happened to not match the hardware,
the software implementing the protocol (the X server or whatever)
would have to always do the pixel format conversion.  By matching
the pixel format at the interface to the hardware’s pixel format,
the conversion could be done “upstream” and possibly eliminated.

This is not guaranteed to actually be faster, because (especially with
BGRA) it results in a larger bulk of data being transferred across the
interface (copied to and from shared memory, or sent across a network
cable) and possibly more data shuffled around inside the display
driver code, if the framebuffer memory isn’t arranged in the same way
as the graphics data from applications.

So I wanted to know, quantitatively, about how much time it takes.
I wrote this program to do a couple of pixel format conversions.

Measurements
------------

I haven’t verified that the output data is actually correct!  But if I
didn’t fuck it up the results look like this, using Debian GCC
12.2.0-14 with -O on a Ryzen 5 3500U:

    $ time ./bgraify -bit 10mfile 1000 -o 10mfile.bit
    Debitmapizing 10485760 input bytes into 335544320 output bytes 1000 times

    real    0m12.700s
    user    0m12.164s
    sys     0m0.482s
    $ time ./bgraify -bit 10mfile 1000 -o 10mfile.bit
    Debitmapizing 10485760 input bytes into 335544320 output bytes 1000 times

    real    0m13.043s
    user    0m12.353s
    sys     0m0.585s
    $ time ./bgraify -bit 10mfile 1000 -o 10mfile.bit
    Debitmapizing 10485760 input bytes into 335544320 output bytes 1000 times

    real    0m12.819s
    user    0m12.216s
    sys     0m0.588s

That works out to about 25.8–26.4 gigabytes per second of output.
Memcpy on the same machine is about 10–11 gigabytes per second, so
that’s 2½× as fast as memcpy.  It’s using GCC vector extensions, which
compile to SSE instructions, to achieve that speed:

    12f1:  83 e6 0f           and    $0xf,%esi
    12f4:  48 c1 e6 04        shl    $0x4,%rsi
    12f8:  66 0f 6f 44 34 88  movdqa -0x78(%rsp,%rsi,1),%xmm0
    12fe:  41 0f 29 04 0a     movaps %xmm0,(%r10,%rcx,1)
    1303:  c0 e8 04           shr    $0x4,%al
    1306:  0f b6 c0           movzbl %al,%eax
    1309:  48 c1 e0 04        shl    $0x4,%rax
    130d:  66 0f 6f 4c 04 88  movdqa -0x78(%rsp,%rax,1),%xmm1
    1313:  41 0f 29 4c 0a 10  movaps %xmm1,0x10(%r10,%rcx,1)

The actual BGRAifying of RGB data is much slower:
    
    $ time ./bgraify -rgb 10mfile 1000 -o 10mfile.bit; time ./bgraify -rgb 10mfile 1000 -o 10mfile.bit; time ./bgraify -rgb 10mfile 1000 -o 10mfile.bit
    DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times

    real    0m3.360s
    user    0m3.227s
    sys     0m0.133s
    DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times

    real    0m3.280s
    user    0m3.221s
    sys     0m0.033s
    DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times

    real    0m3.277s
    user    0m3.210s
    sys     0m0.033s

That’s about 4.1–4.3 gigabytes per second.  This is just under half of
the speed of memcpy.  That’s doing nothing fancy at all, just using
bytewise memory access:

    11f8:       0f b6 50 02             movzbl 0x2(%rax),%edx
    11fc:       88 16                   mov    %dl,(%rsi)
    11fe:       0f b6 50 01             movzbl 0x1(%rax),%edx
    1202:       88 56 01                mov    %dl,0x1(%rsi)
    1205:       0f b6 10                movzbl (%rax),%edx
    1208:       88 56 02                mov    %dl,0x2(%rsi)
    120b:       48 83 c0 03             add    $0x3,%rax
    120f:       48 83 c6 04             add    $0x4,%rsi

So presumably you could approach the speed of memcpy using things like
PSHUFB.  [There’s a Stack Overflow question][0] with various
approaches to making it faster, including a promising-looking
implementation using Intel SSSE3 CPU intrinsics by caf.  But that
wouldn’t compile for ARM or for older AMD64 CPUs.

So, I put it behind a `-Duse_intel_intrinsics` compile flag and tried
it.  Compiled with `-O -mssse3 -Duse_intel_intrinsics` I get these
results:

    $ time ./bgraify -rgb 10mfile 1000 -o 10mfile.rgb
    DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times

    real    0m1.806s
    user    0m1.766s
    sys     0m0.041s
    $ time ./bgraify -rgb 10mfile 1000 -o 10mfile.rgb
    DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times

    real    0m1.851s
    user    0m1.814s
    sys     0m0.037s
    $ time ./bgraify -rgb 10mfile 1000 -o 10mfile.rgb
    DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times

    real    0m1.834s
    user    0m1.800s
    sys     0m0.033s

That works out to 7.5–7.7 gigabytes per second, about 80% faster than
the bytewise approach, and 75% of the speed of memcpy.
   
[0]: https://stackoverflow.com/questions/7194452/fast-vectorized-conversion-from-rgb-to-bgra

Calculations
------------

For large (out-of-cache) images like a whole framebuffer, I’d think
that the reduction in data volume would usually compensate for the 25%
slowdown implied in using the SSSE3 shuffling code.  Like, say you
have a megapixel display (one *million* pixels).  If you send four
BGRA megabytes to the X server, the X server probably has to memcpy
them at least once to get them into the framebuffer, costing 380μs.
If things work out so that it has to memcpy them twice, for example
because the X client is using a shared memory segment but can't build
the pixels directly in it, or because your network card driver isn't
zero-copy, it’s 760μs.  If you instead send three RGB megabytes to the
X server, it can unpack them into the framebuffer, which costs about
530μs.  But if a second memcpy is involved before unpacking RGB into
the framebuffer, that second memcpy only takes 290μs, for a total of
820μs, only about 8% worse.  And if a third memcpy comes into
play, the RGB approach definitely wins.

Discussion
----------

The 1-bit-deep bitmap unpacking at 2.5× the speed of memcpy is a
surprise to me.  That means that, if you’re doing something with
bitmap images, it is probably a big speedup to keep them in bitmap
form until as close to the framebuffer as you can possibly get it.

The fact that the RGB format is a tie with SSSE3 once the number of
memcpys is only 2 was also a bit of a surprise.  And I suspect that,
with AVX handling 32 or 64 pixels at a time instead of 16, or with
NEON doing the channel shuffling during load and store instructions,
it could be a win even with 1 memcpy.

This suggests that probably a 256-color “PseudoColor” paletted format
would also be faster than memcpy, although I haven’t tested it.

 */

#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#ifdef use_intel_intrinsics
#include <tmmintrin.h>
#endif

/* One 32-bit output pixel or palette entry. */
typedef uint32_t u32;

#ifdef use_intel_intrinsics
// caf’s SSSE3 Intel intrinsics implementation (CC BY-SA, from the
// above link).  Evidently w is the number of pixels, and gets reduced
// to a number of 16-pixel chunks, which are 48 bytes on the input?
// You’d need special handling for the up to 15 pixels in the tail.
// But that isn’t significant for our benchmarking purposes here.

/* Convert packed 24-bit RGB pixels to 32-bit BGRX with SSSE3 byte
   shuffles.

   w   - pixel count.  Only floor(w/16) whole groups of 16 pixels are
         converted; a remainder of 1-15 pixels is left untouched.
   in  - source bytes in R,G,B order; 48 bytes are read per group.
   out - destination; 64 bytes are written per group, each pixel as
         B,G,R followed by a byte forced to 0xff.

   in and out must be 16-byte aligned */
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
    const __m128i *in_vec = in;
    __m128i *out_vec = out;

    /* From here on w counts 16-pixel groups, not pixels. */
    w /= 16;

    while (w-- > 0) {
        /*             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
         * in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
         * in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
         * in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
         */
        __m128i in1, in2, in3;
        /* Note: this local shadows the `out` parameter; the destination
           pointer was already captured in out_vec above. */
        __m128i out;

        in1 = in_vec[0];

        /* Pixels a-d come entirely from in1.  _mm_set_epi8 lists bytes
           from index 15 down to index 0, so the rightmost "0, 1, 2"
           puts source bytes 2,1,0 (B,G,R) into output bytes 0,1,2.  A
           0xff selector makes the shuffle emit a zero byte, which the
           OR below then turns into 0xff for the X channel. */
        out = _mm_shuffle_epi8(in1,
            _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 5, 0xff, 0, 1, 2));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[0] = out;

        in2 = in_vec[1];

        /* Pixels e-h straddle in1 and in2: keep bytes 8-15 of in1 and
           bytes 0-7 of in2, merge them into one register, then shuffle. */
        in1 = _mm_and_si128(in1,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in2,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in1);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 5, 6, 7, 0xff, 2, 3, 4, 0xff, 15, 0, 1, 0xff, 12, 13, 14));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[1] = out;

        in3 = in_vec[2];
        in_vec += 3;    /* all three input vectors of this group are now loaded */

        /* Pixels i-l straddle in2 and in3: bytes 8-15 of in2 merged
           with bytes 0-7 of in3, then shuffled. */
        in2 = _mm_and_si128(in2,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in3,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in2);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 1, 2, 3, 0xff, 14, 15, 0, 0xff, 11, 12, 13, 0xff, 8, 9, 10));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[2] = out;

        /* Pixels m-p come entirely from bytes 4-15 of in3. */
        out = _mm_shuffle_epi8(in3,
            _mm_set_epi8(0xff, 13, 14, 15, 0xff, 10, 11, 12, 0xff, 7, 8, 9, 0xff, 4, 5, 6));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[3] = out;

        out_vec += 4;
    }
}

/* SSSE3 build of bgraify: convert n_pixels packed RGB pixels in rgb_buf
   to 4-byte B,G,R,0xff pixels in bgra_buf.

   rgb_buf and bgra_buf must be 16-byte aligned (a requirement of
   rgb_to_bgrx_sse), hold at least 3*n_pixels and 4*n_pixels bytes
   respectively, and must not overlap.

   rgb_to_bgrx_sse takes an `unsigned` pixel count and only converts
   whole groups of 16 pixels, so the work is fed to it in batches that
   fit in `unsigned`, and the final n_pixels % 16 pixels are converted
   bytewise here instead of being left unwritten. */
void bgraify(char *rgb_buf, char *bgra_buf, size_t n_pixels)
{
  /* Largest multiple of 16 representable in the kernel's count type. */
  const size_t max_batch = (size_t)(UINT_MAX / 16) * 16;
  const size_t vectorizable = n_pixels - n_pixels % 16;
  size_t done = 0;

  while (done < vectorizable) {
    size_t batch = vectorizable - done;
    if (batch > max_batch) batch = max_batch;
    /* done is a multiple of 16, so both offsets stay 16-byte aligned. */
    rgb_to_bgrx_sse((unsigned)batch, rgb_buf + done*3, bgra_buf + done*4);
    done += batch;
  }

  /* Scalar tail: at most 15 pixels, same byte layout as the kernel. */
  for (size_t i = done; i < n_pixels; i++) {
    const unsigned char *p = (const unsigned char *)&rgb_buf[i*3];
    unsigned char *q = (unsigned char *)&bgra_buf[i*4];
    q[0] = p[2];
    q[1] = p[1];
    q[2] = p[0];
    q[3] = 0xff;
  }
}
  
#else
/* Portable baseline build of bgraify: convert n_pixels packed RGB
   pixels in rgb_buf to 4-byte pixels in bgra_buf, one byte at a time.

   Each output pixel is written as B,G,R followed by 0xff.  The fourth
   byte is written explicitly so that the output never contains
   whatever happened to be in the destination buffer, and so that this
   build produces the same bytes as the SSSE3 build, which also forces
   that byte to 0xff.

   rgb_buf must hold 3*n_pixels bytes, bgra_buf 4*n_pixels bytes, and
   the two must not overlap. */
void bgraify(char *rgb_buf, char *bgra_buf, size_t n_pixels)
{
  const unsigned char *src = (const unsigned char *)rgb_buf;
  unsigned char *dst = (unsigned char *)bgra_buf;

  for (size_t i = 0; i < n_pixels; i++, src += 3, dst += 4) {
    /* Naïve approach for a baseline */
    dst[0] = src[2];
    dst[1] = src[1];
    dst[2] = src[0];
    dst[3] = 0xff;
  }
}
#endif

/* Expand an 8-bit PseudoColor image into 32-bit TrueColor: every
   source byte in img is used as an index into palette and the entry
   found there is stored to the corresponding slot of out.  Exactly
   n_pixels bytes are read and n_pixels pixels written.  Not currently
   exercised by the benchmark driver. */
void depalettize(u32 *palette, unsigned char *img, u32 *out, size_t n_pixels)
{
  unsigned char *src = img;
  u32 *dst = out;
  u32 *const end = out + n_pixels;

  while (dst != end) {
    *dst++ = palette[*src++];
  }
}

/* Expand a 1-bit-deep bitmap into 32-bit pixels, one bit at a time.

   palette  - two entries: palette[0] for clear bits, palette[1] for
              set bits.
   img      - packed bitmap, 8 pixels per byte.
   out      - receives 8 pixels per input byte.
   n_pixels - pixel count; only n_pixels/8 whole bytes are processed,
              so a remainder of 1-7 pixels is ignored.

   Within each byte the most significant bit becomes the first pixel:
   bit 0 is stored at out[8*i + 7] and bit 7 at out[8*i].  (The inner
   index previously ran 8..1 rather than 7..0, which skipped out[8*i]
   and wrote one pixel past the end of the last group.) */
void debitmapize(u32 *palette, unsigned char *img, u32 *out, size_t n_pixels)
{
  for (size_t i = 0; i < n_pixels/8; i++) {
    unsigned char c = img[i];
    u32 *group = &out[i*8];
    /* Walk j = 7, 6, ..., 0 while peeling bits off the low end of c. */
    for (size_t j = 8; j-- > 0; ) {
      group[j] = palette[1 & c];
      c >>= 1;
    }
  }
}

/* Table-driven debitmapize using GCC vector extensions.

   Idea: a 128-bit SSE register holds 16 bytes, i.e. 4 32-bit pixels,
   so the 16 possible combinations of 4 monochrome pixels fit in a
   256-byte table and each input nibble becomes a single 16-byte copy.
   (With 256-bit AVX one could instead precompute an 8-kibibyte table
   of the 256 possible 32-byte combinations of 8 pixels, which still
   fits in L1D$; avx_pixels is declared for that experiment but is not
   used by this function.)

   palette  - palette[0] for clear bits, palette[1] for set bits.
   img      - packed bitmap, 8 pixels per byte.
   out      - receives 8 pixels (32 bytes) per input byte.
   n_pixels - pixel count; only n_pixels/8 whole bytes are processed.

   For input byte i the low nibble fills out[8*i .. 8*i+3] and the
   high nibble fills out[8*i+4 .. 8*i+7]; within a nibble, bit j
   selects the colour of pixel j.  That is least-significant-bit-first
   order.  NOTE(review): debitmapize() peels bits the other way round
   (bit 0 lands on the highest index of the group) — confirm which
   order is actually wanted and make the two agree.

   The table is declared as an array of sse_pixels so that it has the
   alignment vector loads need; a u32 array cast to sse_pixels* is not
   guaranteed to be 16-byte aligned.  Stores go through memcpy so that
   out only needs ordinary u32 alignment.

   The stores previously went to op[i/2] and op[i/2 + 1] instead of
   op[2*i] and op[2*i + 1], so successive bytes overwrote each other
   and only about a quarter of the output buffer was ever written.
   Throughput measured with that indexing overstates the real speed.
 */
typedef u32 avx_pixels __attribute__((vector_size(32)));
typedef u32 sse_pixels __attribute__((vector_size(16)));
void debitmapize_sse(u32 *palette, unsigned char *img, u32 *out, size_t n_pixels)
{
  /* table[nibble] is the 4-pixel vector for that nibble's bit pattern. */
  sse_pixels table[16];
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 4; j++) {
      table[i][j] = i & (1 << j) ? palette[1] : palette[0];
    }
  }

  for (size_t i = 0; i < n_pixels/8; i++) {
    unsigned char c = img[i];
    memcpy(&out[i*8], &table[c & 0xf], sizeof table[0]);
    memcpy(&out[i*8 + 4], &table[c >> 4], sizeof table[0]);
  }
}

// Plural suffix for a count: "" when n is exactly 1, "s" for
// everything else (including zero and negative values).
static char *pl(int n)
{
  if (n != 1) {
    return "s";
  }
  return "";
}

// Command-line arguments, stored here by parse_arguments().
char *function;                 // either do_rgb or do_bit
char *do_rgb = "DeRGBifying";   // selected by -rgb; also the verb printed in the progress line
char *do_bit = "Debitmapizing"; // selected by -bit; also the verb printed in the progress line
int n;                          // how many times main() repeats the conversion
char *infile;                   // input file path (argv[2])
char *outfile;                  // output file path (argv[5])

// Parse `prog {-rgb|-bit} infile count -o outfile` into the globals
// function, infile, n and outfile.
//
// Returns 1 on success and 0 on failure.  On failure some of the
// globals may already have been assigned; callers should just print
// the usage message and exit.
//
// The repeat count must be a complete decimal integer in the range
// 1..INT_MAX.  Negative values are rejected because main() loops with
// `i != n`, which would never terminate for them; trailing junk such
// as "10x" is rejected rather than silently read as 10.
int parse_arguments(int argc, char **argv)
{
  if (argc != 6) return 0;

  if (0 == strcmp(argv[1], "-rgb")) {
    function = do_rgb;
  } else if (0 == strcmp(argv[1], "-bit")) {
    function = do_bit;
  } else {
    return 0;
  }

  infile = argv[2];

  char *end;
  errno = 0;
  long count = strtol(argv[3], &end, 10);
  if (end == argv[3] || *end != '\0' || ERANGE == errno) return 0;
  if (count <= 0 || count > INT_MAX) return 0;
  n = (int)count;

  if (strcmp(argv[4], "-o")) return 0;

  outfile = argv[5];

  return 1;
}

/* Write all len bytes of buf to fd, continuing after short writes and
   retrying when interrupted by a signal.  Returns 0 on success and -1
   on failure with errno describing the error. */
static int write_all(int fd, const char *buf, size_t len)
{
  while (len > 0) {
    ssize_t written = write(fd, buf, len);
    if (written < 0) {
      if (EINTR == errno) continue;
      return -1;
    }
    if (0 == written) {
      /* No progress and no error: give up instead of spinning. */
      errno = EIO;
      return -1;
    }
    buf += written;
    len -= (size_t)written;
  }
  return 0;
}

/* Benchmark driver: map the input file, run the selected conversion
   n times into a heap buffer, then write that buffer to the output
   file.  Returns 0 on success and 1 on any usage or I/O error.

   Sizes are kept in size_t so that inputs larger than 64 MiB no
   longer overflow the `size*32` / `size*8` computations. */
int main(int argc, char **argv)
{
  if (!parse_arguments(argc, argv)) {
    fprintf(stderr, "Usage: %s {-rgb|-bit} largefile 53 -o outfile\n"
            "Converts the RGB data or bit data in largefile to a BGRA memory buffer\n"
            "53 times to measure the speed of byteswapping.\n", argv[0]);
    return 1;
  }

  /* XXX copied and pasted from memcpycost.c */
  int f = open(infile, O_RDONLY);
  if (f < 0) {
    perror(infile);
    return 1;
  }
  off_t file_size = lseek(f, 0, SEEK_END);
  if (file_size < 0) {
    perror(infile);
    return 1;
  }
  if (0 == file_size) {
    /* mmap rejects zero-length mappings; say why instead of "mmap: Invalid argument". */
    fprintf(stderr, "%s: file is empty\n", infile);
    return 1;
  }
  if ((uintmax_t)file_size > SIZE_MAX / 32) {
    fprintf(stderr, "%s: file is too large\n", infile);
    return 1;
  }
  size_t size = (size_t)file_size;

  /* The mapping is page-aligned, which satisfies the 16-byte alignment
     the SSSE3 conversion path requires of its input. */
  char *in_mem = mmap(0, size, PROT_READ, MAP_SHARED, f, 0);
  if (MAP_FAILED == in_mem) {
    perror("mmap");
    return 1;
  }

  /* Plain data file, so no execute bits (subject to umask). */
  int outfd = open(outfile, O_WRONLY | O_TRUNC | O_CREAT, 0666);
  if (outfd < 0) {
    perror(outfile);
    return 1;
  }

  /* 1 input bit -> 4 output bytes, or 3 input bytes -> 4 output bytes. */
  size_t out_size = function == do_bit ? size*32 : size/3*4;
  /* Zero-filled so bytes a conversion does not touch are never
     uninitialised heap contents; at least 1 byte so a zero-sized
     result is not mistaken for allocation failure. */
  char *p = calloc(out_size ? out_size : 1, 1);
  if (!p) {
    perror("malloc");
    return 1;
  }

  printf("%s %zu input byte%s into %zu output byte%s %d time%s\n",
         function, size, pl(size == 1 ? 1 : 2),
         out_size, pl(out_size == 1 ? 1 : 2), n, pl(n));
  if (function == do_bit) {
      u32 palette[] = { 0x01020304, 0xf5f6f7f8 };

      for (int i = 0; i != n; i++) {
        debitmapize_sse(palette, (unsigned char*)in_mem, (u32*)p, size*8);
      }

  } else if (function == do_rgb) {
      for (int i = 0; i != n; i++) {
        bgraify(in_mem, p, size/3);
      }

  } else {
    fprintf(stderr, "Internal error: invalid function\n");
    return 1;
  }

  if (write_all(outfd, p, out_size) < 0) {
    perror("Writing output");
    return 1;
  }
  if (close(outfd) < 0) {
    perror(outfile);
    return 1;
  }

  free(p);
  munmap(in_mem, size);
  close(f);
  return 0;
}