/* BGRAify: dumb microbenchmark of pixel format conversion. Get a rough measure of the performance we can expect for converting between pixel formats. The background of this is that some graphics APIs and protocols (the X11 X-Windows protocol, Xlib, VNC, etc.) require producers of graphics to convert their pixel images into the pixel format used by the actual graphics hardware, which nowadays on laptops is invariably 32-bit BGRA, generally ignoring the A. This is sort of an optimization: if the interface required a particular pixel format that happened to not match the hardware, the software implementing the protocol (the X server or whatever) would have to always do the pixel format conversion. By matching the pixel format at the interface to the hardware’s pixel format, the conversion could be done “upstream” and possibly eliminated. This is not guaranteed to actually be faster, because (especially with BGRA) it results in a larger bulk of data being transferred across the interface (copied to and from shared memory, or sent across a network cable) and possibly more data shuffled around inside the display driver code, if the framebuffer memory isn’t arranged in the same way as the graphics data from applications. So I wanted to know, quantitatively, about how much time it takes. I wrote this program to do a couple of pixel format conversions. Measurements ------------ I haven’t verified that the output data is actually correct! 
But if I didn’t fuck it up the results look like this, using Debian GCC 12.2.0-14 with -O on a Ryzen 5 3500U: $ time ./bgraify -bit 10mfile 1000 -o 10mfile.bit Debitmapizing 10485760 input bytes into 335544320 output bytes 1000 times real 0m12.700s user 0m12.164s sys 0m0.482s $ time ./bgraify -bit 10mfile 1000 -o 10mfile.bit Debitmapizing 10485760 input bytes into 335544320 output bytes 1000 times real 0m13.043s user 0m12.353s sys 0m0.585s $ time ./bgraify -bit 10mfile 1000 -o 10mfile.bit Debitmapizing 10485760 input bytes into 335544320 output bytes 1000 times real 0m12.819s user 0m12.216s sys 0m0.588s That works out to about 25.8–26.4 gigabytes per second of output. Memcpy on the same machine is about 10–11 gigabytes per second, so that’s 2½× as fast as memcpy. It’s using GCC vector extensions, which compile to SSE instructions, to achieve that speed: 12f1: 83 e6 0f and $0xf,%esi 12f4: 48 c1 e6 04 shl $0x4,%rsi 12f8: 66 0f 6f 44 34 88 movdqa -0x78(%rsp,%rsi,1),%xmm0 12fe: 41 0f 29 04 0a movaps %xmm0,(%r10,%rcx,1) 1303: c0 e8 04 shr $0x4,%al 1306: 0f b6 c0 movzbl %al,%eax 1309: 48 c1 e0 04 shl $0x4,%rax 130d: 66 0f 6f 4c 04 88 movdqa -0x78(%rsp,%rax,1),%xmm1 1313: 41 0f 29 4c 0a 10 movaps %xmm1,0x10(%r10,%rcx,1) The actual BGRAifying of RGB data is much slower: $ time ./bgraify -rgb 10mfile 1000 -o 10mfile.bit; time ./bgraify -rgb 10mfile 1000 -o 10mfile.bit; time ./bgraify -rgb 10mfile 1000 -o 10mfile.bit DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times real 0m3.360s user 0m3.227s sys 0m0.133s DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times real 0m3.280s user 0m3.221s sys 0m0.033s DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times real 0m3.277s user 0m3.210s sys 0m0.033s That’s about 4.1–4.3 gigabytes per second. This is just under half of the speed of memcpy. 
That’s doing nothing fancy at all, just using bytewise memory access: 11f8: 0f b6 50 02 movzbl 0x2(%rax),%edx 11fc: 88 16 mov %dl,(%rsi) 11fe: 0f b6 50 01 movzbl 0x1(%rax),%edx 1202: 88 56 01 mov %dl,0x1(%rsi) 1205: 0f b6 10 movzbl (%rax),%edx 1208: 88 56 02 mov %dl,0x2(%rsi) 120b: 48 83 c0 03 add $0x3,%rax 120f: 48 83 c6 04 add $0x4,%rsi So presumably you could approach the speed of memcpy using things like PSHUFB. [There’s a Stack Overflow question][0] with various approaches to making it faster, including a promising-looking implementation using Intel SSSE3 CPU intrinsics by caf. But that wouldn’t compile for ARM or for older AMD64 CPUs. So, I put it behind a `-Duse_intel_intrinsics` compile flag and tried it. Compiled with `-O -mssse3 -Duse_intel_intrinsics` I get these results: $ time ./bgraify -rgb 10mfile 1000 -o 10mfile.rgb DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times real 0m1.806s user 0m1.766s sys 0m0.041s $ time ./bgraify -rgb 10mfile 1000 -o 10mfile.rgb DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times real 0m1.851s user 0m1.814s sys 0m0.037s $ time ./bgraify -rgb 10mfile 1000 -o 10mfile.rgb DeRGBifying 10485760 input bytes into 13981012 output bytes 1000 times real 0m1.834s user 0m1.800s sys 0m0.033s That works out to 7.5–7.7 gigabytes per second, about 80% faster than the bytewise approach, and 75% of the speed of memcpy. [0]: https://stackoverflow.com/questions/7194452/fast-vectorized-conversion-from-rgb-to-bgra Calculations ------------ For large (out-of-cache) images like a whole framebuffer, I’d think that the reduction in data volume would usually compensate for the 25% slowdown implied in using the SSSE3 shuffling code. Like, say you have a megapixel display (one *million* pixels). If you send four BGRA megabytes to the X server, the X server probably has to memcpy them at least once to get them into the framebuffer, costing 380μs. 
If things work out so that it has to memcpy them twice, for example because the X client is using a shared memory segment but can't build the pixels directly in it, or because your network card driver isn't zero-copy, it’s 760μs. If you instead send three RGB megabytes to the X server, it can unpack them into the framebuffer, which costs about 530μs. But if a second memcpy is involved before unpacking RGB into the framebuffer, that second memcpy only takes 290μs, for a total of 820μs, about 8% worse. And if a third memcpy comes into play, the RGB approach definitely wins. Discussion ---------- The 1-bit-deep bitmap unpacking at 2.5× the speed of memcpy is a surprise to me. That means that, if you’re doing something with bitmap images, it is probably a big speedup to keep them in bitmap form until as close to the framebuffer as you can possibly get it. The fact that the RGB format is a tie with SSSE3 once the number of memcpys is only 2 was also a bit of a surprise. And I suspect that, with AVX handling 32 or 64 pixels at a time instead of 16, or with NEON doing the channel shuffling during load and store instructions, it could be a win even with 1 memcpy. This suggests that probably a 256-color “PseudoColor” paletted format would also be faster than memcpy, although I haven’t tested it. */ #include #include #include #include #include #include #include #include #ifdef use_intel_intrinsics #include #endif typedef uint32_t u32; #ifdef use_intel_intrinsics // caf’s SSSE3 Intel intrinsics implementation (CC BY-SA, from the // above link). Evidently w is the number of pixels, and gets reduced // to a number of 16-pixel chunks, which are 48 bytes on the input? // You’d need special handling for the up to 15 pixels in the tail. // But that isn’t significant for our benchmarking purposes here. 
/* in and out must be 16-byte aligned */
/*
 * Convert packed 24-bit RGB pixels to 32-bit BGRX, 16 pixels (48 input
 * bytes, 64 output bytes) per iteration, using SSSE3 byte shuffles.
 *
 *   w    number of pixels; rounded DOWN to a multiple of 16, so up to
 *        15 trailing pixels are left untouched (bgraify() below takes
 *        care of them).
 *   in   RGB input, read as aligned 128-bit vectors.
 *   out  BGRX output, written as aligned 128-bit vectors; the fourth
 *        byte of every pixel is set to 0xff.
 */
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
  const __m128i *in_vec = in;
  __m128i *out_vec = out;

  /* 0xff in the top byte of each 32-bit pixel, zero elsewhere. */
  const __m128i alpha = _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0,
                                     0xff, 0, 0, 0, 0xff, 0, 0, 0);
  /* Masks selecting bytes 8..15 and bytes 0..7 of a vector. */
  const __m128i keep_hi = _mm_set_epi8(0xff, 0xff, 0xff, 0xff,
                                       0xff, 0xff, 0xff, 0xff,
                                       0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i keep_lo = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                       0xff, 0xff, 0xff, 0xff,
                                       0xff, 0xff, 0xff, 0xff);

  w /= 16;

  while (w-- > 0) {
    /*             0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
     * in_vec[0]  Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
     * in_vec[1]  Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
     * in_vec[2]  Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
     *
     * In the shuffle selectors below, 0xff (top bit set) produces a
     * zero byte, which the OR with `alpha` then turns into 0xff.
     */
    __m128i in1, in2, in3;
    __m128i out;

    /* Pixels a..d come entirely from in_vec[0]. */
    in1 = in_vec[0];
    out = _mm_shuffle_epi8(in1,
        _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 5, 0xff, 0, 1, 2));
    out = _mm_or_si128(out, alpha);
    out_vec[0] = out;

    /* Pixels e..h: high half of in_vec[0] plus low half of in_vec[1]. */
    in2 = in_vec[1];
    in1 = _mm_and_si128(in1, keep_hi);
    out = _mm_and_si128(in2, keep_lo);
    out = _mm_or_si128(out, in1);
    out = _mm_shuffle_epi8(out,
        _mm_set_epi8(0xff, 5, 6, 7, 0xff, 2, 3, 4, 0xff, 15, 0, 1, 0xff, 12, 13, 14));
    out = _mm_or_si128(out, alpha);
    out_vec[1] = out;

    /* Pixels i..l: high half of in_vec[1] plus low half of in_vec[2]. */
    in3 = in_vec[2];
    in_vec += 3;
    in2 = _mm_and_si128(in2, keep_hi);
    out = _mm_and_si128(in3, keep_lo);
    out = _mm_or_si128(out, in2);
    out = _mm_shuffle_epi8(out,
        _mm_set_epi8(0xff, 1, 2, 3, 0xff, 14, 15, 0, 0xff, 11, 12, 13, 0xff, 8, 9, 10));
    out = _mm_or_si128(out, alpha);
    out_vec[2] = out;

    /* Pixels m..p come entirely from in_vec[2]. */
    out = _mm_shuffle_epi8(in3,
        _mm_set_epi8(0xff, 13, 14, 15, 0xff, 10, 11, 12, 0xff, 7, 8, 9, 0xff, 4, 5, 6));
    out = _mm_or_si128(out, alpha);
    out_vec[3] = out;

    out_vec += 4;
  }
}
#endif /* use_intel_intrinsics */

/* Bytewise RGB -> BGRA conversion of n_pixels pixels.  Used as the
   naïve baseline and to finish the tail the SSSE3 kernel skips.  The
   fourth byte is set to 0xff, matching rgb_to_bgrx_sse, so that no
   uninitialised memory ends up in the output. */
static void bgraify_scalar(const char *rgb, char *bgra, size_t n_pixels)
{
  for (size_t i = 0; i != n_pixels; i++) {
    const char *p = &rgb[i*3];
    char *q = &bgra[i*4];
    q[0] = p[2];
    q[1] = p[1];
    q[2] = p[0];
    q[3] = (char)0xff;
  }
}

#ifdef use_intel_intrinsics
/* Convert n_pixels RGB pixels in rgb_buf to BGRA in bgra_buf.  Both
   buffers must be 16-byte aligned (requirement of rgb_to_bgrx_sse). */
void bgraify(char *rgb_buf, char *bgra_buf, size_t n_pixels)
{
  /* rgb_to_bgrx_sse takes an `unsigned` count, so feed it bounded
     chunks.  Every chunk is a multiple of 16 pixels, which keeps both
     pointers 16-byte aligned (16 px = 48 input / 64 output bytes). */
  const size_t max_chunk = (size_t)1 << 30;
  size_t done = 0;

  while (n_pixels - done >= 16) {
    size_t chunk = n_pixels - done;
    if (chunk > max_chunk) chunk = max_chunk;
    chunk -= chunk % 16;
    rgb_to_bgrx_sse((unsigned)chunk, rgb_buf + done*3, bgra_buf + done*4);
    done += chunk;
  }

  /* Up to 15 leftover pixels. */
  bgraify_scalar(rgb_buf + done*3, bgra_buf + done*4, n_pixels - done);
}
#else
/* Convert n_pixels RGB pixels in rgb_buf to BGRA in bgra_buf. */
void bgraify(char *rgb_buf, char *bgra_buf, size_t n_pixels)
{
  /* Naïve approach for a baseline */
  bgraify_scalar(rgb_buf, bgra_buf, n_pixels);
}
#endif

/* Not currently tested.  Renders an 8-bit PseudoColor image into
   32-bit TrueColor: out[i] = palette[img[i]] for each of n_pixels
   pixels.  palette must cover every index that occurs in img. */
void depalettize(u32 *palette, unsigned char *img, u32 *out, size_t n_pixels)
{
  for (size_t i = 0; i < n_pixels; i++) {
    out[i] = palette[img[i]];
  }
}

/* Expand a 1-bit-deep bitmap into 32-bit pixels, one bit at a time.
   Bits are taken most-significant first: bit 7 of img[i] becomes
   out[i*8] and bit 0 becomes out[i*8 + 7].  A clear bit maps to
   palette[0], a set bit to palette[1].  Only whole bytes are
   processed; n_pixels % 8 trailing pixels are ignored.
   NOTE: debitmapize_sse below uses the opposite (LSB-first) bit order. */
void debitmapize(u32 *palette, unsigned char *img, u32 *out, size_t n_pixels)
{
  for (size_t i = 0; i < n_pixels/8; i++) {
    unsigned char c = img[i];
    for (size_t j = 0; j < 8; j++) {
      /* Indices run i*8 .. i*8+7, never touching out[i*8 + 8]. */
      out[i*8 + j] = palette[(c >> (7 - j)) & 1];
    }
  }
}

/* I think that maybe with GCC vector attributes and 256-bit AVX we
   could maybe debitmapize faster?  256-bit AVX registers can hold 32
   bytes of pixel data, which is 8 32-bit pixels.  We could precompute
   an 8-kibibyte table of the 256 possible 32-byte combinations of 8
   monochrome pixels and index into it and copy.  8 kibibytes fits in
   L1D$.

   Or, 128-bit SSE registers can hold 16 bytes containing 4 32-bit
   pixels, and the 16 possible combinations of 4 pixels would only be
   a 256-byte table.
*/

typedef u32 avx_pixels __attribute__((vector_size(32)));
typedef u32 sse_pixels __attribute__((vector_size(16)));

/* Same layout as sse_pixels, but only u32-aligned and allowed to alias
   other types, so it can be stored through an arbitrary u32 pointer. */
typedef u32 sse_pixels_u
  __attribute__((vector_size(16), aligned(4), may_alias));

/* Expand a 1-bit-deep bitmap into 32-bit pixels, four pixels per
   vector store, via a 16-entry table indexed by nibble.  Bits are
   taken least-significant first: bit 0 of img[i] becomes out[i*8] and
   bit 7 becomes out[i*8 + 7].  A clear bit maps to palette[0], a set
   bit to palette[1].  Only whole bytes are processed; n_pixels % 8
   trailing pixels are ignored.

   Each input byte fills TWO output vectors, 2*i and 2*i + 1.  The
   previous indexing (i/2 and i/2 + 1) kept overwriting the same slots
   and filled only about a quarter of the output, so the bitmap
   throughput figures in the file header overstate the real speed. */
void debitmapize_sse(u32 *palette, unsigned char *img, u32 *out, size_t n_pixels)
{
  /* Declared as vectors, so the table has the alignment the vector
     loads need; no pointer cast from a u32 array is involved. */
  sse_pixels table[16];
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 4; j++) {
      table[i][j] = (i & (1 << j)) ? palette[1] : palette[0];
    }
  }

  sse_pixels_u *op = (void*)out;

  for (size_t i = 0; i < n_pixels/8; i++) {
    unsigned char c = img[i];
    op[2*i] = table[c & 0xf];     /* pixels 0..3: low nibble */
    op[2*i + 1] = table[c >> 4];  /* pixels 4..7: high nibble */
  }
}

// Pluralize a word: returns the suffix to append, "" for exactly one
// and "s" for anything else.
static char *pl(int n)
{
  return n == 1 ? "" : "s";
}

// Command-line arguments.
char *function; // either do_bgra or do_bit char *do_rgb = "DeRGBifying"; char *do_bit = "Debitmapizing"; int n; char *infile; char *outfile; // Returns 0 on failure. int parse_arguments(int argc, char **argv) { if (argc != 6) return 0; if (0 == strcmp(argv[1], "-rgb")) { function = do_rgb; } else if (0 == strcmp(argv[1], "-bit")) { function = do_bit; } else { return 0; } infile = argv[2]; n = atoi(argv[3]); if (0 == n) return 0; if (strcmp(argv[4], "-o")) return 0; outfile = argv[5]; return 1; } int main(int argc, char **argv) { if (!parse_arguments(argc, argv)) { fprintf(stderr, "Usage: %s {-rgb|-bit} largefile 53 -o outfile\n" "Converts the RGB data or bit data in largefile to a BGRA memory buffer\n" "53 times to measure the speed of byteswapping.\n", argv[0]); return 1; } /* XXX copied and pasted from memcpycost.c */ int f = open(infile, O_RDONLY); if (f < 0) { perror(infile); return 1; } int size = lseek(f, 0, SEEK_END); char *in_mem = mmap(0, size, PROT_READ, MAP_SHARED, f, 0); if (MAP_FAILED == in_mem) { perror("mmap"); return 1; } int outfd = open(outfile, O_WRONLY | O_TRUNC | O_CREAT, 0777); if (outfd < 0) { perror(outfile); return 1; } int out_size = function == do_bit ? size*32 : size/3*4; char *p = malloc(out_size); if (!p) { perror("malloc"); return 1; } printf("%s %d input byte%s into %d output byte%s %d time%s\n", function, size, pl(size), out_size, pl(out_size), n, pl(n)); if (function == do_bit) { u32 palette[] = { 0x01020304, 0xf5f6f7f8 }; for (int i = 0; i != n; i++) { debitmapize_sse(palette, (unsigned char*)in_mem, (u32*)p, size*8); } } else if (function == do_rgb) { for (int i = 0; i != n; i++) { bgraify(in_mem, p, size/3); } } else { fprintf(stderr, "Internal error: invalid function\n"); return 1; } errno = 0; if (out_size != write(outfd, p, out_size)) { perror("Writing output"); return 1; } return 0; }