/* Test speed of proportional pixel-font rendering in C. * * To the extent possible under law, Kragen Javier Sitaker has waived * all copyright and related or neighboring rights to this code. This * work is published from Argentina. For more information, please see * . * * Performance tests compiled with: cc -O -Wall -Werror -std=gnu99 propfont.c -o propfont * To generate an assembly listing: cc -fverbose-asm -Wa,-adhln=propfont.lst -g -Os -Wall -std=gnu99 \ propfont.c -o propfont * Basically, the objective here is to come up with some kind of upper * limit on how fast computers can reasonably format text from plain * ASCII to plain pixels. Surely you can do 128 kilobytes per second; * that’s just repainting a VT100 at 64Hz. But can you render 1 * megabyte per second on a modern machine? 8? 64? 512? 4096? I * want to get to where memcpy bandwidth into the output pixel buffers is * the bottleneck. * * This code does a little more than a VT100, in that it does * proportional font rendering and word wrap. * * Current results have it rendering 12.6 megabytes in 1.05" on my * netbook, so 12 megabytes of text per second, * including word wrapping, but always rendered into the same buffer. * On my laptop, it runs at 70 megabytes of text per second, * about seven times as fast. * This * is far from memcpy-limited! * 12.6 megabytes of text input is about 260 megapixels of pixel output, * at one byte per pixel. * Even my netbook is capable of 1700 megabytes per second from main memory! * On a newer but lower-power laptop (Pentium N3700 1.6GHz) it does 126 * megabytes in 5.7", 22 megabytes per second. * * Possible ways to speed this up further: * * - Maybe have the word-wrap code store the glyph offsets and byte * widths into a “compositing stick” that the rendering code can * then use, rather than recomputing them. * * - Use the GPU! * * - Compile each line of text into machine code that copies the * relevant part of the glyph into the relevant part of the image. * This hoists the memcpy-size test out of the inner loop even more * effectively. * * - Maybe initially compose the pixels as bits using shifts, only * translating to bytes or whole pixels later (with, say, a * 256-by-8-pixel lookup buffer)? */ #include #include #include #include /* I designed this 6-pixel-tall proportional square-pixel pixel font * (based, obviously, on the work of others) in 2012 for * * and rendered the KJV Bible with it at * . This is the output * of, basically, pngtopnm | pnmtopgm | pgmtopbm | pbmtoxbm. Though I * haven’t read the XBM spec, it seems to be organized as 7 sequential * rows of 47 bytes, each containing 8 horizontal pixels in * little-endian order (376 in total, even though the original image * is only 374 pixel wide), which are 1 for ink (originally black) or * 0 for paper. */ #define propfont_width 374 #define propfont_height 7 const static char propfont_bits[] = { 0x54,0x8a,0x84,0x48,0x29,0x00,0x20,0x91,0x5c,0x9d,0x9d,0x09,0x20,0xc4,0x38, 0x19,0x99,0x3b,0xab,0xa3,0x8a,0x28,0xc9,0xc8,0xd8,0x55,0x45,0x55,0xdf,0x62, 0x82,0x40,0x00,0x01,0x41,0xa4,0x18,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x40, 0x85,0x12,0x54,0xdf,0x51,0x29,0x12,0x01,0xa0,0x5a,0x51,0x45,0x50,0x55,0x92, 0x09,0xa5,0xaa,0xaa,0x88,0x28,0xa1,0x8a,0x6d,0x55,0x55,0x85,0x54,0x55,0x55, 0x48,0x42,0x05,0xd9,0x98,0x89,0xd8,0x80,0x92,0x65,0x64,0xac,0xec,0xaa,0xa2, 0xaa,0x27,0x49,0x19,0x04,0xca,0x88,0x20,0xba,0xc3,0x90,0x13,0xc9,0xcd,0x88, 0x18,0x08,0x90,0xb4,0x9b,0xa8,0x99,0x3a,0xa1,0x89,0xaa,0xd5,0xd4,0x88,0x54, 0x55,0x22,0x44,0x44,0x00,0x54,0x45,0xd5,0x55,0xa5,0x91,0xaa,0xaa,0x6a,0x46, 0xaa,0x2a,0x29,0x32,0x18,0x18,0x00,0x1f,0x45,0x21,0x12,0x01,0x88,0x92,0x10, 0x51,0x45,0x11,0x90,0x09,0x84,0xaa,0xaa,0x88,0x2a,0xa9,0x8a,0x2a,0x55,0x54, 0x91,0xd4,0x6d,0x25,0x42,0x48,0x00,0x54,0x45,0x8d,0x58,0xa5,0x92,0xaa,0x6a, 0x2a,0x48,0xaa,0x2a,0x31,0x21,0x09,0x18,0x04,0xca,0x91,0x22,0x2a,0x10,0x0a, 0xd9,0x0d,0x8d,0xc4,0x4c,0x22,0x84,0xb8,0x1a,0x99,0x0b,0xab,0x93,0xba,0x28, 0x49,0x48,0x8d,0x9c,0x44,0x25,0xdf,0x68,0x00,0xd8,0x98,0x99,0x50,0xa5,0xba, 0xaa,0x24,0x2c,0x86,0x4c,0x94,0xa2,0x27,0x09,0x10,0x00,0x80,0x00,0x40,0x01, 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x0c, 0x10,0x00,0x00,0x20,0x08,0x00,0x00,0x00,0x18,0x40,0x04,0x10,0x8a,0x20,0x22, 0x94,0x44,0x24,0x45,0x24,0x22,0x22,0x22,0xa2,0x44,0x22,0x42,0x44,0x44,0x44, 0x44,0x44,0x44,0x10,0x22,0x22,0x22,0x22,0x82,0x88,0x20,0x91,0x48,0x22,0x22, 0x22,0x22,0x4a,0x44,0x10,0x11,0x11,0x11,0x11,0x41,0x44,0x88,0x22,0x24}; enum { propfont_glyphs = 96 }; /* Safely convert an ASCII byte into a glyph index for propfont. */ static inline int _glyph_index_from_ascii(char c) { unsigned x = c; if (x == '\n') x = 127; x -= ' '; return x > 127 - ' ' ? 127 : x; } #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) static char glyph_indices[256]; static void setup_glyph_indices() { for (int i = 0; i < ARRAY_SIZE(glyph_indices); i++) { glyph_indices[i] = _glyph_index_from_ascii(i); } } static inline int glyph_index_from_ascii(char c) { return glyph_indices[(int)(unsigned char)c]; } /* Return true if the propfont pixel at x, y has ink */ static inline int propfont_ink(int x, int y) { return (propfont_bits[y * ((propfont_width + 7) / 8) + x / 8] >> x % 8) & 1; } static int *propfont_columns = 0; static void setup_propfont_columns() { static int columns[propfont_glyphs + 1]; /* Compute char widths from last row at startup */ int propfont_index = 1; /* char 0, the space, starts at 0 */ for (int x = 0; x < propfont_width; x++) { if (propfont_ink(x, propfont_height - 1)) { assert(propfont_index < ARRAY_SIZE(columns)); columns[propfont_index++] = x + 1; } } assert(propfont_index == propfont_glyphs + 1); /* otherwise, invalid font */ columns[propfont_glyphs] = propfont_width; /* quasi-sentinel */ propfont_columns = columns; } /* Compute the pixel width in propfont of ASCII char c. Also return * the column where it starts via passed-by-reference int column. */ static inline int pixel_width_from_glyph_index(int glyph_index, int *column) { int *columns = propfont_columns; int col = columns[glyph_index]; *column = col; return columns[glyph_index + 1] - col; } static int columns_by_ascii[256], pixel_widths_by_ascii[256]; static inline void setup_ascii_tables() { for (int c = 0; c < 256; c++) { int glyph_index = glyph_index_from_ascii(c); int *column = &columns_by_ascii[c]; pixel_widths_by_ascii[c] = pixel_width_from_glyph_index(glyph_index, column); } } static inline int pixel_width_from_ascii(unsigned char ascii_char, int *column) { *column = columns_by_ascii[(int)ascii_char]; return pixel_widths_by_ascii[(int)ascii_char]; } /* This is sort of like a two-dimensional, dynamically-typed pointer * into pixels. */ typedef struct { char *pixels; int pixel_bytes, line_stride; } pixel_buf; static inline char *index_pixbuf(pixel_buf buf, int x, int y) { return buf.pixels + y * buf.line_stride + x * buf.pixel_bytes; } static inline pixel_buf offset_pixbuf(pixel_buf buf, int dx, int dy) { pixel_buf result = { index_pixbuf(buf, dx, dy), buf.pixel_bytes, buf.line_stride, }; return result; } static inline void fill_pixbuf(pixel_buf buf, char *ink, int w, int h) { for (int x = 0; x < w; x++) { memcpy(index_pixbuf(buf, x, 0), ink, buf.pixel_bytes); } for (int y = 0; y < h; y++) { memcpy(index_pixbuf(buf, 0, y), index_pixbuf(buf, 0, 0), w * buf.pixel_bytes); } } /* Render a version of propfont at a given size, suitable for * memcpying chunks of pixels from later. */ static void render_propfont(pixel_buf buf, char *ink_pixel, char *paper_pixel) { setup_propfont_columns(); setup_glyph_indices(); setup_ascii_tables(); for (int y = 0; y < propfont_height; y++) { char *pixel = buf.pixels + buf.line_stride * y; for (int x = 0; x < propfont_width; x++) { char *color = propfont_ink(x, y) ? ink_pixel : paper_pixel; memcpy(pixel, color, buf.pixel_bytes); pixel += buf.pixel_bytes; } } } /* A version of memcpy optimized for * the very short chunks of bytes we’re mostly copying here. Using * this makes the program as a whole run twice as fast. * GCC is smart enough to use optimized versions of memcpy for a * particular constant number of bytes if it knows what that number * is. So this code looks sort of stupid but it actually makes a big * performance difference. */ static void short_memcpy(char *dest, const char *src, size_t nbytes) { if (nbytes == 4) { memcpy(dest, src, 4); } else if (nbytes < 4) { if (nbytes == 2) { memcpy(dest, src, 2); } else if (nbytes < 2) { if (nbytes == 1) { *dest = *src; } } else { /* nbytes == 3 */ memcpy(dest, src, 3); } } else { memcpy(dest, src, nbytes); } } /* Render the given line of ASCII into the given pixel buffer. */ static void render_line(char *line, int len, pixel_buf font, pixel_buf output) { int column; assert(output.pixel_bytes == font.pixel_bytes); for (int i = 0; i < len; i++) { int pixel_width = pixel_width_from_ascii(line[i], &column); int glyph_width = output.pixel_bytes * pixel_width; for (int y = 0; y < propfont_height - 1; y++) { short_memcpy(index_pixbuf(output, 0, y), index_pixbuf(font, column, y), glyph_width); } output = offset_pixbuf(output, pixel_width, 0); } } /* Return the number of characters from this line that can fit into * the given pixel width, using a crude word-wrap. */ static int line_word_wrap_point(char *line, int len, int pixel_width) { int i, x = 0, last_break_point = -1, column; for (i = 0; i < len; i++) { if (line[i] == '\n') return i; x += pixel_width_from_ascii(line[i], &column); if (x > pixel_width) { return last_break_point == -1 ? i - 1 : last_break_point; } if (line[i] == ' ') { last_break_point = i; } else if (line[i] == '-') { last_break_point = i+1; } } return i; } static void render_text(char *text, int len, pixel_buf font, pixel_buf output, int pixel_width) { int line_start = 0; int lineno = 0; while (line_start < len) { char *line = text + line_start; int line_len = line_word_wrap_point(line, len - line_start, pixel_width); render_line(line, line_len, font, output); line_start += line_len; if (line_start < len && (text[line_start] == '\n' || (text[line_start] == ' ' && line_start > 0 && text[line_start-1] != '\n' ) ) ) { line_start++; } output = offset_pixbuf(output, 0, propfont_height - 1); lineno++; } } /* PPM P6 file format; see ; copied from */ static void ppm_header(int ww, int hh) { printf("P6\n%d %d\n255\n", ww, hh); } static void ppm_pixel(unsigned char r, unsigned char g, unsigned char b) { putchar(r); putchar(g); putchar(b); } enum { canvas_width = 96, canvas_height = 96 }; int main() { char buf[canvas_height * canvas_width] = {0}; pixel_buf canvas = { buf, 1, canvas_width }; pixel_buf dest = offset_pixbuf(canvas, 2, 2); char fontbuf[propfont_height * propfont_width]; pixel_buf font = { fontbuf, 1, propfont_width }; char black = 0, white = 255, gray = 127; render_propfont(font, &black, &white); char *hello = "hello, world\nThis is a test of tiny text rendering!\n" "And it was an I'll-be-gobsmacked-upon-the-butt moment when it worked!\n" "Yay!"; fill_pixbuf(canvas, &gray, canvas_width - 1, canvas_height); int n = 100*1000; int len = strlen(hello); fprintf(stderr, "running %d iterations of rendering %d bytes\n", n, len); for (int i = 0; i < n; i++) { render_text(hello, len, font, dest, canvas_width - 2); } ppm_header(canvas_width, canvas_height); for (int i = 0; i < ARRAY_SIZE(buf); i++) { ppm_pixel(buf[i], buf[i], buf[i]); } return 0; }