/* Wercaμ: a microscopic version of Wercam, to see if the performance * figures I’ve been bruiting about are plausible. * * XXX not yet done * * Wercaμ uses xshmu (q.v.) to draw a window on the X11 screen, and in * that window it α-composites windows (using the Porter–Duff “over” * operation) from various sources. */ #include #include #include #include #include #include "xshmu.h" typedef uint8_t u8; typedef uint16_t u16; typedef int16_t s16; /* XXX this is not necessarily compatible with xshmu’s documented interface; depends on endianness */ typedef struct { u8 rr, gg, bb, aa; } pixel; static inline void over_line(pixel *bg, pixel *fg, int n) { for (int ii = n; ii; ii--) { /* This inner loop probably needs rewriting with SSE intrinsics both for speed and for saturation on overflow. Currently it takes 1.514/20480/10240 = 7.2 ns per pixel. That’s only 140 megapixels per second per core. */ u16 transparency = 256 - fg->aa; pixel result = { .rr = ((bg->rr * transparency) >> 8) + fg->rr, .gg = ((bg->gg * transparency) >> 8) + fg->gg, .bb = ((bg->bb * transparency) >> 8) + fg->bb, .aa = ((bg->aa * transparency) >> 8) + fg->aa, }; *bg = result; bg++; fg++; } } typedef struct { pixel *contents; s16 xx, yy; u16 ww, hh; } window; /* Ordered back-to-front */ typedef struct { window *windows; u16 ww, hh, n_windows; } screen; typedef struct { s16 winidx, yy; enum { start_win, stop_win } op; } winop; static int winop_ordering(const void *a, const void *b) { const winop *wa = a, *wb = b; int diff = wa->yy - wb->yy; if (diff) return diff; /* ensure zero-height windows start before stopping */ return wa->op == stop_win ? 1 : -1; } static inline int int_max(int a, int b) { return a > b ? a : b; } static inline int int_min(int a, int b) { return a > b ? b : a; } static inline void composite_screen(xshmu_pic out, screen scr) { int n_ops = scr.n_windows * 2; winop ops[n_ops]; for (int ii = 0; ii < scr.n_windows; ii++) { window *win = &scr.windows[ii]; winop *wo = &ops[ii*2]; winop wostart = { ii, win->yy, start_win }; *wo = wostart; winop wostop = { ii, win->yy + win->hh, stop_win }; wo[1] = wostop; } qsort(ops, n_ops, sizeof(ops[0]), winop_ordering); /* A Z-sorted display list of the windows active on each line */ struct { u16 idx, yy; } active[scr.n_windows]; u16 n_active = 0; int pc = 0; /* Iterate over screen scan lines */ for (int yy = 0; yy < scr.hh; yy++) { /* Update active windows list for this scan line */ while (pc < n_ops && ops[pc].yy <= yy) { u16 winidx = ops[pc].winidx; int pos = 0; while (pos < n_active && active[pos].idx < winidx) pos++; if (ops[pc].op == start_win) { /* insert window into active list */ memmove(&active[pos+1], &active[pos], (n_active - pos) * sizeof(active[0])); active[pos].idx = winidx; /* yy may be negative on the first scan line. */ active[pos].yy = int_max(0, -scr.windows[winidx].yy); n_active++; } else { /* delete window from active list */ assert(active[pos].idx == winidx); memmove(&active[pos], &active[pos+1], (n_active - pos - 1) * sizeof(active[0])); n_active--; } pc++; } /* Compute the line pixels. If the foremost window or windows are opaque, we could avoid the work of drawing the background pixels and then multiplying them away to nothing, but we don’t. */ pixel *line = (pixel*)xshmu_pix(out, 0, yy); for (int wi = 0; wi < n_active; wi++) { /* XXX THIS BULLSHIT IS REALLY STRONG EVIDENCE THAT xshmu_pic IS THE RIGHT THING */ /* but maybe int_min goes into xshmu.c? */ window *w = &scr.windows[active[wi].idx]; int left_clip = int_max(0, -w->xx); pixel *bg = line + w->xx + left_clip; pixel *fg = w->contents + w->ww * active[wi].yy + left_clip; int n = int_min(out.w, int_min(w->ww, scr.ww - w->xx) - left_clip); /* Try to optimize out the first compositing operation with a * `memcpy` if there is a background window */ if (wi == 0) { if (bg == line && n == scr.ww) { memcpy(bg, fg, n * sizeof(*bg)); } else { memset(line, 0, scr.ww * sizeof(*bg)); over_line(bg, fg, n); } } else { over_line(bg, fg, n); } active[wi].yy++; } } } /* simple test demo */ int main(int argc, char **argv) { enum { width = 828, height = 512, ball_rad = 150 }; xshmu win = xshmu_open("Wercam", width, height, ""); pixel *bg_pixels = malloc(width * height * sizeof(pixel)); if (!bg_pixels) abort(); for (int ii = 0; ii < width * height; ii++) { int xx = ii % width, yy = ii / width; pixel p = { .rr = xx * yy, .gg = xx*xx + yy*yy, .bb = xx + yy, .aa = 255 }; bg_pixels[ii] = p; } pixel *fg_pixels = malloc(ball_rad*ball_rad*4 * sizeof(pixel)); if (!fg_pixels) abort(); int ball_rad_sq = ball_rad*ball_rad; /* int ball_rad_m1_sq = ball_rad_sq - 2*ball_rad + 1; */ for (int yy = 0; yy < ball_rad*2; yy++) { int dy = yy - ball_rad, dysq = dy*dy; pixel *line = &fg_pixels[ball_rad*2 * yy]; for (int xx = 0; xx < ball_rad*2; xx++) { int dx = xx - ball_rad, dxsq = dx*dx; int rsq = dxsq + dysq; if (rsq > ball_rad_sq) { pixel p = { 0, 0, 0, 0 }; line[xx] = p; } else { int brill = rsq / 10; int trans = 255 * (xx + yy) / (ball_rad*4); pixel p = { trans*brill >> 11, trans*brill >> 12, trans*brill >> 12, trans }; line[xx] = p; } } } int vx = 16, vy = -128; window windows[] = { { .contents = bg_pixels, .xx = 0, .yy = 0, .ww = width, .hh = height }, { .contents = fg_pixels, .ww = ball_rad*2, .hh = ball_rad*2 }, }; screen sc = { .windows = windows, .ww = width, .hh = height, .n_windows = 2 }; int frames = 0; for (;; frames++) { for (xshmu_event *ev; (ev = xshmu_get_event(win));) { if (xshmu_as_die_event(ev)) { printf("%d frames\n", frames); xshmu_close(win); return 0; } } windows[1].xx += vx >> 4; windows[1].yy += vy >> 4; vy += 1; vy -= vy >> 8; if (windows[1].xx > width - ball_rad || windows[1].xx < -ball_rad) { vx = -vx; windows[1].xx += vx >> 4; } if (windows[1].yy > height - ball_rad || windows[1].yy < -ball_rad) { vy = -vy; windows[1].yy += vy >> 4; } composite_screen(xshmu_framebuffer(win), sc); xshmu_flush(win); } }