#include <stdio.h>

// Bresenham’s line algorithm with clipping, size-optimized as far as
// I could.  d here is the doubled, negated y distance of the ideal
// line from the pixel center?  Scaled by some factor?  I forget.

void
octant_1(int x, int y, int x1, int y1, char *canvas, int w, int h)
{
  int dy = y1 - y,
    dx = x1 - x,
    d = dx;

  do {
    d -= 2*dy;

    // https://stackoverflow.com/a/2711560 in C++ and
    // https://stackoverflow.com/a/50632 in C explains that this
    // conversion is safe; GCC doesn’t optimize the straightforward
    // code that contains four comparisons to it.
    if ((unsigned)x < (unsigned)w && (unsigned)y < (unsigned)h) {
      canvas[y*w + x] = 1;
    }
    
    if (d < 0) {
      y++;
      d += 2*dx;
    }

    x++;
  } while (x <= x1);
}

// That’s 61 bytes with GCC 12.2.0 with -Os.  Here’s my attempted
// disassembly:
//
// 0000000000000000 <octant_1>:
//    0:	41 89 d2             	mov    %edx,%r10d              copy x1 to r10d so we don’t clobber it
//    3:	29 f1                	sub    %esi,%ecx               compute dy in ecx, clobbering y1 (esi is y)
//    5:	29 fa                	sub    %edi,%edx               compute dx in edx (edi is x)
//    7:	89 f8                	mov    %edi,%eax               move x to eax for no good reason
//    9:	01 c9                	add    %ecx,%ecx               double dy
//    b:	44 8d 1c 12          	lea    (%rdx,%rdx,1),%r11d     compute dx*2 in r11d
//    f:	29 ca                	sub    %ecx,%edx               initialize d to dx-dy???
//   11:	44 39 c8             	cmp    %r9d,%eax               x < w?  eax is x, r9d is w
//   14:	73 16                	jae    2c <octant_1+0x2c>
//   16:	3b 74 24 08          	cmp    0x8(%rsp),%esi          y < h?  esi is y, rsp[8] is h
//   1a:	73 10                	jae    2c <octant_1+0x2c>
//   1c:	89 f7                	mov    %esi,%edi               copy y
//   1e:	41 0f af f9          	imul   %r9d,%edi               multiply by w
//   22:	01 c7                	add    %eax,%edi               add x
//   24:	48 63 ff             	movslq %edi,%rdi               sign-extend
//   27:	41 c6 04 38 01       	movb   $0x1,(%r8,%rdi,1)       canvas[*] = 1   r8 is canvas
//   2c:	85 d2                	test   %edx,%edx               if (d < 0)      edx is d
//   2e:	79 05                	jns    35 <octant_1+0x35>      {
//   30:	ff c6                	inc    %esi                      y++
//   32:	44 01 da             	add    %r11d,%edx                d += 2*dx; }  r11d is 2*dx
//   35:	ff c0                	inc    %eax                    x++
//   37:	41 39 c2             	cmp    %eax,%r10d              while (x <= x1) r10d is x1
//   3a:	7d d3                	jge    f <octant_1+0xf>
//   3c:	c3                   	ret

// In C it’s more compact with conditional expressions, also closer to
// a hardware implementation, but GCC compiles this to 77 bytes instead of 61:
void
octant_1_x(int x, int y, int x1, int y1, char *canvas, int w, int h)
{
  int dy = y1 - y,
    dx = x1 - x,
    d = dx - 2*dy;

  do {
    if ((unsigned)x < (unsigned)w && (unsigned)y < (unsigned)h) {
      canvas[y*w + x] = 1;
    }
    
    y += d < 0;
    d += (d < 0 ? 2*dx - 2*dy : -2*dy);

    x++;
  } while (x <= x1);
}

// Here’s a version that generalizes to a second octant, octant 8.
// It’s already 93 bytes!
void
octant_18(int x, int y, int x1, int y1, char *canvas, int w, int h)
{
  int dy = y1 - y,
    dx = x1 - x,
    d = dx,
    ystep = w,
    pix = y*w + x;

  if (dy < 0) {
    dy = -dy;
    ystep = -ystep;
  }

  do {
    d -= 2*dy;

    if ((unsigned)x < (unsigned)w && (unsigned)pix < (unsigned)(w*h)) {
      canvas[pix] = 1;
    }
    
    if (d < 0) {
      pix += ystep;
      d += 2*dx;
    }

    x++;
    pix++;
  } while (x <= x1);  
}

// And it’s easy enough to generalize that to octants 4 and 5, but
// this adds another 36 bytes, though I think that’s GCC being stupid:
void
octant_1458(int x, int y, int x1, int y1, char *canvas, int w, int h)
{
  if (x <= x1) return octant_18(x, y, x1, y1, canvas, w, h);
  return octant_18(x1, y1, x, y, canvas, w, h);
}

// Here’s a representation of a canvas that can support transposing
// and reversing rows or columns:
typedef struct {
  char *pixels;
  int w, h, xstride, ystride;
} tranvas;

// A version of the original function implemented in terms of that
// comes out to 79 bytes:
void
octrant_1(int x, int y, int x1, int y1, tranvas *t)
{
  int dy = y1 - y,
    dx = x1 - x,
    d = dx;

  do {
    d -= 2*dy;
    if ((unsigned)x < (unsigned)t->w && (unsigned)y < (unsigned)t->h) {
      t->pixels[y * t->ystride + x * t->xstride] = 1;
    }
    
    if (d < 0) {
      y++;
      d += 2*dx;
    }

    x++;
  } while (x <= x1);
}

// A façade that supports the original interface is simple (though
// bloated):
void
tranwrap(int x, int y, int x1, int y1, char *canvas, int w, int h)
{
  tranvas t = { .pixels = canvas, .w = w, .h = h, .xstride = 1, .ystride = w };
  octrant_1(x, y, x1, y1, &t);
}

// But you can easily extend it to all octants by flipping and
// transposing it.  25 bytes:
void
transpose(tranvas *t)
{
  int tmp = t->h;    t->h = t->w;              t->w = tmp;
  tmp = t->ystride;  t->ystride = t->xstride;  t->xstride = tmp;
}

// 19 bytes:
void
xflip(tranvas *t)
{
  int xstride = t->xstride;
  t->pixels += xstride - 1;
  t->xstride = -xstride;
}

// Hmm, okay, well, maybe not *easily*.  You still have to translate
// the points!

// Here’s an attempt to generalize to the whole first quadrant.  It
// doesn’t work right.  Only 80 bytes tho.
void
quadrant_1(int x, int y, int x1, int y1, char *canvas, int w, int h)
{
  int dy = y1 - y,
    dx = x1 - x,
    d = dx;

  do {
    d -= 2*dy;

    while (y <= y1) {
      if ((unsigned)x < (unsigned)w && (unsigned)y < (unsigned)h) {
        canvas[y*w + x] = 1;
      }

      if (d >= 0) break;

      y++;
      d += 2*dx;
    }

    x++;
  } while (x <= x1);
}


// Minimal smoke test.
int
main(int argc, char **argv)
{
  enum {w = 80, h = 24};
  char canvas[w*h] = {0};

  for (int i = -64; i <= 24; i += 8) {
    //quadrant_1(10, i, 90, 20, canvas, w, h);
    octant_1_x(10, i, 90, 20, canvas, w, h);
    //tranwrap(10, i, 90, 20, canvas, w, h);
    //octant_1458(10, i, 90, 20, canvas, w, h);
    //octant_1458(90, i, 10, 20, canvas, w, h);
  }

  for (int line = 0; line < h; line++) {
    for (int col = 0; col < w; col++) {
      printf(canvas[line * w + col] ? "#" : " ");
    }

    printf("\n");
  }

  return 0;
}