/* Grampian: a sketch of “immediate-mode parsing”, which may or may
 * not be garden-variety recursive descent with PEG-like ordered
 * choice.
 *
 * This example application is just an integer calculator, but what
 * I’m exploring is how far I can go embedding a domain-specific
 * parsing language in an imperative language such as C or assembly.
 *
 * The basic idea here is to represent each nonterminal of a
 * recursive-descent grammar as an imperative function whose sequence
 * of calls to an underlying parsing layer defines a possible
 * expansion of that nonterminal and acquires input data, and the
 * thing I want to explore is to what extent I can repurpose a grammar
 * expressed in this way: to generate random strings that match it,
 * for example, or to report what was expected at which point in a
 * farce pailure.
 *
 * Why would anyone want such a thing?  Well, generally when you use a
 * grammar in a program, that grammar is interconnected with some kind
 * of data structure traversal or other computation.  Maybe you’re
 * drawing a GUI with an immediate-mode GUI system like dear imgui, or
 * maybe you’re parsing an expression into an AST, or maybe you’re
 * doing a computation, as in the example below.  If you specify the
 * grammar in one place and the traversal somewhere else, you must
 * edit them both every time you add a new construct or remove an old
 * one; worse, with some tools, you won’t even get a compile-time
 * error if the two definitions are out of sync.  At least for
 * experimentation and whipupitude, it’s more appealing to define the
 * grammar and the computation in one place.  (For reusability it
 * might be more desirable to define a single grammar that is shared
 * between many computations.)
 *
 * There are basically three ways you can do this.  First, you can
 * write a parser by brute force in whatever language you’re using,
 * interspersing it with the traversal.  Second, you can use an
 * external domain-specific language, like yacc, where you write a
 * grammar in a separate language for grammars, and specify the
 * traversal as “semantic actions” to take for each production,
 * usually in the form of little snippets of code in C or whatever
 * other language you’re using.  Third, you can use an *internal* or
 * “embedded” domain-specific language, in which you coerce your
 * implementation language to look like yacc or whatever, by
 * overloading operators and defining doesNotUnderstand: methods and
 * metaclasses and whatnot.
 *
 * But what if your implementation language is something dumb and
 * low-level, like assembly or C?  You have no metaclasses or operator
 * overriding or anything.  How far can you go to defining an embedded
 * DSL for grammars with just sequencing of function calls and the
 * usual kinds of control flow?
 *
 * I think you can go pretty far, so I hacked this example program
 * together tonight to see.
 *
 * I’m not sure what I have here is the right way to do this.
 *
 * An aspect not yet explored in this sketch is what it looks like to
 * backtrack a memory allocation.  I think a simple arena
 * pointer-bumping allocator should be adequate.
 *
 * https://commons.wikimedia.org/wiki/File:Beinn_a_ghlo_012.jpg
 * by Stuart Westwater, Ⓒ 2007, CC-BY-SA
 *
 * https://commons.wikimedia.org/wiki/File:Grampian_Mountains01.jpg
 * public domain
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/* Parsing actions */

const char *input;
const char *hwm;                /* high-water mark of parsing */
struct {
  const char *pos;              /* next byte to consume */
  char ok;                      /* true if parsing is succeeding */
  char kid_ok;                  /* true if a child parse has succeeded */
} parsestack[128], *tos;


/* The two most fundamental parsing actions are byte(), which consumes a
 * byte and returns it (or EOF), and fail(), which sets the parsing
 * state to failed.  The position of a failed parsing state is
 * considered to be irrelevant, but reading from it is still
 * permissible — it’s up to the parser to ensure that it doesn’t have
 * any unwanted effects.
 */

int byte()
{
  if (!tos->ok) return 0xdeadc0de;
  if (hwm < tos->pos) hwm = tos->pos;
  if (*tos->pos) return *tos->pos++;
  return EOF;
}

void fail()
{
  tos->ok = 0;
}

/* To invoke those, first we must start parsing a string with focus(). */
void focus(const char *new_input)
{
  tos = parsestack;
  tos->pos = input = hwm = new_input;
  tos->ok = 1;
}

/* To support backtracking, we have a begin()/end() pair that pushes
 * and pops the parse state stack, an or() function that separates
 * backtracking alternatives, and a save() function that limits
 * backtracking if we are currently in a succeeding parse.
 *
 * XXX it would be kind of nice to store these parse states in the
 * stack frames of the functions that create them, wouldn’t it?
 * They’d need a parent pointer, but then the arbitrary limit above of
 * 128 would go away, and locality might improve.
 */

void begin()
{
  tos->kid_ok = 0;
  tos++;
  if ((tos - parsestack) * sizeof *tos >= sizeof parsestack) abort();
  tos[0] = tos[-1];
}

/* save() separates a part of an alternative that has already
 * succeeded from a further part that may or may not succeed.  After
 * save(), the parent begin()/end() will succeed with either what was
 * save()d or what came after it.  If save() is called while failing,
 * it has no effect.
 *
 * As it happens, the other alternative-terminating functions below,
 * end() and or(), also invoke save(), so anything that needs to be
 * done whenever an alternative is terminated can be done here.
 */
void save()
{
  if (tos - parsestack < 1) abort();
  if (!tos->ok) return;
  tos[-1].kid_ok = 1;
  tos[-1].pos = tos->pos;
}

void end()
{
  /* If the last alternative was succeeding, we want to treat that the
     same as if some previous alternative had succeeded. */
  save();
  tos--;                        /* pop subparse off stack */
  /* Now, we only want to succeed if some subparse had succeeded: */
  tos->ok = tos->ok && tos->kid_ok; /* XXX is !tos->ok && tos->kid_ok possible? */
}

/* ok() returns true if the current parsing state is succeeding, which
 * it initially will be, and will thus remain until and unless fail()
 * is called.  Since Grampian doesn’t use nonlocal control flow like
 * longjmp() to jump out of failed parses, rather allowing them to
 * continue (leaving it to the client parser to neutralize any
 * unwanted effects).  So calling ok() is the only way to take a
 * different action if succeeding or failing.
 */
int ok()
{
  return tos->ok;
}

/* or() after a failing state simply backtracks to the state at the
 * nearest begin().  Once or() is called in a succeeding state, all
 * future alternatives until end() will be failing, but the parent
 * begin()/end() will succeed with the alternative that succeeded.
 */
void or()
{
  save();
  tos->pos = tos[-1].pos;
  tos->ok = !tos[-1].kid_ok;
}

/* commit() is a convenience function which returns what ok() would
 * have returned, and additionally, if ok() was true, it invokes
 * end().  This is because a very common reason to invoke ok() is to
 * see if an alternative succeeded in order to return its results, in
 * which case you must call end().  This avoids numerous conditional
 * calls to end() inside nonterminal functions.
 */
int commit()
{
  if (!ok()) return 0;
  end();
  return 1;
}

/* where() returns the furthest to the right position that the parse
 * has successfully reached; this is what you usually want for
 * reporting parse errors.  Experience shows that this is somewhat
 * imperfect.
 */
int where()
{
  return hwm - input;
}

/* I need to be able to tell how much of the string was actually
 * consumed in order to detect things that were consumed and then
 * backtracked.  This function returns true if the whole input string
 * has been parsed successfully with no leftover parse states on the
 * stack.
 */
int finished()
{
  return ok() && tos == parsestack && !*tos->pos;
}

/* For errors I think I also want character ranges, literal strings,
 * and literal characters, rather than explicitly calling fail().
 * That way I can report what things were attempted at the high-water
 * mark.  XXX
 */


/* Infix expression grammar */

int expr();       /* forward declaration to permit mutual recursion */

void whitespace()
{
  begin();
  while (ok()) {
    save();                     /* accept even zero whitespace */
    int c = byte();
    if (c != ' ' && c != '\t' && c != '\n') fail();
  }
  end();
}

void op(const char *p)
{
  whitespace();
  while (*p && ok()) {
    if (*p++ != byte()) fail();
  }
}

int atom()
{
  whitespace();
  begin();
  {
    int x = 0;

    begin();
    for (;;) {
      int c = byte();
      if (c < '0' || c > '9') {
        fail();                   /* unread that digit */
        break;
      }
      save(); /* *after* reading, thus requiring at least one digit */
      x = x * 10 + c - '0';
    }
    end();
    if (commit()) return x;
  }
  or();
  {
    op("(");
    if (ok()) {                 /* prevent infinite recursion */
      int x = expr();
      op(")");
      if (commit()) return x;
    }
  }
  end();

  return 0xfeedbeef;
}

int unary()
{
  begin();
  {
    op("-");
    int x = atom();
    if (commit()) return -x;
  }
  or();

  int x = atom();
  end();

  return x;
}

int term()
{
  int x = unary();

  begin();
  while (ok()) {
    save();      /* zero or more multipliers, divisors, and modulos */
    begin();
    {
      op("*");
      int y = unary();
      if (ok()) x *= y;
    }
    or();
    {
      op("/");
      int y = unary();
      if (ok()) x /= y;
    }
    or();
    {
      op("%");
      int y = unary();
      if (ok()) x %= y;
    }
    end();
  }
  end();

  return x;
}

int expr()
{
  int x = term();

  begin();
  while (ok()) {
    save();                     /* zero or more summands */
    begin();
    {
      op("-");
      int y = term();
      if (ok()) x -= y;
    }
    or();
    {
      op("+");
      int y = term();
      if (ok()) x += y;
    }
    end();
  }
  end();

  whitespace(); /* eat trailing whitespace since this is the top level */

  return x;
}

int usage(const char *argv0)
{
  fprintf(stderr, "Usage: %s 3+4*-5\n"
          "where 3+4*-5 is an infix integer expression\n",
          argv0);
  return -1;
}

int main(int argc, char **argv)
{
  if (argc != 2) return usage(argv[0]);
  focus(argv[1]);
  int x = expr();

  if (finished()) {
    printf("%d\n", x);
    return 0;
  } else {
    /* +2 because ↑ is three bytes in UTF-8 */
    fprintf(stderr, "parse failure:\n\t%s\n\t%*s\n", argv[1], where()+2, "↑");
    return 1;
  }
}