/* Grampian: a sketch of “immediate-mode parsing”, which may or may * not be garden-variety recursive descent with PEG-like ordered * choice. * * This example application is just an integer calculator, but what * I’m exploring is how far I can go embedding a domain-specific * parsing language in an imperative language such as C or assembly. * * The basic idea here is to represent each nonterminal of a * recursive-descent grammar as an imperative function whose sequence * of calls to an underlying parsing layer defines a possible * expansion of that nonterminal and acquires input data, and the * thing I want to explore is to what extent I can repurpose a grammar * expressed in this way: to generate random strings that match it, * for example, or to report what was expected at which point in a * farce pailure. * * Why would anyone want such a thing? Well, generally when you use a * grammar in a program, that grammar is interconnected with some kind * of data structure traversal or other computation. Maybe you’re * drawing a GUI with an immediate-mode GUI system like dear imgui, or * maybe you’re parsing an expression into an AST, or maybe you’re * doing a computation, as in the example below. If you specify the * grammar in one place and the traversal somewhere else, you must * edit them both every time you add a new construct or remove an old * one; worse, with some tools, you won’t even get a compile-time * error if the two definitions are out of sync. At least for * experimentation and whipupitude, it’s more appealing to define the * grammar and the computation in one place. (For reusability it * might be more desirable to define a single grammar that is shared * between many computations.) * * There are basically three ways you can do this. First, you can * write a parser by brute force in whatever language you’re using, * interspersing it with the traversal. Second, you can use an * external domain-specific language, like yacc, where you write a * grammar in a separate language for grammars, and specify the * traversal as “semantic actions” to take for each production, * usually in the form of little snippets of code in C or whatever * other language you’re using. Third, you can use an *internal* or * “embedded” domain-specific language, in which you coerce your * implementation language to look like yacc or whatever, by * overloading operators and defining doesNotUnderstand: methods and * metaclasses and whatnot. * * But what if your implementation language is something dumb and * low-level, like assembly or C? You have no metaclasses or operator * overriding or anything. How far can you go to defining an embedded * DSL for grammars with just sequencing of function calls and the * usual kinds of control flow? * * I think you can go pretty far, so I hacked this example program * together tonight to see. * * I’m not sure what I have here is the right way to do this. * * An aspect not yet explored in this sketch is what it looks like to * backtrack a memory allocation. I think a simple arena * pointer-bumping allocator should be adequate. * * https://commons.wikimedia.org/wiki/File:Beinn_a_ghlo_012.jpg * by Stuart Westwater, Ⓒ 2007, CC-BY-SA * * https://commons.wikimedia.org/wiki/File:Grampian_Mountains01.jpg * public domain */ #include #include #include /* Parsing actions */ const char *input; const char *hwm; /* high-water mark of parsing */ struct { const char *pos; /* next byte to consume */ char ok; /* true if parsing is succeeding */ char kid_ok; /* true if a child parse has succeeded */ } parsestack[128], *tos; /* The two most fundamental parsing actions are byte(), which consumes a * byte and returns it (or EOF), and fail(), which sets the parsing * state to failed. The position of a failed parsing state is * considered to be irrelevant, but reading from it is still * permissible — it’s up to the parser to ensure that it doesn’t have * any unwanted effects. */ int byte() { if (!tos->ok) return 0xdeadc0de; if (hwm < tos->pos) hwm = tos->pos; if (*tos->pos) return *tos->pos++; return EOF; } void fail() { tos->ok = 0; } /* To invoke those, first we must start parsing a string with focus(). */ void focus(const char *new_input) { tos = parsestack; tos->pos = input = hwm = new_input; tos->ok = 1; } /* To support backtracking, we have a begin()/end() pair that pushes * and pops the parse state stack, an or() function that separates * backtracking alternatives, and a save() function that limits * backtracking if we are currently in a succeeding parse. * * XXX it would be kind of nice to store these parse states in the * stack frames of the functions that create them, wouldn’t it? * They’d need a parent pointer, but then the arbitrary limit above of * 128 would go away, and locality might improve. */ void begin() { tos->kid_ok = 0; tos++; if ((tos - parsestack) * sizeof *tos >= sizeof parsestack) abort(); tos[0] = tos[-1]; } /* save() separates a part of an alternative that has already * succeeded from a further part that may or may not succeed. After * save(), the parent begin()/end() will succeed with either what was * save()d or what came after it. If save() is called while failing, * it has no effect. * * As it happens, the other alternative-terminating functions below, * end() and or(), also invoke save(), so anything that needs to be * done whenever an alternative is terminated can be done here. */ void save() { if (tos - parsestack < 1) abort(); if (!tos->ok) return; tos[-1].kid_ok = 1; tos[-1].pos = tos->pos; } void end() { /* If the last alternative was succeeding, we want to treat that the same as if some previous alternative had succeeded. */ save(); tos--; /* pop subparse off stack */ /* Now, we only want to succeed if some subparse had succeeded: */ tos->ok = tos->ok && tos->kid_ok; /* XXX is !tos->ok && tos->kid_ok possible? */ } /* ok() returns true if the current parsing state is succeeding, which * it initially will be, and will thus remain until and unless fail() * is called. Since Grampian doesn’t use nonlocal control flow like * longjmp() to jump out of failed parses, rather allowing them to * continue (leaving it to the client parser to neutralize any * unwanted effects). So calling ok() is the only way to take a * different action if succeeding or failing. */ int ok() { return tos->ok; } /* or() after a failing state simply backtracks to the state at the * nearest begin(). Once or() is called in a succeeding state, all * future alternatives until end() will be failing, but the parent * begin()/end() will succeed with the alternative that succeeded. */ void or() { save(); tos->pos = tos[-1].pos; tos->ok = !tos[-1].kid_ok; } /* commit() is a convenience function which returns what ok() would * have returned, and additionally, if ok() was true, it invokes * end(). This is because a very common reason to invoke ok() is to * see if an alternative succeeded in order to return its results, in * which case you must call end(). This avoids numerous conditional * calls to end() inside nonterminal functions. */ int commit() { if (!ok()) return 0; end(); return 1; } /* where() returns the furthest to the right position that the parse * has successfully reached; this is what you usually want for * reporting parse errors. Experience shows that this is somewhat * imperfect. */ int where() { return hwm - input; } /* I need to be able to tell how much of the string was actually * consumed in order to detect things that were consumed and then * backtracked. This function returns true if the whole input string * has been parsed successfully with no leftover parse states on the * stack. */ int finished() { return ok() && tos == parsestack && !*tos->pos; } /* For errors I think I also want character ranges, literal strings, * and literal characters, rather than explicitly calling fail(). * That way I can report what things were attempted at the high-water * mark. XXX */ /* Infix expression grammar */ int expr(); /* forward declaration to permit mutual recursion */ void whitespace() { begin(); while (ok()) { save(); /* accept even zero whitespace */ int c = byte(); if (c != ' ' && c != '\t' && c != '\n') fail(); } end(); } void op(const char *p) { whitespace(); while (*p && ok()) { if (*p++ != byte()) fail(); } } int atom() { whitespace(); begin(); { int x = 0; begin(); for (;;) { int c = byte(); if (c < '0' || c > '9') { fail(); /* unread that digit */ break; } save(); /* *after* reading, thus requiring at least one digit */ x = x * 10 + c - '0'; } end(); if (commit()) return x; } or(); { op("("); if (ok()) { /* prevent infinite recursion */ int x = expr(); op(")"); if (commit()) return x; } } end(); return 0xfeedbeef; } int unary() { begin(); { op("-"); int x = atom(); if (commit()) return -x; } or(); int x = atom(); end(); return x; } int term() { int x = unary(); begin(); while (ok()) { save(); /* zero or more multipliers, divisors, and modulos */ begin(); { op("*"); int y = unary(); if (ok()) x *= y; } or(); { op("/"); int y = unary(); if (ok()) x /= y; } or(); { op("%"); int y = unary(); if (ok()) x %= y; } end(); } end(); return x; } int expr() { int x = term(); begin(); while (ok()) { save(); /* zero or more summands */ begin(); { op("-"); int y = term(); if (ok()) x -= y; } or(); { op("+"); int y = term(); if (ok()) x += y; } end(); } end(); whitespace(); /* eat trailing whitespace since this is the top level */ return x; } int usage(const char *argv0) { fprintf(stderr, "Usage: %s 3+4*-5\n" "where 3+4*-5 is an infix integer expression\n", argv0); return -1; } int main(int argc, char **argv) { if (argc != 2) return usage(argv[0]); focus(argv[1]); int x = expr(); if (finished()) { printf("%d\n", x); return 0; } else { /* +2 because ↑ is three bytes in UTF-8 */ fprintf(stderr, "parse failure:\n\t%s\n\t%*s\n", argv[1], where()+2, "↑"); return 1; } }