#!/usr/bin/python3
# -*- encoding: utf-8 -*-
"""A simple BASIC.

Status: parser successfully parses example program; no interpreter.
Parser output:

    '\n'
    'x=30y=0\n'
    'fori=1to100\n'
    'x=x+y*0.1\n'
    'y=y-x*0.1\n'
    'print"x=",x,"y=",y\n'
    'nexti\n'

Some thoughts:

I had five major bugs writing this.  The biggest one, which it took me
about an hour (out of 2½ total) to figure out, was that my parses were
failing because Rep (Kleene closure) was returning an empty list, which
was being interpreted as parse failure, because I was using Python’s
truthiness rather than an explicit failure indication.  I added the
parse-tracing feature you see below in order to debug this, which
involved adding ``__repr__`` to a bunch of my classes and also adding
the Tag class, though I would have wanted that later anyway to make the
parse tree interpretable.  Using something like attrs or dataclasses
would probably have made this easier.  It’s interesting to think about
what kinds of language features would have either prevented this bug
(no implicit truthiness!) or made it easier to track down.  Also, Seq
was succeeding even if the second item in the sequence failed (so it
would succeed where it shouldn’t, inserting None into the parse tree),
and my memo-table fetching wasn’t advancing the parse position, causing
it to loop infinitely early on.  The other two major bugs were sort of
also only major due to Python’s loosey-goosiness: it took me a while to
track down all the places I’d used ``/`` instead of ``|`` for
alternation (I changed my mind partway through), and I tried to define
Alt with ``def Alt(Parser):`` instead of ``class Alt(Parser):``.

I had some insights in the process.  Python has a lot of whipupitude,
though, as shown by getting this to work in such a short time with a
quasi-reasonable embedded DSL syntax, and some of that comes from its
goosiness.  On the other hand, there were several ways Python lacked
whipupitude for this purpose: the aforementioned lack of “dataclasses”
(ML-style sum-of-products types, which come with default
implementations for equality, pretty-printing, and sometimes
serialization and deserialization), for which I substituted tuples and
lists in some cases; the lack of pattern-matching, evident in
dehydrate, remove, flatten, and xmlish; the impossibility of defining
new infix (or prefix or postfix) operators for operator overloading, as
in Haskell or METAFONT; and not having a usable debugger.

The grammar itself uses the "foo-separated bars" idiom bar (foo bar)*
four times: stmt (":" stmt)*, expr ("," expr)*, factor (addop factor)*,
and unary (mulop unary)*.  There are seven *other* uses of ``~``
(meaning ``*``, Kleene closure) and three uses of unary ``+`` ("one or
more").  It’s worth noting that one or the other of these repetition
constructs is a special case of foo-separated bars, the case where
foo = "".  It might be ergonomically better to provide foo-separated
bars as fundamental, with a spelling like ``bar * foo`` or something,
rather than the other repetition operators.

At first I didn’t have Deferred, so there was no way to make a parser
that (transitively) referred to itself.  The result of this was that
you could only define regular grammars; this implies that you could
parse them with a finite state machine, like Bjoern Hoehrmann’s
“parselov”, which conservatively approximates a context-free grammar
with an FSM.  Despite this, it was sufficient to parse the example
program, including correct operator precedence, though without
associativity.
I finally caved in to my shame and added parenthesized expressions, but
that remains the only recursion in the grammar.  If you were to limit
its stack depth, say to 6 like many old pocket calculators with
parentheses, or to 256 so you could store the parenthesis stack in a
byte, it would still be a regular language.

"""
from __future__ import print_function, division
import pprint

example_program = """
x = 30 : y = 0
for i = 1 to 100
x = x + y * 0.1
y = y - x * 0.1
print "x=", x, "y=", y
next i
"""

# Let’s parse it with a simple Packrat parsing engine:

class Parser:
    def as_parser(self):
        return self

    def __invert__(self):
        "Kleene closure, but greedy: parse zero or more items of this type."
        return Rep(self)

    def __pos__(self):
        "One or more: like Kleene closure but require at least one."
        return self + ~self

    def __or__(self, other):
        "Alternation."
        return Alt(self, as_parser(other))

    def __ror__(self, other):
        return as_parser(other) | self

    def __add__(self, other):
        "Sequence."
        return Seq(self, as_parser(other))

    def __radd__(self, other):
        return as_parser(other) + self

    def __sub__(self, other):
        return Neg(as_parser(other)) + self

    def __rsub__(self, other):
        return as_parser(other) - self


class Rep(Parser):
    "Repeats a child zero or more times."
    def __init__(self, kid):
        self.kid = kid

    def parse(self, stream):
        results = []
        while True:
            savepoint = stream.tell()
            new_result = stream.invoke(self.kid)
            if new_result is None:
                savepoint.seek()
                return results
            savepoint.nuke()
            results.append(new_result)


class Alt(Parser):
    "Succeeds if either child succeeds."
    def __init__(self, a, b):
        self.kids = a, b

    def parse(self, stream):
        savepoint = stream.tell()
        a = stream.invoke(self.kids[0])
        if a is not None:
            savepoint.nuke()
            return a
        savepoint.seek()
        return stream.invoke(self.kids[1])


class Seq(Parser):
    "Succeeds if both children succeed, one after the other."
    def __init__(self, a, b):
        self.kids = a, b

    def parse(self, stream):
        a = stream.invoke(self.kids[0])
        if a is None:
            return a
        b = stream.invoke(self.kids[1])
        if b is None:
            return b
        return [a, b]


class Neg(Parser):
    "Succeeds only if kid does not; never consumes input."
    def __init__(self, kid):
        self.kid = kid

    def parse(self, stream):
        savepoint = stream.tell()
        kid_result = stream.invoke(self.kid)
        savepoint.seek()
        return '' if kid_result is None else None


class Lit(Parser):
    "Succeeds if we're looking at a given literal string."
    def __init__(self, s):
        self.s = s

    def parse(self, stream):
        n = len(self.s)
        if stream.peek(n) == self.s:
            stream.advance(n)
            return self.s
        return None

    def __repr__(self):
        return 'Lit(%r)' % self.s


class Tag(Parser):
    "Tags a piece of the parse tree as meaning something."
    def __init__(self, tag, kid):
        self.tag = tag
        self.kid = kid

    def parse(self, stream):
        v = stream.invoke(self.kid)
        if v is None:
            return v
        return (self.tag, v)

    def __repr__(self):
        return '{%s}' % self.tag


class Deferred(Parser):
    "Permit circular references."
    def __init__(self):
        self.kid = None

    def resolve(self, kid):
        self.kid = kid

    def parse(self, stream):
        return stream.invoke(self.kid)
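
# The docstring above describes the biggest bug: treating Rep’s empty-list
# result as failure, because an empty list is falsy.  A small sketch of the
# pitfall (this helper is illustrative only; nothing below calls it): only an
# explicit ``is None`` test distinguishes "matched zero items" from "failed
# to match".
def _demo_rep_truthiness():
    stream = Stream('x')                    # Stream is defined further below
    result = stream.invoke(Rep(Lit('y')))   # zero 'y's: a successful, empty parse
    assert result == [] and result is not None   # success is any non-None value
    assert not result                       # ...yet falsy, hence the original bug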

class Range(Parser):
    "Succeeds if the next character is in a given range."
    def __init__(self, minc, maxc):
        self.minc = minc
        self.maxc = maxc

    def __repr__(self):
        return 'Range(%r, %r)' % (self.minc, self.maxc)

    def parse(self, stream):
        c = stream.peek(1)
        if self.minc <= c <= self.maxc:
            stream.advance(1)
            return c
        return None


def as_parser(item):
    if hasattr(item, 'as_parser'):
        return item.as_parser()
    else:
        return Lit(item)


class Stream:
    "Store the Packrat parse state, thus keeping it separate from the grammar."
    def __init__(self, s):
        self.s = s
        self.pos = 0
        self.memos = {}
        self.savepoints = set()
        self.tracing = False

    def invoke(self, parser):
        k = (self.pos, parser)
        if k in self.memos:
            v, self.pos = self.memos[k]
            self.trace('found', parser, v)
            return v
        self.trace('trying', parser)
        v = parser.parse(self)
        self.memos[k] = v, self.pos
        if v is not None:
            self.trace('parsed', v)
        return v

    def tell(self):
        return Savepoint(self, self.pos)

    def seek(self, pos):
        self.pos = pos
        self.trace('<<')

    def peek(self, n):
        return self.s[self.pos : self.pos + n]

    def advance(self, n):
        npos = self.pos + n
        assert npos <= len(self.s)
        self.pos = npos
        self.trace('>>')

    def trace(self, *things):
        if not self.tracing:
            return
        ctx = (' '.join(repr(c)[1:-1] for c in self.s[self.pos-10:self.pos])
               + '|' +
               ' '.join(repr(c)[1:-1] for c in self.s[self.pos:self.pos+10]))
        print(ctx, *things)

    def add_savepoint(self, savepoint):
        self.savepoints.add(savepoint)

    def drop_savepoint(self, savepoint):
        self.savepoints.remove(savepoint)


class Savepoint:
    "Mostly syntactic sugar for backtracking."
    def __init__(self, stream, pos):
        self.stream = stream
        self.pos = pos
        stream.add_savepoint(self)

    def seek(self):
        self.stream.seek(self.pos)
        self.nuke()

    def nuke(self):
        self.stream.drop_savepoint(self)


# Some AST manipulation routines:

def dehydrate(parse_tree):
    "Generate the original text parsed, chunk by chunk."
    if isinstance(parse_tree, tuple):       # Tag output
        yield from dehydrate(parse_tree[1])
    elif isinstance(parse_tree, list):      # Seq or Rep output
        for item in parse_tree:
            yield from dehydrate(item)
    elif isinstance(parse_tree, str):       # Lit or Range output
        yield parse_tree
    else:
        raise ValueError(parse_tree)


def remove(tags, parse_tree):
    "Make a copy of the tree without tags in set tags or their contents."
    if isinstance(parse_tree, tuple):
        if parse_tree[0] in tags:
            return None
        else:
            result = remove(tags, parse_tree[1])
            if result is None:
                return None
            else:
                return (parse_tree[0], result)
    if isinstance(parse_tree, list):
        return [result for item in parse_tree
                for result in [remove(tags, item)]
                if result is not None]
    return parse_tree


def flatten(tags, parse_tree):
    "Make a copy of the tree without tags in set tags, but not their contents."
    if isinstance(parse_tree, tuple):
        tag, content = parse_tree
        if tag in tags:
            yield from flatten(tags, content)
        else:
            yield (tag, list(flatten(tags, content)))
        return
    if isinstance(parse_tree, list):
        for item in parse_tree:
            yield from flatten(tags, item)
        return
    yield parse_tree


def xmlish(parse_tree):
    "Generate a text sequence representing parse_tree with XML-like tags."
    if isinstance(parse_tree, tuple):
        tag, content = parse_tree
        yield '<%s>' % tag
        yield from xmlish(content)
        yield '</%s>' % tag
        return
    elif isinstance(parse_tree, list):
        for item in parse_tree:
            yield from xmlish(item)
        return
    yield parse_tree
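
# The docstring suggests that the "foo-separated bars" idiom bar (foo bar)*,
# which the grammar below spells out by hand four times, might deserve its own
# operator such as ``bar * foo``.  A minimal sketch of that idea as a plain
# helper built from the existing combinators (the name ``separated`` is made
# up here, and the grammar below does not use it):

def separated(bar, foo):
    "One or more bars separated by foos: bar (foo bar)*."
    bar = as_parser(bar)
    return bar + ~(as_parser(foo) + bar)

# With it, the expr ("," expr)* part of print_stmt below could be written as
# separated(expr, _(',')).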

# Now, the grammar for the mini-BASIC itself.

def _(parser):
    "Drop trailing whitespace."
    return as_parser(parser) + ~wsp

wsp = Tag('wsp', Lit(' '))
nl = Lit('\n')
letter = Range('a', 'z') | '_' | Range('A', 'Z') | '$'
digit = Range('0', '9')
character = Range('\0', u'\uffff')
string_literal = Tag('string', '"' + ~(character - '\n' - '"' | '""') + ('"' | nl))  # XXX nl doesn’t work
const = Tag('const', _('.' + +digit | +digit + '.' + ~digit | +digit | string_literal))
var = Tag('var', _(letter + ~(letter | digit)))

expr = Deferred()
atomic = var | const | _('(') + expr + _(')')
unop = Tag('unop', _('+') | _('-'))
unary = ~unop + atomic
mulop = Tag('binop', _('*') | _('/'))
factor = unary + ~(Tag('multiplicative', mulop + unary))
addop = Tag('binop', _('-') | _('+'))
expr.resolve(factor + ~(Tag('additive', addop + factor)))

let_stmt = Tag('let', var + _("=") + expr)
for_stmt = Tag('for', _('for') + var + _('=') + Tag('init', expr) +
               _('to') + Tag('limit', expr))
next_stmt = Tag('next', _('next') + var)
print_stmt = Tag('print', _('print') + expr + ~(_(',') + expr))
blank_line = Tag('blank', Lit(''))
stmt = Tag('stmt', let_stmt | for_stmt | next_stmt | print_stmt | blank_line)
line = Tag('line', Tag('leadingwsp', ~wsp) + stmt +
           ~(Tag('colon', _(':')) + stmt) + nl)
program = Tag('program', ~line)


def parse_dumbasic(s):
    stream = Stream(s)
    parse_tree = stream.invoke(program)
    simplified = list(flatten({'line', 'program', 'stmt'},
                              remove({'blank', 'leadingwsp', 'wsp', 'colon'},
                                     parse_tree)))
    return (parse_tree,
            ''.join(dehydrate(parse_tree)),
            simplified,
            ''.join(xmlish(simplified)),
            stream.savepoints,   # all savepoints ought to have been properly nuked
            len(stream.memos))


if __name__ == '__main__':
    pprint.pprint(parse_dumbasic(example_program))
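
# The docstring mentions the parse-tracing feature added for debugging; it is
# driven by the ``tracing`` flag on Stream.  A small sketch of turning it on
# (the helper name here is made up, and nothing above calls it):

def trace_dumbasic(s=example_program):
    "Parse s with the per-invocation trace printed by Stream.trace."
    stream = Stream(s)
    stream.tracing = True
    return stream.invoke(program)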