#!/usr/bin/python3
# -*- encoding: utf-8 -*-
"""A simple BASIC.
Status: parser successfully parses example program; no interpreter.
Parser output:
'\n'
'x=30y=0\n'
'fori=1to100\n'
'x=x+y*0.1\n'
'y=y-x*0.1\n'
'print"x=",x,"y=",y\n'
'nexti\n',
Some thoughts:
I had five major bugs writing this.
The biggest one, which it took me about an hour (out of 2½ total) to
figure out, was that my parses were failing because Rep (Kleene
closure) was returning an empty list, which was being interpreted as
parse failure, because I was using Python’s truthiness rather than an
explicit failure indication. I added the parse-tracing feature you
see below in order to debug this, which involved adding ``__repr__``
to a bunch of my classes and also adding the Tag class, though I would
have wanted that later anyway to make the parse tree interpretable.
Using something like attrs or dataclasses would probably have made
this easier. It’s interesting to think about what kinds of language
features would have either prevented this bug (no implicit
truthiness!) or made it easier to track down.
Also, Seq was succeeding even if the second item in the sequence
failed (so it would succeed where it shouldn’t, inserting None into
the parse tree), and my memo-table fetching wasn’t advancing the parse
position, causing it to loop infinitely early on.
The other two major bugs were sort of also only major due to Python’s
loosey-goosiness: it took me a while to track down all the places I’d
used ``/`` instead of ``|`` for alternation (I changed my mind partway
through), and I tried to define Alt with ``def Alt(Parser):`` instead
of ``class Alt(Parser):``.
I had some insights in the process.
Python has a lot of whipupitude, though, as shown by getting this to
work in such a short time with a quasi-reasonable embedded DSL syntax,
and some of that comes from its goosiness. On the other hand, there
were several ways Python lacked whipupitude for this purpose: the
aforementioned lack of “dataclasses” (ML-style sum-of-products types,
which come with default implementations for equality, pretty-printing,
and sometimes serialization and deserialization) for which I
substituted tuples and lists in some cases; the lack of
pattern-matching evident in dehydrate, remove, flatten, and xmlish;
the impossibility of defining new infix (or prefix or postfix)
operators for operator overloading, as in Haskell or METAFONT; and not
having a usable debugger.
The grammar itself uses the "foo-separated bars" idiom bar (foo bar)*
four times: stmt (":" stmt)*, expr ("," expr)*, factor (addop
factor)*, and unary (mulop unary)*. There are seven *other* uses of
``~`` (meaning ``*``, Kleene closure) and three uses of unary ``+``
("one or more"). It’s worth noting that one or the other of these
repetition constructs is a special case of foo-separated bars, the
case where foo = "". It might be ergonomically better to provide
foo-separated bars as fundamental, with a spelling like ``bar * foo``
or something, rather than the other repetition operators.
At first I didn’t have Deferred, so there was no way to make a parser
that (transitively) referred to itself. The result of this was that
you could only define regular grammars; this implies that you could
parse them with a finite state machine, like Bjoern Hoehrmann’s
“parselov”, which uses an approximating FSM to conservatively
approximate a context-free grammar. Despite this, it was sufficient
to parse the example program, including correct operator precedence,
though without associativity. I finally caved in to my shame and
added parenthesized expressions, but that remains the only recursion
in the grammar. If you were to limit its stack depth, say to 6 like
many old pocket calculators with parentheses, or to 256 so you could
store the parenthesis stack in a byte, it would still be a regular
language.
"""
from __future__ import print_function, division
import pprint
# The demo input for parse_dumbasic: two colon-separated assignments, a
# for/next loop, and a print statement with a mixed expression list.
example_program = """
x = 30 : y = 0
for i = 1 to 100
x = x + y * 0.1
y = y - x * 0.1
print "x=", x, "y=", y
next i
"""
# Let’s parse it with a simple Packrat parsing engine:
class Parser:
    """Base class giving every combinator the operator-overloading DSL.

    Operators build composite parsers: ``~p`` is greedy Kleene closure,
    ``+p`` is one-or-more, ``a | b`` is ordered alternation, ``a + b`` is
    sequencing, and ``a - b`` means "a, but only where b does not match".
    Bare strings on either side are promoted to Lit via as_parser().
    """
    def as_parser(self):
        "Already a parser, so promotion is the identity."
        return self
    def __invert__(self):
        "Greedy Kleene closure: match this parser zero or more times."
        return Rep(self)
    def __pos__(self):
        "One or more: a mandatory match followed by the Kleene closure."
        return Seq(self, Rep(self))
    def __or__(self, other):
        "Ordered alternation: try self first, then other."
        return Alt(self, as_parser(other))
    def __ror__(self, other):
        "Alternation with a bare string on the left."
        return Alt(as_parser(other), self)
    def __add__(self, other):
        "Sequencing: self followed by other."
        return Seq(self, as_parser(other))
    def __radd__(self, other):
        "Sequencing with a bare string on the left."
        return Seq(as_parser(other), self)
    def __sub__(self, other):
        "Difference: match self only where other would fail."
        return Seq(Neg(as_parser(other)), self)
    def __rsub__(self, other):
        "Difference with a bare string on the left."
        return Seq(Neg(self), as_parser(other))
class Rep(Parser):
    "Greedy Kleene closure: match the child parser zero or more times."
    def __init__(self, kid):
        self.kid = kid
    def parse(self, stream):
        """Collect child results until the child fails.

        Never fails itself: an empty list is a *success*.  (Failure is
        signalled by None only -- relying on truthiness here was a bug.)
        """
        matches = []
        mark = stream.tell()
        item = stream.invoke(self.kid)
        while item is not None:
            mark.nuke()
            matches.append(item)
            mark = stream.tell()
            item = stream.invoke(self.kid)
        mark.seek()
        return matches
class Alt(Parser):
    "Ordered choice: succeed with the first of two children that succeeds."
    def __init__(self, a, b):
        self.kids = a, b
    def parse(self, stream):
        "Try the first child; on failure, rewind and try the second."
        first, second = self.kids
        mark = stream.tell()
        result = stream.invoke(first)
        if result is None:
            mark.seek()
            return stream.invoke(second)
        mark.nuke()
        return result
class Seq(Parser):
    "Sequence: succeed only if both children succeed, one after the other."
    def __init__(self, a, b):
        self.kids = a, b
    def parse(self, stream):
        """Return [first_result, second_result], or None if either fails.

        No backtracking happens here; enclosing combinators hold the
        savepoints.  (An earlier bug let the second child's failure slip
        through, inserting None into the tree.)
        """
        first = stream.invoke(self.kids[0])
        if first is None:
            return None
        second = stream.invoke(self.kids[1])
        if second is None:
            return None
        return [first, second]
class Neg(Parser):
    "Negative lookahead: succeed (matching '') only if the child fails."
    def __init__(self, kid):
        self.kid = kid
    def parse(self, stream):
        "Probe the child, then rewind unconditionally; consumes no input."
        mark = stream.tell()
        failed = stream.invoke(self.kid) is None
        mark.seek()
        return '' if failed else None
class Lit(Parser):
    "Match one exact literal string."
    def __init__(self, s):
        self.s = s
    def parse(self, stream):
        "Succeed with the literal itself when it comes next in the stream."
        want = self.s
        if stream.peek(len(want)) != want:
            return None
        stream.advance(len(want))
        return want
    def __repr__(self):
        return 'Lit(%r)' % self.s
class Tag(Parser):
    "Label a piece of the parse tree: wrap the child's result as (tag, result)."
    def __init__(self, tag, kid):
        self.tag = tag
        self.kid = kid
    def parse(self, stream):
        "Propagate the child's failure; otherwise pair its result with the tag."
        result = stream.invoke(self.kid)
        return None if result is None else (self.tag, result)
    def __repr__(self):
        return '{%s}' % self.tag
class Deferred(Parser):
    """A forward reference, so a grammar rule can (transitively) mention itself.

    Create the placeholder first, use it inside other rules, then call
    resolve() once the real parser can be built.
    """
    def __init__(self):
        self.kid = None  # supplied later by resolve()
    def resolve(self, kid):
        "Install the parser this placeholder stands for."
        self.kid = kid
    def parse(self, stream):
        return stream.invoke(self.kid)
class Range(Parser):
    "Match any single character between minc and maxc, inclusive."
    def __init__(self, minc, maxc):
        self.minc = minc
        self.maxc = maxc
    def __repr__(self):
        return 'Range(%r, %r)' % (self.minc, self.maxc)
    def parse(self, stream):
        "Succeed with the next character if it lies in the range (fails at EOF)."
        ch = stream.peek(1)
        if not (self.minc <= ch <= self.maxc):
            return None
        stream.advance(1)
        return ch
def as_parser(item):
    "Promote item to a parser: parsers pass through, strings become Lit."
    promote = getattr(item, 'as_parser', None)
    return Lit(item) if promote is None else promote()
class Stream:
    """Packrat parse state (text, position, memo table), kept separate
    from the grammar so one grammar can parse many inputs.

    Results are memoized per (position, parser) pair; None is memoized
    too, so failures are never retried either.
    """
    def __init__(self, s):
        self.s = s
        self.pos = 0
        self.memos = {}          # (pos, parser) -> (result, end_pos)
        self.savepoints = set()  # live Savepoints, for leak-checking
        self.tracing = False
    def invoke(self, parser):
        """Run parser at the current position, consulting the memo table.

        On a memo hit the position is advanced to the recorded end
        position (forgetting that advance once caused an infinite loop).
        Returns the parser's result, or None on failure.
        """
        k = (self.pos, parser)
        if k in self.memos:
            v, self.pos = self.memos[k]
            self.trace('found', parser, v)
            return v
        self.trace('trying', parser)
        v = parser.parse(self)
        self.memos[k] = v, self.pos
        if v is not None:
            self.trace('parsed', v)
        return v
    def tell(self):
        "Return a Savepoint remembering the current position."
        return Savepoint(self, self.pos)
    def seek(self, pos):
        "Backtrack (or jump) to pos."
        self.pos = pos
        self.trace('<<')
    def peek(self, n):
        "Return the next n characters (shorter near end of input) without consuming."
        return self.s[self.pos : self.pos + n]
    def advance(self, n):
        "Consume n characters; must not run past end of input."
        npos = self.pos + n
        assert npos <= len(self.s)
        self.pos = npos
        self.trace('>>')
    def trace(self, *things):
        "If tracing, print ten characters of context around pos, then *things."
        if not self.tracing:
            return
        # Bug fix: clamp the left edge at 0.  With pos < 10 the old
        # expression self.s[self.pos-10:self.pos] went negative and
        # wrapped around to the end of the string, showing no context
        # exactly where tracing is most needed (early in the parse).
        left = self.s[max(0, self.pos - 10):self.pos]
        right = self.s[self.pos:self.pos + 10]
        ctx = (' '.join(repr(c)[1:-1] for c in left) + '|' +
               ' '.join(repr(c)[1:-1] for c in right))
        print(ctx, *things)
    def add_savepoint(self, savepoint):
        self.savepoints.add(savepoint)
    def drop_savepoint(self, savepoint):
        self.savepoints.remove(savepoint)
class Savepoint:
    """A remembered Stream position; syntactic sugar for backtracking.

    Construction registers the savepoint with its stream; seek() and
    nuke() both deregister it, so any savepoint still registered after a
    parse points at a bookkeeping bug.
    """
    def __init__(self, stream, pos):
        self.stream = stream
        self.pos = pos
        stream.add_savepoint(self)
    def seek(self):
        "Rewind the stream to the remembered position and deregister."
        self.stream.seek(self.pos)
        self.stream.drop_savepoint(self)
    def nuke(self):
        "Deregister without moving the stream (the parse succeeded)."
        self.stream.drop_savepoint(self)
# Some AST manipulation routines:
def dehydrate(parse_tree):
    """Generate the original parsed text, chunk by chunk.

    Raises ValueError on anything that is not a parse-tree node.
    """
    if isinstance(parse_tree, str):       # Lit or Range output: a leaf chunk
        yield parse_tree
    elif isinstance(parse_tree, tuple):   # Tag output: (tag, contents)
        yield from dehydrate(parse_tree[1])
    elif isinstance(parse_tree, list):    # Seq or Rep output
        for kid in parse_tree:
            yield from dehydrate(kid)
    else:
        raise ValueError(parse_tree)
def remove(tags, parse_tree):
    """Copy parse_tree, dropping every tagged node whose tag is in `tags`,
    contents and all.  Returns None when the whole tree is dropped."""
    if isinstance(parse_tree, tuple):
        tag, content = parse_tree
        if tag in tags:
            return None
        kept = remove(tags, content)
        return None if kept is None else (tag, kept)
    if isinstance(parse_tree, list):
        survivors = []
        for item in parse_tree:
            kept = remove(tags, item)
            if kept is not None:
                survivors.append(kept)
        return survivors
    return parse_tree
def flatten(tags, parse_tree):
    """Copy parse_tree as a flat sequence, splicing out tags in `tags`
    while keeping their contents; other tagged nodes are kept as
    (tag, flattened-contents-list) pairs."""
    if isinstance(parse_tree, list):
        for kid in parse_tree:
            yield from flatten(tags, kid)
    elif isinstance(parse_tree, tuple):
        tag, content = parse_tree
        if tag in tags:
            yield from flatten(tags, content)
        else:
            yield (tag, list(flatten(tags, content)))
    else:
        yield parse_tree
def xmlish(parse_tree):
    """Generate a text sequence rendering parse_tree with XML-like tags.

    Tagged nodes become <tag>...</tag>, lists are rendered in order, and
    leaf strings are yielded as-is.
    """
    if isinstance(parse_tree, tuple):
        tag, content = parse_tree
        yield '<%s>' % tag
        yield from xmlish(content)
        # Bug fix: the closing tag was emitted as '%s>' (no '</'), so a
        # node rendered as '<a>x' followed by 'a>' instead of '</a>'.
        yield '</%s>' % tag
        return
    elif isinstance(parse_tree, list):
        for item in parse_tree:
            yield from xmlish(item)
        return
    yield parse_tree
# Now, the grammar for the mini-BASIC itself.
def _(parser):
    "Wrap parser so it also consumes (tagged, strippable) trailing spaces."
    return as_parser(parser) + ~wsp
# Lexical elements.  Whitespace is tagged so it can be stripped from the
# parse tree afterwards (see parse_dumbasic's remove() call).
wsp = Tag('wsp', Lit(' '))
nl = Lit('\n')
letter = Range('a', 'z') | '_' | Range('A', 'Z') | '$'
digit = Range('0', '9')
character = Range('\0', u'\uffff')
# A string: open quote, then any run of characters that are neither
# newline nor quote (with '""' as an escaped quote), then a close quote
# (or, intended but broken, an unterminated string ending at newline).
string_literal = Tag('string', '"' + ~(character - '\n' - '"' | '""') + ('"' | nl)) # XXX nl doesn’t work
# Numeric constants: .5, 5., 5.0, or 5; a string literal also counts as
# a constant here.
const = Tag('const', _('.' + +digit | +digit + '.' + ~digit | +digit | string_literal))
var = Tag('var', _(letter + ~(letter | digit)))
# Expressions, by precedence: unary signs bind tightest, then * and /
# (factor), then + and - (expr).  expr is Deferred so that parenthesized
# subexpressions can recurse -- the grammar's only recursion.
expr = Deferred()
atomic = var | const | _('(') + expr + _(')')
unop = Tag('unop', _('+') | _('-'))
unary = ~unop + atomic
mulop = Tag('binop', _('*') | _('/'))
factor = unary + ~(Tag('multiplicative', mulop + unary))
addop = Tag('binop', _('-') | _('+'))
expr.resolve(factor + ~(Tag('additive', addop + factor)))
# Statements and program structure.
let_stmt = Tag('let', var + _("=") + expr)
for_stmt = Tag('for', _('for') + var + _('=') + Tag('init', expr) + _('to') + Tag('limit', expr))
next_stmt = Tag('next', _('next') + var)
print_stmt = Tag('print', _('print') + expr + ~(_(',') + expr))
blank_line = Tag('blank', Lit(''))
stmt = Tag('stmt', let_stmt | for_stmt | next_stmt | print_stmt | blank_line)
# A line: optional leading whitespace, then colon-separated statements,
# then a newline; a program is any number of lines.
line = Tag('line', Tag('leadingwsp', ~wsp) + stmt + ~(Tag('colon', _(':')) + stmt) + nl)
program = Tag('program', ~line)
def parse_dumbasic(s):
    """Parse s as mini-BASIC and return a tuple of diagnostics:
    (raw parse tree, reconstructed source text, simplified tree, XML-ish
    rendering of the simplified tree, leftover savepoints (should be
    empty if backtracking bookkeeping is correct), memo-table size)."""
    stream = Stream(s)
    tree = stream.invoke(program)
    pruned = remove({'blank', 'leadingwsp', 'wsp', 'colon'}, tree)
    simplified = list(flatten({'line', 'program', 'stmt'}, pruned))
    rehydrated = ''.join(dehydrate(tree))
    rendered = ''.join(xmlish(simplified))
    return (tree, rehydrated, simplified, rendered,
            stream.savepoints,  # all savepoints ought to have been properly nuked
            len(stream.memos))
if __name__ == '__main__':
    # Demo: parse the example program and pretty-print the diagnostics tuple.
    pprint.pprint(parse_dumbasic(example_program))