#!/usr/bin/python3 # -*- coding: utf-8 -*- """Parsing expression grammars. The wee hours of the morning have come, and I’m full of Mongolian barbecue and Coca-Cola, with no desire to sleep. Last night’s quick hack, μSQL, was successful insofar as it produced a more or less working SQL engine capable of evaluating simple SELECT and INSERT statements, but suffered greatly from having to use regular expressions to parse them. And it turned out that actually there is nothing in the Python standard library to help you write parsers for things like that. So I thought I’d see how much of a parsing engine I could hack together in a couple of hours. It wouldn’t have to be super efficient to have helped a huge amount with the μSQL hack. It wouldn’t have to generate code, for example, or to allow you to attach arbitrary semantic actions — it could just generate a syntax tree interpretively. And it wouldn’t have to include a parser for its own language, as we could use an embedded DSL in Python. This would save us from having to explicitly implement grouping, for example, or operator precedence, and it would allow us to smoothly integrate things like Python regular expressions. I wrote a PEG parser generator once before, called peg-bootstrap. It was written in itself, and it compiled to JS. It supported negation `!`, alternation `/`, terminal strings, and nonterminals, but not repetition `*` or `+`, although that’s very useful in practice. And it supported recursive rules, which is an important consideration for doing this as an embedded DSL in Python — you can’t just use Python variables for your nonterminals. The ideal thing to write in an embedded Python DSL for something like sentence <- _ rule sentence / _ rule. would be something like sentence = _, rule, sentence / _, rule but there are three major problems with making that work. First, the comma and / operators have the wrong relative precedence — comma’s precedence is super low — and second, comma isn't overrideable, so what you get out of it is a tuple, whose operators aren’t overridden. If you want something that binds tighter than /, you need something wild like **. You could improve the situation a bit by using |, which binds looser than everything except boolean and relational operators. So you could hope for something like sentence = _ + rule + sentence | _ + rule But then there’s the third problem, which is that you can’t overload plain name lookup in Python like you can in Ruby. You can overload it on objects, though. Except maybe with metaclasses and __prepare__ we can? Holy hell, we can. holyshitnamespaces.py is the capsule summary, but I'm doing it below, too. The result is that I have spent an hour and a half getting that Python syntactic sugar to work, but now it mostly does, except that sometimes you have to stick in an explicit `Lit` or Python says, “TypeError: Can't convert 'Nonterminal' object to str implicitly”, and you can’t refer to any variables defined elsewhere because the magic namespace intercepts all such attempts. And now I’m kind of sleepy, but I do seem to be generating reasonable parse trees for arithmetic expressions from a PEG. And it’s only been two hours! """ class PE: "Parsing expression." def __add__(self, other): return Sequence(self, peify(other)) def __or__(self, other): return Alternation(self, peify(other)) def __invert__(self): return Negation(self) def peify(obj): return obj if isinstance(obj, PE) else Lit(obj) class Sequence(PE): def __init__(self, a, b): self.items = a, b def __repr__(self): return '({!r}) + ({!r})'.format(*self.items) def parse(self, text, position): a, b = self.items result_a, pos = a.parse(text, position) result_b, pos = b.parse(text, pos) return [result_a, result_b], pos class Alternation(PE): def __init__(self, a, b): self.items = a, b def __repr__(self): return '({!r}) | ({!r})'.format(*self.items) def parse(self, text, position): a, b = self.items try: return a.parse(text, position) except ParseFailure: pass return b.parse(text, position) class Negation(PE): def __init__(self, negated): self.negated = negated def __repr__(self): return '~{!r}'.format(self.negated) def parse(self, text, position): try: val = self.negated.parse(text, position) except ParseFailure: return None, position else: raise ParseFailure(val, position) class Nonterminal(PE): def __init__(self, name): self.name = name # There’s also a .definition attribute which may be populated # later. def __repr__(self): return '${}'.format(self.name) def parse(self, text, position=0): result, pos = self.definition.parse(text, position) return (self.name, result), pos class Lit(PE): def __init__(self, text): self.text = text def __repr__(self): return '#{!r}'.format(self.text) def parse(self, text, position): end = position + len(self.text) if text[position:end] == self.text: return self.text, end raise ParseFailure(self.text, position) class ParseFailure(Exception): pass # Uh-oh, the fact that I have to include Lit here to make it visible # betokens major trouble to come. magic_names = set(['__name__', '__module__', '__qualname__', 'Lit']) class GrammarNamespace(dict): def __getitem__(self, name): if name in magic_names: return dict.__getitem__(self, name) if not dict.__contains__(self, name): val = Nonterminal(name) dict.__setitem__(self, name, val) return val else: return dict.__getitem__(self, name) def __setitem__(self, name, val): if name in magic_names: return dict.__setitem__(self, name, val) if dict.__contains__(self, name): wrapper = dict.__getitem__(self, name) else: wrapper = Nonterminal(name) dict.__setitem__(self, name, wrapper) wrapper.definition = val class GrammarMetaclass(type): @classmethod def __prepare__(cls, name, bases, **kwds): return GrammarNamespace() class Grammar(metaclass=GrammarMetaclass): pass class Example(Grammar): sentence = _ + rule + sentence | _ + rule sp = Lit(' ') | '\n' | '\t' _ = sp + _ | '' rule = name + _ + '<-' + _ + choice + '.' + _ choice = sequence + '/' + _ + choice | sequence sequence = term + sequence | Lit('->') + _ + expr | '' expr = Lit('(') + _ + exprcontents + ')' + _ exprcontents = (~Lit('(') + ~Lit(')') + char | expr) + exprcontents | '' term = (name + _ + ':' + term | Lit('!') + _ + term | string | name + _ | Lit('(') + _ + choice + ')' + _) string = Lit("'") + stringcontents + "'" + _ stringcontents = (~Lit('\\') + ~Lit("'") + char + stringcontents | Lit('\\') + char + stringcontents | '') meta = Lit('!') | "'" | '<-' | '/' | '.' | '(' | ')' | ':' | '->' name = namechar + name | namechar namechar = ~meta + ~sp + char yes = Lit('yes') | 'Yes' no = Lit('no') | 'No' response = (yes | no) + '.' class Arithmetic(Grammar): sp = Lit(' ') | '\n' | '\t' _ = sp + _ | '' digit = Lit('0') | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' digits = digit + digits | digit number = (digits + '.' + digits | digits + '.' | Lit('.') + digits | digits) + _ exponentiation = number + '**' + _ + number | number # Note that this generates the parse trees for division and # subtraction with the wrong associativity. multiplicative = (exponentiation + '/' + _ + multiplicative | exponentiation + '*' + _ + multiplicative | exponentiation) additive = (multiplicative + '+' + _ + additive | multiplicative + '-' + _ + additive | multiplicative) if __name__ == '__main__': print(Example.sentence.definition) print(Example.sp.definition) print(Example._.definition) print(Example.stringcontents.definition) print(Example.meta.definition) print(Example.response.parse('Yes.')) print(Example.response.parse('no.')) print(Arithmetic.additive.parse('13 + 2 * 3 ** 4 + 5'))