#!/usr/bin/python3
"""Prototype of the Monkey’s Paw pattern-matching and parsing engine.

I had used and worked on a few different PEG parsing engines, including
Darius Bacon’s remarkable parson, my own peg-bootstrap, and the well-known
Hammer, as well as a reimagination of D. Val Schorre’s META-II called
Meta5ix, and I wanted something reasonably easy both to implement and to
use. I’m not there yet, but where I’m headed is being able to write a
pattern like this:

    _ ")" {sum} | {parse_int(number)} | {read_var(var)}> _
    {t} | "-" _ {minus(t)})*> {add(t, sum(ts))}> {sum}

and get a runnable interpreter or compiler out of it, given the functions
named.

Captured strings are represented as Erlang-style IO lists: an IO list is
either a string or a tuple of IO lists. The ``stringify`` function
implements this:

>>> stringify('foo')
'foo'
>>> stringify((('a', 'b'), 'c'))
'abc'

The ``unite`` function implements the desired union semantics in terms of
these IO lists and Python dictionaries:

>>> unite((), ())
((), ())
>>> unite('foo', 'bar')
('foo', 'bar')
>>> unite('bar', {'foo': 'Baez'})
{'foo': 'Baez'}
>>> unite({'foo': 'Baez'}, 'bar')
{'foo': 'Baez'}
>>> unite({'a': 'b'}, {'c': 'd'})
{'a': 'b', 'c': 'd'}

XXX lists? From repetition?

Collisions are represented as None:

>>> unite({'a': 'b'}, {'a': 'b', 'c': 'd'})
{'a': None, 'c': 'd'}

``Lit`` is a pattern to match literal strings. Calls to ``match`` return a
tuple (ok, result). ok is False if the match failed, in which case result
will be (at the moment) None. Otherwise, ok is True and result is a tuple
(off, val), where val is either a dict or an IO list as described above.

>>> yes = Lit('yes')
>>> ok, result = yes.match('yesterday')
>>> ok
True
>>> result[0]
3
>>> stringify(result[1])
'yes'
>>> ok, result = yes.match('ye')
>>> ok
False
>>> yes.match('today')[0]
False

Pattern concatenation is done with ``+``, producing ``Cat`` nodes.

>>> ok, result = (yes + Lit('ter')).match('yesterday')
>>> ok
True
>>> result[0]
6
>>> stringify(result[1])
'yester'

Capturing named results is done with ``Cap``. Because of the semantics of
``unite``, this discards any non-captured results. It also collapses any
IO lists into strings.

>>> (yes + Cap('aw', Lit('ter'))).match('yesterday')
(True, (6, {'aw': 'ter'}))

This is more conveniently spelled with the ``.yclept`` method:

>>> (yes.yclept('yus') + Lit('ter')).match('yesterday')
(True, (6, {'yus': 'yes'}))

Capture groups can be nested to produce nested dictionaries, but only the
innermost capture groups capture raw characters. The outer groups only
capture inner groups.

>>> (yes.yclept('yus') + Lit('ter').yclept('tor')).yclept('yo').match('yesterday')
(True, (6, {'yo': {'yus': 'yes', 'tor': 'ter'}}))

Alternatives are separated by ``|``, producing an ``Alt`` node, which as
usual has lower precedence than ``+``.

>>> xy = (Lit('x') | Lit('y')).yclept('xy')
>>> xy.match('foo')[0]
False
>>> xy.match('xa')
(True, (1, {'xy': 'x'}))
>>> xy.match('yb')
(True, (1, {'xy': 'y'}))

Recursion is supported with the ``Defer`` node, to which you can pass a
lambda which produces the desired parse node on demand.

>>> xs = Defer(lambda: Lit('x') + xs | Lit(''))
>>> xn = xs.yclept('xs')
>>> xn.match('glorp')
(True, (0, {'xs': ''}))
>>> xn.match('xanga')
(True, (1, {'xs': 'x'}))
>>> xn.match('xxxfiles')
(True, (3, {'xs': 'xxx'}))

Character classes are supported with ``Cset``, which takes an iterable of
characters.
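For instance, a class of decimal digits (a small extra example; nothing
else in this file depends on it):

>>> digit = Cset('0123456789')
>>> digit.match('42nd')
(True, (1, '4'))
>>> digit.match('x')[0]
False

Combining ``Cset`` with ``Defer`` gives a word matcher:
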
>>> let = Cset(chr(c) for c in range(ord('a'), ord('z') + 1))
>>> lets = Defer(lambda: let + lets | let)
>>> word = lets.yclept('word')
>>> word.match('we are lost')
(True, (2, {'word': 'we'}))

Because the right argument of the alternation above was ``let`` rather than
empty, our ``word`` will fail if it can’t match at least one letter:

>>> word.match(' we are lost')[0]
False

While, as illustrated above, you can get repetition with recursion, more
convenient repetition is provided with the ``.any()`` and ``.some()``
methods. These produce lists of parsed results. ``.any()`` allows zero
repetitions; ``.some()`` does not. They both return ``Repeat`` nodes.

>>> phrase = (word + Lit(' ').any()).some()
>>> phrase.match('we are lost')
(True, (11, [{'word': 'we'}, {'word': 'are'}, {'word': 'lost'}]))
>>> phrase.yclept('p').match('we are lost')
(True, (11, {'p': [{'word': 'we'}, {'word': 'are'}, {'word': 'lost'}]}))

"""
from collections import namedtuple
import doctest


def unite(a, b):
    """Implement union semantics from the Monkey’s Paw."""
    if type(a) is not dict and type(b) is not dict:
        return (a, b)
    if type(a) is not dict:
        return b
    if type(b) is not dict:
        return a
    rv = a.copy()
    for k, v in b.items():
        if k in rv:
            rv[k] = None        # collision indicator
        else:
            rv[k] = v
    return rv


def normalize(o):
    return stringify(o) if type(o) in (str, tuple) else o


def stringify(o):
    b = bytearray()
    stringify_put(b, o)
    return b.decode('utf-8')


def stringify_put(b, o):
    if type(o) is str:
        b += o.encode('utf-8')
        return
    for item in o:
        stringify_put(b, item)


class Parser:
    """Base class giving every pattern the combinator operators."""

    def __add__(self, other):
        return Cat(self, other)

    def yclept(self, name):
        return Cap(name, self)

    def __or__(self, other):
        return Alt(self, other)

    def any(self):
        return Repeat(self, empty_ok=True)

    def some(self):
        return Repeat(self, empty_ok=False)


class Lit(Parser, namedtuple('Lit', ('s',))):
    def match(self, target, off=0):
        end = off + len(self.s)
        if len(target) < end:
            return False, None
        elif target[off:end] == self.s:
            return True, (end, self.s)
        else:
            return False, None


class Cat(Parser, namedtuple('Cat', ('left', 'right'))):
    def match(self, target, off=0):
        ok, result = self.left.match(target, off)
        if not ok:
            return ok, None
        off, left = result
        ok, result = self.right.match(target, off)
        if not ok:
            return ok, None
        off, right = result
        assert type(off) is int, (self, off)
        return ok, (off, unite(left, right))


class Cap(Parser, namedtuple('Cap', ('name', 'kid'))):
    def match(self, target, off=0):
        ok, result = self.kid.match(target, off)
        if not ok:
            return ok, result
        off, val = result
        return ok, (off, {self.name: normalize(val)})


class Alt(Parser, namedtuple('Alt', ('left', 'right'))):
    def match(self, target, off=0):
        # XXX I think we need to backtrack side effects here
        ok, result = self.left.match(target, off)
        if ok:
            return ok, result
        return self.right.match(target, off)


class Defer(Parser, namedtuple('Defer', ('thunk',))):
    def match(self, target, off=0):
        # Build the deferred node on demand, so grammars can refer to themselves.
        return self.thunk().match(target, off)


class Cset(Parser):
    def __init__(self, cs):
        self.cs = set(cs)

    def match(self, target, off=0):
        if off < len(target) and target[off] in self.cs:
            return True, (off+1, target[off])
        else:
            return False, None


class Repeat(Parser, namedtuple('Repeat', ('kid', 'empty_ok'))):
    def match(self, target, off=0):
        results = []
        ok = self.empty_ok
        while True:
            # XXX backtrack side effects here too
            nok, result = self.kid.match(target, off)
            if not nok:
                assert type(off) is int, (self, off)
                return ok, (off, results)
            noff, val = result
            assert type(noff) is int, (self, noff)
            if noff <= off:
                raise ValueError('repetition of node failing to consume',
                                 self, target, off, noff)
            ok, off = nok, noff
            results.append(val)

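# A small usage sketch rather than part of the engine: the grammar, the
# function name, and the capture name 'w' below are invented for
# illustration, using only the combinators defined above.
def _example_comma_words():
    """Match comma-separated lowercase words, capturing each one as 'w'.

    >>> _example_comma_words().match('foo,bar,baz')
    (True, (11, [{'w': 'foo'}, {'w': 'bar'}, {'w': 'baz'}]))
    >>> _example_comma_words().match('!?')[0]
    False
    """
    letter = Cset(chr(c) for c in range(ord('a'), ord('z') + 1))
    letters = Defer(lambda: letter + letters | letter)  # one or more letters
    word = letters.yclept('w')                          # capture them as 'w'
    return (word + Lit(',').any()).some()
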
if __name__ == '__main__':
    doctest.testmod()