#!/usr/bin/python # -*- coding: utf-8 -*- """A fun nighttime pseudo-livecoding exercise: how much of SQL can I hack together quickly? This was in some sense a response to the frustrations of usql.py, where I rapidly hit the limitations of the subset of SQL you can reasonably parse with regular expressions. So here I hacked together enough of a non-memoizing PEG parsing engine to parse a tiny subset of SQL, then made a vague gesture at turning it into an AST you could conceivably evaluate. Like usql.py, this also took a couple of hours, but because I was working on the frontend rather than the backend, it can’t really evaluate queries. Like, I have two pages of PEG parsing code, and then at the end a bit less than a page of SQL parsing code. usql.py has roughly 2½ pages of query evaluation code and half a page of parsing. I say “pseudo-livecoding” because, although I was trying to hack this together quickly and in a pretty comprehensible way, nobody was actually watching. I was hoping to record snapshots along the way and go back and look at them later, but after an hour or so I realized that I didn’t have version-control enabled in this Emacs, so I lost the snapshots I thought I was saving. Things I learned ---------------- The clumsiness of the .map() mechanism given here is pretty painful; maybe some more apt kind of data model would be better. The visually associative + operator isn’t really associative. A really simple tweak would be for the primitive terminals to return lists as their default values and for + to by default concatenate the lists from its arguments. Then the ignoring of whitespace could be done down in the whitespace production itself, by returning an empty list, rather than in its parents. Alternatively, maybe you could return dicts of lists, so that the dicts are merged together using concatenation, again, by default. You could introduce nesting explicitly but it wouldn’t happen just by concatenating things. Flat is better than nested! A different approach would be the tagging-parser idea I explored in Dercuano; possibly an interface like ElementTree, or something even more magical, could work for that. The idea is that you wrap parts of the grammar with "tag(tagname, grammarbits)" and the tagname gets wrapped around the grammarbits. The relative precedence of + and / is wrong. | might work better; it has lower precedence than + in Python. I should probably have used the pprint module earlier. And attrs. Or at least namedtuple. Some separate kind of error object might have been better. Then I could report where the parse errors happen. It’s worth thinking about how to make memoization easily work for this. Also, not using the Python call stack for concatenation, because that is going to end in tears. It took me way too long to figure out that my apostrophe-delimited string was just consuming the closing apostrophe as part of the string. (And then the solution I chose was to add ButNot.) The sep pattern can reasonably be used for operator precedence parts of the grammar, but in that case the separators should be things like '+' + _ | '-' + _, which means that we don’t want to throw them away. This would incidentally have eliminated all the recursion from this grammar, so it could in theory be a single giant parsing expression. Really, if you’re going to all the trouble to write a PEG parsing engine anyway, it might be worthwhile to use it to parse not just SQL but also the PEG. Then you can do things like use prefix or postfix * or + operators, use concatenation for concatenation without an explicit operator, and write literal strings without quotes (at the expense of tagging nonterminals with :sigils of$ .some ^kind [distinguish] «them».) The only trouble with that is that if you want Python code mixed in there (whether to transform results or to reject candidates), you’re going to have to eval it, and then there’s the question of precisely which lexical environment you do that in; a better alternative might be to attach #tags to things in the grammar and then separately attach the program fragments to them. This shitty Genius keyboard is hard to use the function keys on in the dark; I kept hitting F6, which I have bound to a refactoring command, instead of F5, recompile. """ from __future__ import print_function import pprint import readline class Production: def __div__(self, other): return OrderedChoice(self, as_production(other)) def __add__(self, other): return Concatenation(self, as_production(other)) def __radd__(self, other): return as_production(other) + self def __sub__(self, other): return ButNot(self, as_production(other)) def some(self): return (self + self.any()).map(lambda items: [items[0]] + items[1]) def any(self): return KleeneClosure(self) def map(self, transform): return Map(self, transform) class CharRange(Production): def __init__(self, start, end): self.start, self.end = start, end def parse(self, s, pos=0): if pos >= len(s): return None c = s[pos] if self.start <= c <= self.end: return c, pos+1 return None char_range = CharRange class DeferredProduction(Production): def resolve(self, delegate): self.delegate = delegate def parse(self, s, pos=0): return self.delegate.parse(s, pos) defer = DeferredProduction class Concatenation(Production): def __init__(self, a, b): self.a, self.b = a, b def parse(self, s, pos=0): left = self.a.parse(s, pos) if not left: return left value, pos = left right = self.b.parse(s, pos) if not right: return right rval, pos = right return (value, rval), pos class KleeneClosure(Production): def __init__(self, contents): self.contents = contents def parse(self, s, pos=0): vals = [] while True: result = self.contents.parse(s, pos) if not result: return vals, pos val, npos = result vals.append(val) assert npos > pos pos = npos class OrderedChoice(Production): def __init__(self, a, b): self.a, self.b = a, b def parse(self, s, pos=0): left = self.a.parse(s, pos) if left: return left return self.b.parse(s, pos) class ButNot(Production): def __init__(self, yes, no): self.yes, self.no = yes, no def parse(self, s, pos=0): left = self.yes.parse(s, pos) if not left: return left right = self.no.parse(s, pos) if right: return None return left class AnyByte(Production): def parse(self, s, pos=0): if pos >= len(s): return None return s[pos], pos+1 class Map(Production): def __init__(self, contents, transformation): self.contents, self.transformation = contents, transformation def parse(self, s, pos=0): result = self.contents.parse(s, pos) if not result: return result val, pos = result return self.transformation(val), pos any_byte = AnyByte() def as_production(thing): if isinstance(thing, Production): return thing if isinstance(thing, str): return literal(thing) raise ValueError(thing) class Literal(Production): def __init__(self, contents): self.contents = contents def parse(self, s, pos=0): end = pos+len(self.contents) if s[pos:end] == self.contents: return self.contents, end return None literal = Literal def sep(item, separator): def desep(items): rv = [items[0]] for item in items[1]: rv.append(item[1]) return rv return (item + (separator + item).any()).map(desep) ### SQL class Expr: pass class Name(Expr): def __init__(self, name): self.name = name def __repr__(self): return 'Name(%r)' % self.name class Litstring(Expr): def __init__(self, chars): self.contents = ''.join("'" if c == "''" else c for c in chars) def __repr__(self): return 'Litstring(%r)' % self.contents def sql(): letter = char_range('a', 'z') / char_range('A', 'Z') wsp = literal(' ').any() def _(contents): return (contents + wsp).map(lambda xs: xs[0]) name = letter.some().map(lambda xs: Name(''.join(xs))) litstring = ("'" + ((any_byte - "'") / "''").any() + "'" ).map(lambda xs: Litstring(xs[0][1])) expr2 = _(name / litstring) expr = defer() expr.resolve((_(expr2) + _('=') + _(expr)) / expr2) expr3 = defer() expr3.resolve((_(expr) + _(literal('and') / literal('or')) + _(expr3)) / expr) commasep = lambda item: sep(item, _(',')) return ((_('select') + _(commasep(expr))).map(lambda xs: {'cols': xs[1]}) + (_('from') + _(commasep(name))).map(lambda xs: {'tbls': xs[1]}) + ((_('where') + expr3).map(lambda xs: {'select': xs[1]}) / literal('').map(lambda xs: {'select': True}))) def sqlparse(s): return sql().parse(s) if __name__ == '__main__': s = "select ack, bag from foo, bar where x = bag and foo = 'isn''t'" print(s) pprint.pprint(sqlparse(s)) while True: try: s = raw_input('?- ') except EOFError: print() break pprint.pprint(sqlparse(s))