#!/usr/bin/python3 """Build a directed acyclic word graph. This takes 17 seconds and 270383 nodes to represent my 99171-word /usr/share/dict/words. More worryingly, it also takes 110MB of RAM, while the original file is only 0.9MB. (Without using ``__slots__`` it's over 170MB.) This seems like an unreasonable amount of space: 407 bytes per binary DAG node, even with ``__slots__``! A large majority of the nodes are sort of wasted, too, in that they contain only a single branch. This output is amusing: head -100 /usr/share/dict/words | ./dawg.py --graphviz | dot -Tx11 """ import weakref, sys try: unicode except NameError: unicode = str class Node: "Represent a binary node of a DAWG." __slots__ = 'if0', 'if1', 'eow', '__weakref__' def __init__(self, if0, if1, eow): self.if0, self.if1, self.eow = if0, if1, eow nodes = weakref.WeakValueDictionary() def node(if0, if1, eow): "Hash cons nodes." k = if0, if1, eow if k in nodes: return nodes[k] node = Node(if0, if1, eow) nodes[k] = node return node _a = node(None, None, True) _b = node(_a, None, True) _c = node(None, None, True) assert _a is _c assert _a is not _b assert _b.if0 is _a def _join(wordbits, tree, index, value): "Add a word, represented as a list of bits, to a DAWG, with an associated value." if index == len(wordbits): # XXX as Darius Bacon points out, it’s probably better to use # a special self-referential sentinel node for these paths # rather than None if tree is None: return node(None, None, value) return node(tree.if0, tree.if1, value) if wordbits[index]: if tree is None: return node(None, _join(wordbits, None, index+1, value), False) return node(tree.if0, _join(wordbits, tree.if1, index+1, value), tree.eow) if tree is None: return node(_join(wordbits, None, index+1, value), None, False) return node(_join(wordbits, tree.if0, index+1, value), tree.if1, tree.eow) def _search(wordbits, tree, index): "Return whether a word, represented as a list of bits, is in a DAWG." if tree is None: return False if index == len(wordbits): return tree.eow if wordbits[index]: return _search(wordbits, tree.if1, index+1) return _search(wordbits, tree.if0, index+1) def join(word, tree, value=True): "Add a word to a DAWG, optionally with the associated value." # There’s an amusing bug here where if you specify the value 1, it # will (probably) be turned into True, because test node _a above # is already in the hash-consing dictionary, and it will compare # equal to True, for hysterical raisins. More seriously, this # could also happen to 0, which would become False! return _join(bittify(word), tree, 0, value) def search(word, tree): "Return whether a word is in a DAWG." return _search(bittify(word), tree, 0) def bittify(word): "Convert string word to a list of bits." if not isinstance(word, bytes): word = word.encode('utf-8') masks = [1 << i for i in range(8)] return [1 if c & mask else 0 for c in word # XXX this won’t work in Python 2 for mask in masks] def build(words, tree=None): for word in words: tree = join(word, tree) return tree _w = build(['tap', 'taps', 'top', 'tops']) assert search('tap', _w) is True assert search('ta', _w) is False assert search('taps', _w) is True assert search('tops', _w) is True assert search('topsy', _w) is False assert search('tab', _w) is False assert search('to', _w) is False _w = join('to', _w, 37) assert search('tap', _w) is True assert search('ta', _w) is False assert search('taps', _w) is True assert search('tops', _w) is True assert search('topsy', _w) is False assert search('tab', _w) is False assert search('to', _w) == 37 def weight(tree): stack = [tree] found = {None} while stack: item = stack.pop() if item not in found: found.add(item) stack.append(item.if0) stack.append(item.if1) return len(found) assert weight(None) == 1 assert weight(_a) == 2 assert weight(_b) == 3 class Labeler: def __init__(self): self.i = 0 self.labels = {} def __getitem__(self, item): if item not in self.labels: self.i += 1 self.labels[item] = self.i return self.labels[item] def graphviz(tree): yield 'digraph dawg {\n rankdir=LR\n' stack = [tree] labels = Labeler() visited = set() i = 1 while stack: item = stack.pop() if item not in visited: visited.add(item) label = "" if item.eow is False else str(item.eow).replace('"', '\\"') yield ' %s [label="%s"];\n' % (labels[item], label) if item.if0 is not None: yield ' %s -> %s [label=0];\n' % (labels[item], labels[item.if0]) stack.append(item.if0) if item.if1 is not None: yield ' %s -> %s [label=1];\n' % (labels[item], labels[item.if1]) stack.append(item.if1) yield '\n' yield '}\n'; if __name__ == '__main__': if '--linenumbers' in sys.argv: tree = None for i, line in enumerate(sys.stdin): # Convert 0 and 1 to strings to avoid the bug mentioned # above where they turn into False and True :) tree = join(line.strip(), tree, str(i)) elif '--linecontents' in sys.argv: tree = None for line in sys.stdin: line = line.strip() tree = join(line, tree, line) else: tree = build(line.strip() for line in sys.stdin) if '--graphviz' in sys.argv: sys.stdout.writelines(graphviz(tree)) else: print(weight(tree))