"""Simple trie package to convert sets of words into deterministic regexps. """ class TrieNode: """Recursively represent a set of strings as a simple trie. Under some circumstances this might be useful for directly testing membership, doing key-value lookups, or simply storing the set compactly, but here we use it to generate a regular expression that Python’s sre module can use to efficiently find tokens from the set. This is a hack to train on a dictionary file with sort of reasonable accuracy. """ def __init__(self): self.children = [] # list of (char, TrieNode) pairs self.final = False # True if some string in the set ends here def __str__(self): return self.to_re() def to_re(self): body = '|'.join('%s%s' % (c, kid.to_re()) for c, kid in self.children) return '(?:%s)?' % body if self.final else '(?:%s)' % body def add_child(self, c): new_node = TrieNode() self.children.append((c, new_node)) return new_node def to_graphviz(trie): yield 'digraph trie {\n' stack = [trie] while stack: node = stack.pop() yield ' %s [label="%s"];\n' % (id(node), 'Λ' if node.final else '') for letter, child in node.children: yield ' %s -> %s [label="%s"];\n' % (id(node), id(child), letter) stack.append(child) yield '}\n' def graphviz_trie(wordlist): sys.stdout.writelines(to_graphviz(to_trie(wordlist))) def to_trie(wordlist): wordlist = sorted(wordlist) last_word = '' stack = [TrieNode()] for word in wordlist: n = prefix_len(word, last_word) while n < len(stack) - 1: stack.pop() for c in word[n:]: stack.append(stack[-1].add_child(c)) stack[-1].final = True last_word = word return stack[0] def prefix_len(word, last_word): n = 0 while (word[:n] == last_word[:n] and n <= len(word)): n += 1 return n - 1 def from_trie(trie): stack = [('', trie)] while stack: prefix, trie = stack.pop() if trie.final: yield prefix for c, subtrie in trie.children[::-1]: stack.append((prefix + c, subtrie))