#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Convert a text file into a tree in graphviz format.

This is related to flat-dict-lang.md in Dercuano.  The idea is that
this text

    this is a test
              farm animal
            too much

represents the minimal unordered edge-labeled tree containing the
following paths:

    this is a test
    this is a farm animal
    this is too much

So for each word we store its start position, so that we know when a
later line has outdented back to it and we can pop it off the stack.
Also, each stack entry records the id of the node that the word's edge
leads to, so that words appearing later can be attached beneath it.

Columns are counted in characters, so a tab counts as a single column.
"""
from __future__ import print_function
import re
import sys

# A word is a maximal run of non-whitespace characters.
_WORD = re.compile(r'\S+')


def parse(lines):
    """Parse indented text into a list of labeled tree edges.

    Each word becomes an edge labeled with that word.  The first word
    of a line hangs off the most recent word that starts in a strictly
    smaller column (or off the root when there is none); every further
    word on the line hangs off the word before it.

    Lines containing no words are ignored: they carry no edges, and
    they must not disturb the indentation context of the lines around
    them.

    Args:
        lines: an iterable of strings, one per input line (trailing
            newlines are allowed).

    Returns:
        A list of (parent_id, word, child_id) tuples in input order.
        The root has id 0; other nodes are numbered from 1 in order of
        appearance.
    """
    stack = []      # (start_column, node_id) of the currently open words
    edges = []
    next_id = 1

    for line in lines:
        words = list(_WORD.finditer(line))
        if not words:
            # Blank or whitespace-only line: skip it rather than let
            # its length decide how far the stack gets popped.
            continue

        # Close every word that starts at or to the right of this
        # line's first word; what remains on top is the parent.
        indent = words[0].start()
        while stack and stack[-1][0] >= indent:
            stack.pop()

        for match in words:
            parent = stack[-1][1] if stack else 0
            edges.append((parent, match.group(0), next_id))
            stack.append((match.start(), next_id))
            next_id += 1

    return edges


def _dot_escape(text):
    """Escape text for use inside a double-quoted DOT string.

    Backslashes are doubled first so that the backslashes introduced
    when escaping double quotes are not themselves doubled.
    """
    return text.replace('\\', '\\\\').replace('"', '\\"')


def graphviz(edges, name='cosas'):
    """Generate graphviz DOT source for a list of edges.

    Args:
        edges: an iterable of (start, label, end) tuples, such as the
            list returned by parse().
        name: the graph name, written verbatim after ``digraph``.

    Yields:
        Strings which, concatenated, form the DOT source.  Edge labels
        are escaped so that words containing double quotes or
        backslashes still produce valid DOT.
    """
    yield 'digraph %s {\n' % (name,)
    yield ' rankdir=LR;\n'
    yield ' node [label="", shape=circle];\n'
    for start, label, end in edges:
        yield ' %s -> %s [label="%s"];\n' % (start, end, _dot_escape(label))
    yield '}\n'


if __name__ == '__main__':
    sys.stdout.writelines(graphviz(parse(sys.stdin)))