#!/usr/bin/python """Make a KWIC index of an input stream in tiny code. Like this Perl, but more memory-efficient: while (<>) { @w = split; push @lines, "@w[$_+1..$#w] | @w[0..$_] $.\n" for (0..$#w-1); } print sort @lines; The idea here is to 1. assign an id to each word, storing a list of ids for each line 2. compute the lexicographical ordering permutation of those ids 3. rewrite the id list for each line using that permutation, so that the lexicographical ordering of line representations would reflect the underlying representation 4. compute a list of cyclic permutation points 5. sort that list with a custom comparison function 6. output the words """ import sys pos, aw, a = {}, [], [] # Each word is represented by index in aw for r in sys.stdin: # Build list of lists of word ids in a b = r.split() for w in b: if w not in pos: # Assign an id to the new word pos[w] = len(aw) aw.append(w) a.append([pos[w] for w in b]) # Encode the line with word ids g = sorted(range(len(aw)), key=aw.__getitem__) # Compute word sort permutation v = [None] * len(aw) # Invert the permutation... for i, j in enumerate(g): v[j] = i # Now v[g[i]] == i. a = [[v[w] for w in b] for b in a] # Rewrite lines with permuted ids cs = [(ln, wn) for ln, b in enumerate(a) for wn, _ in enumerate(b)] cs.sort(lambda (lnx, wnx), (lny, wny): cmp(a[lnx][wnx:] + a[lnx][:wnx], a[lny][wny:] + a[lny][:wny])) p = lambda ws: ' '.join(aw[g[w]] for w in ws) # map back to original words sys.stdout.writelines("%5d: %s | %s\n" % (ln, p(a[ln][wn:]), p(a[ln][:wn])) for ln, wn in cs)