#!/usr/bin/python
"""Make a KWIC index of an input stream in tiny code.
Like this Perl, but more memory-efficient:
while (<>) {
@w = split;
push @lines, "@w[$_+1..$#w] | @w[0..$_] $.\n" for (0..$#w-1);
}
print sort @lines;
The idea here is to
1. assign an id to each word, storing a list of ids for each line
2. compute the lexicographical ordering permutation of those ids
3. rewrite the id list for each line using that permutation, so that
the lexicographical ordering of line representations would reflect
the underlying representation
4. compute a list of cyclic permutation points
5. sort that list with a custom comparison function
6. output the words
"""
import sys
pos, aw, a = {}, [], [] # Each word is represented by index in aw
for r in sys.stdin: # Build list of lists of word ids in a
b = r.split()
for w in b:
if w not in pos: # Assign an id to the new word
pos[w] = len(aw)
aw.append(w)
a.append([pos[w] for w in b]) # Encode the line with word ids
g = sorted(range(len(aw)), key=aw.__getitem__) # Compute word sort permutation
v = [None] * len(aw) # Invert the permutation...
for i, j in enumerate(g): v[j] = i # Now v[g[i]] == i.
a = [[v[w] for w in b] for b in a] # Rewrite lines with permuted ids
cs = [(ln, wn) for ln, b in enumerate(a) for wn, _ in enumerate(b)]
cs.sort(lambda (lnx, wnx), (lny, wny): cmp(a[lnx][wnx:] + a[lnx][:wnx],
a[lny][wny:] + a[lny][:wny]))
p = lambda ws: ' '.join(aw[g[w]] for w in ws) # map back to original words
sys.stdout.writelines("%5d: %s | %s\n" % (ln, p(a[ln][wn:]), p(a[ln][:wn]))
for ln, wn in cs)