#!/usr/bin/python # -*- coding: utf-8 -*- """Split an input file into chunks consistently across insertions. This is like the hashsplitting done by bup, but less vulnerable to malicious input. """ import sys def main(argv): seq = seq_for_8_bytes(sys.stdin.read()) # This strategy of just using the same hash value for every level # is not actually that great. Also 16 hashes is too few levels of # granularity for useful amounts of multilevel hashsplitting. for sslist, _ in slopesplit(slopesplit(slopesplit(seq, 4), 4), 4): print '[' for slist in sslist: print ' [', for s in slist: print ' ' * 8 and repr(''.join(s)), print ' ]' print ']' def slopesplit(seq, n): """Transform (item, hashval) pairs into (item list, hashval) pairs. The lists are chunks averaging size n of the input sequence; the surviving hashval is the one of the last item in the chunk. The algorithm tends to split after items with unusually large hashvals, but also to keep the chunks around n items. The relative strengths of these tendencies can be adjusted by adjusting the size of the hashvals. If they are uniformly distributed over a range much larger than n²/4, the hashvals will have more influence, while if the range is much smaller than n²/4, the chunk sizes will instead be more consistent, at the expense of preferred split points. """ threshold = n buf = [] for item, k in seq: buf.append(item) threshold -= 1 if k > threshold: yield buf, k buf = [] threshold = k + n if buf: yield buf, k def seq_for_8_bytes(text): """Split text into chunks of roughly 8 bytes. We want the top 4 counts of the hash space to be roughly ¼ of the space, so hashing up to about 16 is OK. """ for c in text: o = ord(c) yield c, (o & 15) ^ (o >> 4) if __name__ == '__main__': sys.exit(main(sys.argv))