#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Some noodling on how to write self-delimiting nested byte streams
in Python.

The stream is delimited at the end by a sequence of two NULs and is
guaranteed not to contain two NULs internally.

Internally it is a sequence of chunks; each chunk ends in a length
trailer, which may be one byte or two bytes.

If the trailer is one byte, that byte (call it N) is in the range
0x01 to 0x7f; it indicates that the chunk consists of N-1 bytes of
data, which are the previous N-1 bytes, followed by two NULs, which
are not represented explicitly.

If the trailer is two bytes, call those bytes N and M, where M is the
last byte.  M is in the range 0x80 to 0xff, and N is in the range
0x01 (FIXME) to 0xff.  Let P = ((M - 0x80) << 8) + N + 0x7e.  (FIXME)
The chunk consists of P bytes of data, which are the previous P
bytes, followed by two NULs, which are not represented explicitly,
unless P is max_chunk, which is 0x7fff + 0x7e (FIXME).

Additionally, the last chunk in the stream does not end in two NULs.
That is, chunks always end in “virtual” double-NULs, which are part
of the decoded content but not part of the encoding, with two
exceptions: the last chunk does not have a double-NUL appended, and
neither does any chunk of size max_chunk.

These somewhat complicated rules provide the following guarantees:

- The encoding overhead is never more than four bytes per stream plus
  0.006%.  At times it is negative.

- The sequence NUL NUL never occurs in the encoded body of the
  stream, only at its end.

- The stream faithfully preserves arbitrary binary data written to
  it.

- The stream is written purely sequentially, with at most one byte of
  buffering.

- Data that does not contain double-NULs is written in chunks large
  enough to amortize typical per-transaction costs into
  insignificance.

- When reading a stream of streams, it is possible to rapidly skip
  past substreams that are not of interest, looking at only about two
  out of every 16000 bytes.

However, these rules have the rather surprising consequence that you
must start reading the stream at the end.

Also, when a stream ends in a nested stream, you get an extra 0x01
byte added at the end: a phantom delimiter after the real delimiter
that actually ended the contents.  This could be avoided by using the
delimiters as substream *separators* rather than substream
*terminators*, which probably implies that writing the delimiter
shouldn’t be done by the close() method, but rather by its caller;
that in turn would mean that the streams aren’t self-delimiting any
longer.
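By way of illustration, and modulo the FIXMEs above: writing “ab”,
then the two NULs, then “cd”, and closing, yields the eight bytes 61
62 03 63 64 03 00 00.  The chunk “ab” gets the one-byte trailer 0x03,
meaning two bytes of data plus a virtual double-NUL; the chunk “cd”
gets the trailer 0x03 too, but as the last chunk it has no virtual
double-NUL; and the explicit double-NUL ends the stream.  A chunk of
0x7f data bytes is the smallest to need a two-byte trailer, N = 0x01
and M = 0x80, since ((0x80 - 0x80) << 8) + 0x01 + 0x7e = 0x7f; and a
chunk of 0x17e data bytes would get N = 0x00, the NUL low byte
complained about in the bug list below.
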
The current version of the program has the following bugs:

- max_chunk is almost certainly wrong

- the low byte of a two-byte length trailer might be NUL, and the
  last byte of the preceding data might also be NUL, so the encoding
  needs to change slightly

- the two-byte length trailers are probably wrong in general

- the max-length phantom-delimiter logic is untested

- it doesn’t buffer NULs on output to ensure they aren’t followed by
  another, separately written NUL

- there’s no reader yet, just the writer

- it uses an O(N²) algorithm for handling large writes, which causes
  it to pause for a minute or more if you write just a couple of megs
  to it

- the Writer should totally work as a context manager and doesn’t yet

"""
delimiter = '\0\0'
max_chunk = 0x7fff + 0x7e       # FIXME (see bug list above)


class Writer:
    def __init__(self, output):
        self.output = output
        self.chunk_so_far = 0   # data bytes in the current, unterminated chunk
        self.closed = False

    def __del__(self):
        if not self.closed:
            self.close()

    def write(self, data):
        assert not self.closed
        # each explicit double-NUL in the data becomes a chunk trailer
        while True:
            split = data.find(delimiter)
            if split == -1:
                break
            self.add_to_chunk(data[:split])
            self.encode_delimiter()
            data = data[split+len(delimiter):]
        self.add_to_chunk(data)

    def add_to_chunk(self, data):
        # FIXME forgetting the case where it ends in \0
        while True:
            spare = max_chunk - self.chunk_so_far
            if spare > len(data):
                self.output.write(data)
                self.chunk_so_far += len(data)
                return
            else:
                self.output.write(data[:spare])
                self.chunk_so_far += spare
                self.encode_delimiter()  # a phantom delimiter
                data = data[spare:]

    def close(self):
        assert not self.closed
        self.encode_delimiter()          # another phantom delimiter
        self.output.write(delimiter)
        self.closed = True

    def encode_delimiter(self):
        # write the length trailer that ends the current chunk, then
        # start a new chunk
        if self.chunk_so_far <= 0x7e:
            self.output.write(chr(self.chunk_so_far + 1))
        else:
            to_encode = self.chunk_so_far - 0x7e
            # FIXME this could be a NUL byte.  Fixing this is going to
            # require changing the representation from base 256 to
            # base 255 (with excess-1 in the least significant byte
            # and excess-128 in the most significant byte).
            self.output.write(chr(to_encode & 0xff))
            high_byte = 0x80 + (to_encode >> 8)
            assert 0x80 <= high_byte <= 0xff
            self.output.write(chr(high_byte))
        self.chunk_so_far = 0


if __name__ == '__main__':
    import sys
    w1 = Writer(sys.stdout)
    if sys.argv[1:2] == ['-b']:
        # blob stress test
        w2 = Writer(w1)
        w2.write(open(sys.argv[2], 'rb').read())
        w2.close()
    elif len(sys.argv) != 1:
        # Simple archive file format
        for arg in sys.argv[1:]:
            w1.write(arg + delimiter)
            w2 = Writer(w1)
            for line in open(arg):
                while line[-1:] in ('\r', '\n'):
                    line = line[:-1]
                # let’s assume no \0\0 in inputs
                w2.write(line + delimiter)
            w2.close()
    else:
        # Database table example
        for ii in range(4):
            w2 = Writer(w1)
            w2.write('row' + delimiter)
            for jj, f in enumerate(['foo', 'bar', 'quux']):
                if jj:
                    w2.write(delimiter)
                w2.write(f)
                w2.write(str(ii) * (ii+1))
            w2.close()
    w1.close()

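
# By way of illustration only, since there’s no reader yet: a minimal
# sketch of the backward decoding the format description implies, for a
# single stream already in memory as a str.  It trusts the trailers
# exactly as the writer above emits them, FIXMEs and all, so it is no
# more correct than the writer is; decode() is a hypothetical name, not
# part of the program proper.
def decode(encoded):
    assert encoded.endswith(delimiter)
    pos = len(encoded) - len(delimiter)  # drop the final explicit NUL NUL
    chunks = []
    is_last_chunk = True        # i.e. the chunk nearest the end
    while pos > 0:
        m = ord(encoded[pos-1])
        if m <= 0x7f:           # one-byte trailer N: N-1 bytes of data
            size = m - 1
            pos -= 1
        else:                   # two-byte trailer: M is last, N just below it
            n = ord(encoded[pos-2])
            size = ((m - 0x80) << 8) + n + 0x7e
            pos -= 2
        data = encoded[pos-size:pos]
        pos -= size
        if not is_last_chunk and size != max_chunk:
            data += delimiter   # restore the virtual double-NUL
        chunks.append(data)
        is_last_chunk = False
    chunks.reverse()            # the chunks were decoded back to front
    return ''.join(chunks)

# For example, decode('ab\x03cd\x03\x00\x00') == 'ab\x00\x00cd', the
# stream from the worked example in the docstring.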