#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Some noodling on how to write self-delimiting nested byte streams
in Python.

The stream is delimited at the end by a sequence of two NULs and is
guaranteed not to contain two NULs internally.

Internally it is a sequence of chunks; each chunk ends in a length
trailer, which may be one byte or two bytes.

If the trailer is one byte, that byte (call it N) is in the range
0x01 to 0x7f; it indicates that the chunk consists of N-1 bytes of
data, which are the previous N-1 bytes, followed by two NULs, which
are not represented explicitly.

If the trailer is two bytes, call those bytes N and M, where M is the
last byte.  M is in the range 0x80 to 0xff, and N is in the range
0x01 (FIXME) to 0xff.  Let P = ((M - 0x80) << 8) + N + 0x7e.  (FIXME)
The chunk consists of P bytes of data, which are the previous P
bytes, followed by two NULs, which are not represented explicitly,
unless P is max_chunk, which is 0x7fff + 0x7e (FIXME).

Additionally, the last chunk in the stream does not end in two NULs.
That is, chunks always end in “virtual” double-NULs, which are part
of the decoded content but not part of the encoding, with two
exceptions: the last chunk does not have a double-NUL appended, and
neither does any chunk of size max_chunk.

These somewhat complicated rules provide the following guarantees:

- The encoding overhead is never more than four bytes per stream plus
  0.006%.  At times it is negative.

- The sequence NUL NUL never occurs in the encoded body of the
  stream, only at its end.

- The stream faithfully preserves arbitrary binary data written to
  it.

- The stream is written purely sequentially, with at most one byte of
  buffering.

- Data that does not contain double-NULs is written in chunks large
  enough to amortize typical per-transaction costs into
  insignificance.

- When reading a stream of streams, it is possible to rapidly skip
  past substreams that are not of interest, looking at only about two
  out of every 16000 bytes.

However, these rules have the rather surprising consequence that you
must start reading the stream at the end.

Also, when a stream ends in a nested stream, you get an extra 0x01
byte added at the end: a phantom delimiter after the real delimiter
that actually ended the contents.  This could be avoided by using the
delimiters as substream *separators* rather than substream
*terminators*, which probably implies that writing the delimiter
shouldn’t be done by the close() method, but rather by its caller;
that in turn would mean that the streams aren’t self-delimiting any
longer.
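By way of illustration, and modulo the FIXMEs above: writing “ab”,
then the two NULs, then “cd”, and closing, yields the eight bytes 61
62 03 63 64 03 00 00.  The chunk “ab” gets the one-byte trailer 0x03,
meaning two bytes of data plus a virtual double-NUL; the chunk “cd”
gets the trailer 0x03 too, but as the last chunk it has no virtual
double-NUL; and the explicit double-NUL ends the stream.  A chunk of
0x7f data bytes is the smallest to need a two-byte trailer, N = 0x01
and M = 0x80, since ((0x80 - 0x80) << 8) + 0x01 + 0x7e = 0x7f; and a
chunk of 0x17e data bytes would get N = 0x00, the NUL low byte
complained about in the bug list below.
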
The current version of the program has the following bugs:

- max_chunk is almost certainly wrong

- the low byte of a two-byte length trailer might be NUL, and the
  last byte of the preceding data might also be NUL, so the encoding
  needs to change slightly

- the two-byte length trailers are probably wrong in general

- the max-length phantom-delimiter logic is untested

- it doesn’t buffer NULs on output to ensure they aren’t followed by
  another, separately written NUL

- there’s no reader yet, just the writer

- it uses an O(N²) algorithm for handling large writes, which causes
  it to pause for a minute or more if you write just a couple of megs
  to it

- the Writer should totally work as a context manager and doesn’t yet

"""
delimiter = '\0\0'
max_chunk = 0x7fff + 0x7e       # FIXME (see bug list above)


class Writer:
    def __init__(self, output):
        self.output = output
        self.chunk_so_far = 0   # data bytes in the current, unterminated chunk
        self.closed = False

    def __del__(self):
        if not self.closed:
            self.close()

    def write(self, data):
        assert not self.closed
        # each explicit double-NUL in the data becomes a chunk trailer
        while True:
            split = data.find(delimiter)
            if split == -1:
                break
            self.add_to_chunk(data[:split])
            self.encode_delimiter()
            data = data[split+len(delimiter):]
        self.add_to_chunk(data)

    def add_to_chunk(self, data):
        # FIXME forgetting the case where it ends in \0
        while True:
            spare = max_chunk - self.chunk_so_far
            if spare > len(data):
                self.output.write(data)
                self.chunk_so_far += len(data)
                return
            else:
                self.output.write(data[:spare])
                self.chunk_so_far += spare
                self.encode_delimiter()  # a phantom delimiter
                data = data[spare:]

    def close(self):
        assert not self.closed
        self.encode_delimiter()          # another phantom delimiter
        self.output.write(delimiter)
        self.closed = True

    def encode_delimiter(self):
        # write the length trailer that ends the current chunk, then
        # start a new chunk
        if self.chunk_so_far <= 0x7e:
            self.output.write(chr(self.chunk_so_far + 1))
        else:
            to_encode = self.chunk_so_far - 0x7e
            # FIXME this could be a NUL byte.  Fixing this is going to
            # require changing the representation from base 256 to
            # base 255 (with excess-1 in the least significant byte
            # and excess-128 in the most significant byte).
            self.output.write(chr(to_encode & 0xff))
            high_byte = 0x80 + (to_encode >> 8)
            assert 0x80 <= high_byte <= 0xff
            self.output.write(chr(high_byte))
        self.chunk_so_far = 0


if __name__ == '__main__':
    import sys
    w1 = Writer(sys.stdout)
    if sys.argv[1:2] == ['-b']:
        # blob stress test
        w2 = Writer(w1)
        w2.write(open(sys.argv[2], 'rb').read())
        w2.close()
    elif len(sys.argv) != 1:
        # Simple archive file format
        for arg in sys.argv[1:]:
            w1.write(arg + delimiter)
            w2 = Writer(w1)
            for line in open(arg):
                while line[-1:] in ('\r', '\n'):
                    line = line[:-1]
                # let’s assume no \0\0 in inputs
                w2.write(line + delimiter)
            w2.close()
    else:
        # Database table example
        for ii in range(4):
            w2 = Writer(w1)
            w2.write('row' + delimiter)
            for jj, f in enumerate(['foo', 'bar', 'quux']):
                if jj:
                    w2.write(delimiter)
                w2.write(f)
                w2.write(str(ii) * (ii+1))
            w2.close()
    w1.close()

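
# By way of illustration only, since there’s no reader yet: a minimal
# sketch of the backward decoding the format description implies, for a
# single stream already in memory as a str.  It trusts the trailers
# exactly as the writer above emits them, FIXMEs and all, so it is no
# more correct than the writer is; decode() is a hypothetical name, not
# part of the program proper.
def decode(encoded):
    assert encoded.endswith(delimiter)
    pos = len(encoded) - len(delimiter)  # drop the final explicit NUL NUL
    chunks = []
    is_last_chunk = True        # i.e. the chunk nearest the end
    while pos > 0:
        m = ord(encoded[pos-1])
        if m <= 0x7f:           # one-byte trailer N: N-1 bytes of data
            size = m - 1
            pos -= 1
        else:                   # two-byte trailer: M is last, N just below it
            n = ord(encoded[pos-2])
            size = ((m - 0x80) << 8) + n + 0x7e
            pos -= 2
        data = encoded[pos-size:pos]
        pos -= size
        if not is_last_chunk and size != max_chunk:
            data += delimiter   # restore the virtual double-NUL
        chunks.append(data)
        is_last_chunk = False
    chunks.reverse()            # the chunks were decoded back to front
    return ''.join(chunks)

# For example, decode('ab\x03cd\x03\x00\x00') == 'ab\x00\x00cd', the
# stream from the worked example in the docstring.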