#!/usr/bin/python """Represent lists of small integers in bytes in self-delimiting sequences. The representation is not self-*describing*, only self-*delimiting*. This representation, based on the Protocol Buffers signed int representation, has these properties: - numbers in [-63, 64] take one byte - numbers in [-32767, 32768] take two bytes - larger numbers take an extra byte per 7 bits - a sequence takes one more byte than its constituent numbers - it's reasonably fast to decode in C - it supports arbitrarily large numbers - numbers' and sequences' representations are unique - all byte strings are prefixes of valid representations These last two properties make the relation between sequences of ints and sequences of bytes as close to 1-to-1 as the self-delimiting constraint permits. It also has these undesirable properties: - numbers' representations are not lexicographically ordered (i.e. the representation does not preserve lexicographic ordering) - all byte strings are prefixes of valid representations - the decoding process cannot safely resume from the middle of the stream - it is substantially slower than raw binary The encoding of the sequence is a concatenation of self-delimiting integer encodings, followed by a '\0' terminator. Integers are encoded sign-magnitude big-endian in the low 7 bits of each nonfinal byte, whose high bit is clear. In the final byte, the high bit is set, the low bit represents the sign (1 for negative, 0 for positive), and the 6 bits in the middle are the least-significant 6 bits of the magnitude of the number. Encoding an integer is required to use the minimal number of bytes, so leading zero bytes are not permitted. '\0' bytes can occur in the middle of numbers, but not at the beginning, so it is safe to use them to represent sequence terminations. """ def dumps_array(a): return ''.join(_dumps(a)) # Memo table to cut down on the work of serializing integers. global_dumps_memo = {} def _dumps(a): dumps_memo = global_dumps_memo for i in a: try: yield dumps_memo[i] except KeyError: if len(dumps_memo) > 1048576: dumps_memo.clear() dumps_memo[i] = rv = dumps_int(i) yield rv yield '\0' def dumps_bytes(b): b = str(b) return dumps_int(len(b)) + b def dumps_int(i): b = [] i = abs(i) << 1 | (1 if i < 0 else 0) while True: b.append(i & 0x7f) i >>= 7 if not i: break b[0] |= 0x80 return ''.join(chr(c) for c in reversed(b)) def load_array(s): return list(_loads(s)) def load_int(s): return _loads(iter(s)).next() def load_bytes(s): s = iter(s) n = load_int(s) return ''.join(s.next() for i in range(n)) def _loads(s): n = 0 for b in s: if n == 0 and b == '\0': break b = ord(b) n = (n << 7) | (b & 0x7f) if b & 0x80: yield -(n >> 1) if n & 1 else n >> 1 n = 0 class EncodingError(Exception): pass def ok(a, b): assert a == b, (a, b) ok(load_array(dumps_array([10, -30, 5, 1000, 0, 1])), [10, -30, 5, 1000, 0, 1]) ok(load_array(dumps_array(range(-1000, 1000))), range(-1000, 1000))