#!/usr/bin/python
"""Represent lists of small integers in bytes in self-delimiting sequences.
The representation is not self-*describing*, only self-*delimiting*.
This representation, based on the Protocol Buffers signed int
representation, has these properties:
- numbers in [-63, 64] take one byte
- numbers in [-32767, 32768] take two bytes
- larger numbers take an extra byte per 7 bits
- a sequence takes one more byte than its constituent numbers
- it's reasonably fast to decode in C
- it supports arbitrarily large numbers
- numbers' and sequences' representations are unique
- all byte strings are prefixes of valid representations
These last two properties make the relation between sequences of ints
and sequences of bytes as close to 1-to-1 as the self-delimiting
constraint permits.
It also has these undesirable properties:
- numbers' representations are not lexicographically ordered (i.e. the
representation does not preserve lexicographic ordering)
- all byte strings are prefixes of valid representations
- the decoding process cannot safely resume from the middle of the
stream
- it is substantially slower than raw binary
The encoding of the sequence is a concatenation of self-delimiting
integer encodings, followed by a '\0' terminator. Integers are
encoded sign-magnitude big-endian in the low 7 bits of each nonfinal
byte, whose high bit is clear. In the final byte, the high bit is
set, the low bit represents the sign (1 for negative, 0 for positive),
and the 6 bits in the middle are the least-significant 6 bits of the
magnitude of the number. Encoding an integer is required to use the
minimal number of bytes, so leading zero bytes are not permitted.
'\0' bytes can occur in the middle of numbers, but not at the
beginning, so it is safe to use them to represent sequence
terminations.
"""
def dumps_array(a):
return ''.join(_dumps(a))
# Memo table to cut down on the work of serializing integers.
global_dumps_memo = {}
def _dumps(a):
dumps_memo = global_dumps_memo
for i in a:
try:
yield dumps_memo[i]
except KeyError:
if len(dumps_memo) > 1048576:
dumps_memo.clear()
dumps_memo[i] = rv = dumps_int(i)
yield rv
yield '\0'
def dumps_bytes(b):
b = str(b)
return dumps_int(len(b)) + b
def dumps_int(i):
b = []
i = abs(i) << 1 | (1 if i < 0 else 0)
while True:
b.append(i & 0x7f)
i >>= 7
if not i:
break
b[0] |= 0x80
return ''.join(chr(c) for c in reversed(b))
def load_array(s):
return list(_loads(s))
def load_int(s):
return _loads(iter(s)).next()
def load_bytes(s):
s = iter(s)
n = load_int(s)
return ''.join(s.next() for i in range(n))
def _loads(s):
n = 0
for b in s:
if n == 0 and b == '\0':
break
b = ord(b)
n = (n << 7) | (b & 0x7f)
if b & 0x80:
yield -(n >> 1) if n & 1 else n >> 1
n = 0
class EncodingError(Exception): pass
def ok(a, b): assert a == b, (a, b)
ok(load_array(dumps_array([10, -30, 5, 1000, 0, 1])), [10, -30, 5, 1000, 0, 1])
ok(load_array(dumps_array(range(-1000, 1000))), range(-1000, 1000))