#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Encode bytes with move-to-front coding. Not useful yet.
The idea of move-to-front coding is to make text more compressible
with a static technique like Huffman coding by taking some minimal
advantage of local structure.
It does seem to work in the sense of being a reversible
transformation:
>>> mtf_file = open('mtf.py').read()
>>> ''.join(mtf.decode(mtf.encode(mtf_file))) == mtf_file
True
However, it doesn’t seem to actually work to increase compression;
combining (an earlier version of) this file with another
similarly-sized file written in another language, the encoding worsens
the message entropy:
>>> mtf.entropy(collections.Counter((open('obliquing-italics.html').read()
+ open('mtf.py').read())).most_common(256))
12610.656937935639
>>> mtf.entropy(collections.Counter(mtf.encode(open('obliquing-italics.html').read()
+ open('mtf.py').read())).most_common(256))
13640.383124491518
Now, I also hacked together a funky modified version of this scheme
where each character “owns” some set of followers — the characters
whose most recent occurrence was following that character — and we
move-to-front *within* that set. Except if we need to steal a
character whose most recent occurrence was following some other
character; the other follow sets are counted in most-recently-used
order following the current one. This means that any sequence of
letters and spaces that follows the same sequence as last time is just
0, 0, 0, 0,..., and if you’re at a popular letter, the other letters
that most commonly follow it are likely to be 1, 2, or 3 (unless
someone stole them).
So, at the end of "this is a test", the follow sets are:
t: e, h
s: t
e: s
a: SPC
SPC: a, i
i:
h:
If we immediately follow this by encoding a second "this is a test",
we get:
2 (steal the t from s)
2 (the h; now t has h, t, e, and we are in the followerless h state)
6 (steal the i from SPC; now we have i: ; h: i; t: h, t, e; s: ; e: s ...)
4 (steal the s from e; now we are in the followerless s state:
s: ; i: s; h: i; (jinx) t: h, t, e; e: ; a: SPC; ...)
5 (steal the SPC from a; SPC will rarely be far away; it still owns a)
2 (steal the i from h for SPC)
0 (i is again followed by s)
0 (s is again followed by SPC)
1 (SPC is followed by a this time, which hadn’t been stolen yet:
a: ; SPC: i, a; s: SPC; i: s; h: ; t: h, t, e; e: ;)
2 (leap over a’s empty follower and SPC’s a, i to get to s’s SPC)
5 (steal t from itself; now we have SPC: t, i, a)
1 (e is the second remaining follower of t; now we have
e: ; t: h, e; SPC: t, i, a; a: ; s: SPC; i: s; h: ;)
7 (i, which we have to steal s from, is effectively in last place)
3 (steal the t from SPC)
The current scheme in this situation instead gives 0, 6, 6, 3, 5, 2,
2, 2, 6, 1, 5, 6, 4, 2, which at least to me looks like it’s less
biased to zeroes and ones and more biased to sixes. There’s no free
lunch, of course, but this looks promising...
My attempt below at implementing the funky modified encoding scheme
gives 2, 2, 5, 4, 6, 3, 0, 0, 1, 2, 5, 1, 6, 3 in that scenario,
which may be correct (it’s likely that I made errors simulating it in
my head above). However, the entropy of its output on this file is
even higher; it does indeed have a lot more zeroes, but this is more
than compensated for by flattening out the rest of the curve.
It *does* work better when applied to a PNM file (for which,
incidentally, ordinary move-to-front encoding is also a win, if a
smaller one):
>>> mtf.entropy(collections.Counter((open('whatsapp-background.pnm').read())).most_common(256))
495849.33522203815
>>> mtf.entropy(collections.Counter(mtf.encode(open('whatsapp-background.pnm').read())).most_common(256))
370610.0069765521
>>> mtf.entropy(collections.Counter(mtf.funkymodified_encode(open('whatsapp-background.pnm').read())).most_common(256))
318365.5798174492
However, the PNG of that PNM is less than half the
entropy, so it’s not competitive with LZ77:
>>> mtf.entropy(collections.Counter((open('whatsapp-background.png').read())).most_common(256))
144252.9590497413
"""
import math
def encode(text):
    """Yield the move-to-front code (as a chr) for each character of text.

    A character's code is its current position in a most-recently-used
    list of all 256 byte characters, so repeated and locally clustered
    characters encode to small code points. Input characters must be in
    range(256).
    """
    recent = [chr(i) for i in range(256)]
    for char in text:
        rank = recent.index(char)
        yield chr(rank)
        # Move the character to the front of the MRU list.
        del recent[rank]
        recent.insert(0, char)
def decode(codes):
    """Invert encode(): yield the original character for each code.

    The decoder maintains the same most-recently-used list as the
    encoder; moving each decoded character to the front is required to
    keep the two lists in lockstep (it mirrors the update in encode).
    """
    recent = [chr(i) for i in range(256)]
    for code in codes:
        rank = ord(code)
        char = recent[rank]
        yield char
        # Same MRU update as the encoder performs.
        del recent[rank]
        recent.insert(0, char)
def funkymodified_encode(text):
    """Encode text with the "funky modified" move-to-front scheme.

    Each character "owns" a follow set: the characters whose most
    recent occurrence followed it.  A letter's code counts its position
    within its owner's follow set, plus the sizes of the follow sets of
    every character more recently used than the owner (see the worked
    example in the module docstring).  Yields one chr(code) per input
    character.
    """
    # Initially every character's follow set contains just itself ...
    last_followers = {chr(i): [chr(i)] for i in range(256)}
    # ... and every character is its own owner.
    owners = {chr(i): chr(i) for i in range(256)}
    # All 256 characters in most-recently-used order.
    last = [chr(i) for i in range(256)]
    for letter in text:
        owner = owners[letter]
        pos_in_followers = last_followers[owner].index(letter)
        # Follow sets of characters used more recently than the owner
        # are "in the way" and add to the emitted code.
        intervening = sum(len(last_followers[obstacle])
                          for obstacle in last[:last.index(owner)])
        yield chr(intervening + pos_in_followers)
        # The letter is stolen by the previous letter: last[0] has not
        # been updated yet, so here it is still the prior input
        # character (chr(0) before any input has been seen).
        owners[letter] = last[0]
        # Remove the letter from its old owner's follow set, then
        # prepend it to its new owner's.  (If they are the same set,
        # this removal-then-insert is still the intended move-to-front.)
        last_followers[owner][pos_in_followers:pos_in_followers+1] = []
        last_followers[last[0]][:0] = [letter]
        # Finally move the letter itself to the front of the MRU order.
        pos_in_last = last.index(letter)
        last[pos_in_last:pos_in_last+1] = []
        last[:0] = [letter]
def entropy(items_freqs):
    """Return the number of bits needed for arithmetic coding of a sequence.

    items_freqs is a list of (item, frequency) pairs, e.g. the result of
    collections.Counter(...).most_common().
    """
    counts = [count for _item, count in items_freqs]
    grand_total = sum(counts)
    # Each occurrence of a symbol costs the log of its probability;
    # a symbol's total contribution is that times its count.
    per_symbol = [(math.log(count) - math.log(grand_total)) * count
                  for count in counts]
    # Dividing by log(1/2) converts natural-log units to bits (and
    # flips the sign, since the log-probabilities are negative).
    return sum(per_symbol) / math.log(0.5)
def _ok(a, b):
assert a == b, (a, b)
# Smoke test run at import time: the move-to-front transform must
# round-trip losslessly.
_ok(''.join(decode(encode('this is a test'))), 'this is a test')