#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Evaluate different mental hash algorithms.
I carry around a paper notebook with me instead of an electronic
PDA. I recently added an address-book section to it, but alphabetized
address books on paper are a pain because moving entries up and down
the page to insert a new entry is, at best, very time-consuming.
So I’m using a hash table with separate chaining. The table itself is
a 26x26 grid on the first page, with entry numbers in the squares. The
entries are sequential after that, and each entry has a space for a
“next” pointer in case there’s a collision --- that is, the number of
the next entry that shares the same hash value.
I’m currently using the first and last letter for the hash value,
since that’s easy to compute in my head. This program did an analysis
with the British National Corpus suggesting that this extracts about
two-thirds of the information in the word, about 7.6 bits per word. It
also suggests that the first and third letter would be slightly
better, at 7.9 bits per word.
I suspect that, with any reasonable hash function, this approach gives
much faster lookups for up to a few hundred entries than
alphabetization, in addition to avoiding the problem with inserting
new entries in a full space. But I haven’t done the test.
This program analyzes the performance of a variety of different hash
functions. The objective is to pick a hash function that is easy to
compute mentally and gives very few collisions. I have some
word-frequency data. Rather than compute an expected number of
collisions for different index sizes, I will merely compute how much
information-theoretical information each algorithm will preserve.
(You could presumably use this information to design an input method
too: type the first and third letter of a word, and the IME would
insert the most likely word given the context of the previous word or
two and the next word or two, then give you the opportunity to correct
it. (So you would enter the previous sentence as ‘yucupeuetiift dsa
ipmtto:tptefradtilto a wr,adteiewuistemslkwrgvtecno tepewro
toadtenxwro to’, plus corrections.))
Output is:
4.10 bits/word (39.8%)
3.75 bits/word (36.4%)
4.54 bits/word (44.0%)
7.17 bits/word (69.6%)
6.49 bits/word (62.9%)
6.35 bits/word (61.6%)
4.10 bits/word (39.8%)
6.49 bits/word (62.9%)
7.20 bits/word (69.8%)
6.94 bits/word (67.3%)
6.40 bits/word (62.0%)
5.95 bits/word (57.7%)
8.17 bits/word (79.2%)
6.77 bits/word (65.7%)
With stopwords ['the', 'of', 'and', 'to', 'a', 'in', 'it', 'is', 'was', 'that'] removed:
4.31 bits/word (37.0%)
3.75 bits/word (32.3%)
4.66 bits/word (40.1%)
7.64 bits/word (65.7%)
6.90 bits/word (59.3%)
6.60 bits/word (56.7%)
4.31 bits/word (37.0%)
6.90 bits/word (59.3%)
7.87 bits/word (67.6%)
7.71 bits/word (66.3%)
7.18 bits/word (61.7%)
6.66 bits/word (57.2%)
8.87 bits/word (76.2%)
7.22 bits/word (62.1%)
"""
from __future__ import division
import math, string
def total_bits(symbol_frequencies):
    """Total information, in bits, conveyed by a stream of symbols.

    `symbol_frequencies` maps each symbol to its occurrence count. A
    symbol seen `n` times out of `m` total occurrences has probability
    `n`/`m`, so each of its occurrences conveys -lg(n/m) = lg(m/n)
    bits, and the symbol as a whole contributes n * lg(m/n) bits.
    """
    total = sum(symbol_frequencies.values())
    contributions = [count * (math.log(total / count) / math.log(2))
                     for count in symbol_frequencies.values()]
    return sum(contributions)
def lg(x):
    """Base-2 logarithm of `x`."""
    log_of_two = math.log(2)
    return math.log(x) / log_of_two
def compute_hash_frequencies(hash_function, frequencies):
    """Collapse word frequencies into hash-value frequencies.

    Returns a dict mapping each hash value produced by
    `hash_function` to the summed frequency of all the words in
    `frequencies` that hash to it.
    """
    # Group the words by their hash value.
    buckets = {}
    for word in frequencies.keys():
        buckets.setdefault(hash_function(word), []).append(word)
    # Sum each bucket's word frequencies.
    totals = {}
    for hash_value, words in buckets.items():
        totals[hash_value] = sum(frequencies[w] for w in words)
    return totals
def multimap(seq):
    """Collect an iterable of (key, value) pairs into a Multimap."""
    collected = Multimap()
    for key, value in seq:
        collected.add(key, value)
    return collected
class Multimap:
    """A mapping from each key to the list of values added under it.

    Unlike a plain dict, looking up an absent key yields an empty
    list instead of raising KeyError.
    """
    def __init__(self):
        self._items = {}
    def add(self, k, v):
        # setdefault creates the bucket on first use, then appends.
        self._items.setdefault(k, []).append(v)
    def keys(self):
        return self._items.keys()
    def __getitem__(self, key):
        # Absent keys yield an empty list rather than raising.
        return self._items.get(key, [])
def evaluate_hash_functions(functions, frequencies):
best_possible = total_bits(frequencies)
total_words = sum(frequencies.values())
for hash_function in functions:
symfreqs = compute_hash_frequencies(hash_function,
frequencies)
this_score = total_bits(symfreqs)
# The leading four spaces make the output
# valid Markdown.
format_str = " %5.2f bits/word (%.1f%%) %s"
print format_str % (this_score / total_words,
this_score / best_possible *100,
hash_function)
# Registry of all the hash functions to evaluate.
hashfuncs = []

def hashfunc(f):
    """Decorator: register `f` in `hashfuncs` and return it.

    (Using the bound method `hashfuncs.append` directly as the
    decorator also registers the function, but its None return value
    then rebinds every decorated name to None; returning `f` keeps
    the module-level names usable.)
    """
    hashfuncs.append(f)
    return f
@hashfunc
def first_letter(word):
    """Hash a word to its initial letter."""
    return word[0]
@hashfunc
def last_letter(word):
    """Hash a word to its final letter."""
    return word[-1]
@hashfunc
def one_letter_flat_hash(word):
    """A near-uniform one-letter hash: a ceiling for the one-letter hashes.

    Impractical to compute in one's head, but probably about as good
    as any single-letter hash function could get. It still falls
    short of -lg(1/26) = 4.7 bits per word, because a few words are
    individually more frequent than one in 26, so no 26-bucket hash
    can flatten the distribution completely.
    """
    return string.ascii_lowercase[hash(word) % 26]
@hashfunc
def first_and_last_letter(word):
    """Hash a word to its initial letter plus its final letter."""
    return word[0] + word[-1]
@hashfunc
def last_two_letters(word):
    """Hash a word to its final two letters (or the whole word if shorter)."""
    return word[-2:]
def first_and_nth_letter(n):
    """Return a hash function combining word[0] with the nth letter.

    `n` is 1-based from the front; a negative `n` counts from the end
    Python-style, so n == -1 means the last letter. Words too short
    to have an nth letter hash to just their first letter.

    Bug fix: the old slice `word[n-1:n]` was applied to negative `n`
    too, where n == -1 yields word[-2:-1] — the *second-to-last*
    letter — so first_and_nth_letter(-1) did not, as intended, match
    first_and_last_letter.
    """
    def rv(word):
        if n > 0:
            # word[n-1:n] is the nth letter if present, '' otherwise
            # (slicing, unlike indexing, never raises).
            return word[0] + word[n-1:n]
        # Negative n: the slice end must be omitted for n == -1,
        # since word[-1:0] would be empty.
        return word[0] + (word[n:] if n == -1 else word[n:n+1])
    rv.__name__ = 'first_and_nth_letter(%d)' % n
    return rv
# Indices 4, 5, and 6 don't work very well because too many words are
# shorter than 6 letters. Index 1 duplicates first_letter, and index
# -1 is meant to duplicate first_and_last_letter, so both serve as
# in-family baselines for the other indices.
for index in [1, 2, 3, 4, 5, 6, -1]:
    hashfunc(first_and_nth_letter(index))
@hashfunc
def two_letter_flat_hash(word):
    """A near-uniform hash over 26*27 buckets.

    See the remark on `one_letter_flat_hash`: impractical mentally,
    but an upper bound for the two-letter hash functions.
    """
    return hash(word) % (26*27)
@hashfunc
def first_letter_and_length(word):
    """Hash a word to the pair (initial letter, word length)."""
    return (word[0], len(word))
def read_wordlist(infile):
    """Parse lines of 'frequency word' into a {word: count} dict.

    `infile` is any iterable of whitespace-separated two-field lines;
    if a word appears on more than one line, the last line wins.
    """
    counts = {}
    for line in infile:
        freq, word = line.split()
        counts[word] = int(freq)
    return counts
def main():
frequencies = read_wordlist(file('wordlist'))
evaluate_hash_functions(hashfuncs, frequencies)
# Exclude English’s most common ten words.
stopwords = "the of and to a in it is was that".split()
print "\nWith stopwords %s removed:\n" % (stopwords,)
for word in stopwords:
del frequencies[word]
evaluate_hash_functions(hashfuncs, frequencies)
if __name__ == '__main__': main()