#!/usr/bin/python
# -*- coding: utf-8 -*-
"""I read that cuckoo hashing with three hash functions can hit 91% full.
That’s absolutely astounding.  Is it true?

The idea is that a given hash table has three hash functions associated
with it and some set of buckets.  To see if a key is contained in it, you
hash it with the three functions to give you three buckets, and check
those three buckets.  If it is not in one of them, it is not in the
table, period.  There is no further iterative process.  Deletion,
similarly, requires just blanking a single entry.

When inserting into such a hash table, it may be the case that all three
possible locations for the newly inserted key are full.  In that case you
need to move one of them to an alternative location, which may involve
moving another item, and so on.  If this process turns out not to
terminate, you may have to rebuild the hash table with different
functions, which may not help.

By using three separate tables instead of a single table with three hash
functions, you avoid the possibility that a single key’s various hashes
will collide with each other, slightly improving performance.

Mitzenmacher’s ESA 2009 paper says that random-walk cuckoo hashing for
more than two alternatives is still not well understood, while
breadth-first search is expensive, but experimentally the threshold where
insertion and rehashing will eventually succeed may still be 91%.

The key-loss/duplication bug this file originally shipped with was in
insert(): the list of candidate buckets dropped the association between a
bucket index and WHICH subtable it belonged to whenever a slot was
skipped because it was being relocated, so evictions could read and write
the wrong subtable.  Candidates are now (table, bucket) pairs, and
relocation marks are unwound on failure instead of leaking into later
inserts and rehashes.
"""

import itertools  # retained for compatibility; zip() now covers its old uses
import random


class Cuck3:
    # A 3-way cuckoo hash table built from three subtables, each with its
    # own hash function.  Lookup and deletion probe at most three slots;
    # insertion may recursively evict colliding keys, and falls back to
    # rebuilding everything with fresh hash functions when an eviction
    # chain fails.

    def __init__(self, size):
        # size: the number of buckets in EACH of the three subtables.
        self.size = size
        # Slots (table, bucket) currently being vacated by an in-progress
        # eviction chain; insert() must not read or reuse them, or the
        # evicted key would simply "find itself" again.
        self.relocating = set()
        self.rehash([], [])

    def rehash(self, keys, values, tries=16):
        # Rebuild all three subtables with fresh random hash functions and
        # reinsert every (key, value) pair.  None keys -- blank slots from
        # a flattened table -- are skipped.  Raises CouldntRehash after
        # `tries` consecutive failed attempts.
        print("rehashing %s" % (tries,))
        if tries == 0:
            raise CouldntRehash(len(keys), self.size)
        # Fix: discard relocation marks left over from a failed insert().
        # Stale marks made insert() skip valid slots of the *new* tables,
        # which lost and duplicated keys right after a rehash.
        self.relocating = set()
        self._keys = [[None for ii in range(self.size)] for jj in range(3)]
        self._values = [[None for ii in range(self.size)] for jj in range(3)]
        self.hashes = [random_hash(self.size) for jj in range(3)]
        for k, v in zip(keys, values):
            if k is None:
                continue  # blank slot from a flattened table
            if not self.insert(k, v):
                return self.rehash(keys, values, tries - 1)
        # Verify that rehashing left the table correct: every stored key
        # sits at its own hash position within its subtable.
        for h, kk in zip(self.hashes, self._keys):
            for ii, ki in enumerate(kk):
                if ki is not None:
                    assert h(ki) == ii, (ii, ki, h(ki))

    def items(self):
        # Yield (key, value) pairs in subtable-then-bucket order.
        for kk, vv in zip(self._keys, self._values):
            for ki, vi in zip(kk, vv):
                if ki is not None:
                    yield ki, vi

    def keys(self):
        return (ki for ki, vi in self.items())

    def values(self):
        return (vi for ki, vi in self.items())

    def __getitem__(self, k):
        # Probe the three candidate slots; absent from all three means
        # absent from the table, with no further searching.
        for ii, h in enumerate(self.hashes):
            index = h(k)
            if self._keys[ii][index] == k:
                return self._values[ii][index]
        raise KeyError(k)

    def __setitem__(self, k, v, tries=16):
        # Insert or update, rehashing up to `tries` times if eviction
        # chains keep failing.  If even rehashing fails, the previous
        # table state is restored before CouldntRehash propagates.
        orig = self._keys, self._values, self.hashes
        # key=repr so the invariant check also works with mixed key types
        # (e.g. ints and strings), which cannot be compared directly.
        orig_keys = sorted(self.keys(), key=repr)
        try:
            while tries > 0 and not self.insert(k, v):
                self.rehash([ki for ks in self._keys for ki in ks],
                            [vi for vs in self._values for vi in vs])
                tries -= 1
        except CouldntRehash:
            self._keys, self._values, self.hashes = orig
            raise
        # Invariant: a successful set adds exactly the new key (if it was
        # absent) and never perturbs the others.
        if k in orig_keys:
            assert orig_keys == sorted(self.keys(), key=repr)
        else:
            assert (sorted(orig_keys + [k], key=repr)
                    == sorted(self.keys(), key=repr))

    def insert(self, k, v, tries=128):
        "Try to insert without rehashing, returning True on success."
        if tries == 0:
            return False  # Eviction chain too deep; give up.
        candidates = []   # (table, bucket) slots where k may be stored
        # Key already present?
        for ii, h in enumerate(self.hashes):
            index = h(k)
            if (ii, index) in self.relocating:
                continue  # that slot is being vacated right now
            if self._keys[ii][index] == k:
                self._values[ii][index] = v
                return True
            candidates.append((ii, index))
        # Blank spot?
        for ii, index in candidates:
            if self._keys[ii][index] is None:
                self._keys[ii][index] = k
                self._values[ii][index] = v
                return True
        if not candidates:
            # Every candidate slot is already being vacated: the eviction
            # chain has looped back onto itself, so we're out.
            return False
        # Evict a randomly chosen colliding entry to make room.  Fix:
        # candidates holds (table, bucket) PAIRS.  The original kept bare
        # bucket indices, so once any slot had been skipped above, list
        # position no longer matched the subtable number and the eviction
        # read and wrote the wrong subtable -- losing and duplicating keys
        # precisely when the table was nearly full.
        victim, vi = candidates[random.randrange(len(candidates))]
        self.relocating.add((victim, vi))
        vk = self._keys[victim][vi]
        vv = self._values[victim][vi]
        if self.insert(vk, vv, tries=tries - 1):
            # The victim now lives elsewhere; claim its old slot for k.
            self._keys[victim][vi] = k
            self._values[victim][vi] = v
            self.relocating.discard((victim, vi))
            return True
        # Fix: unwind only our own mark (the original cleared the whole
        # set on success and leaked every mark on failure).
        self.relocating.discard((victim, vi))
        return False


def random_hash(table_size):
    """Generate a random hash function for table_size buckets.

    The family of hash functions used here is not super awesome but
    probably adequate.
    """
    while True:
        # *4 as a minimum to avoid cases where the first few buckets
        # are 50% or 100% more likely than the last few.  The reason
        # for GCD isn’t well thought out.
        n = random.randrange(table_size * 4 + 1, 2 ** 31)
        if gcd(n, table_size) == 1:
            return lambda k: hash(k) * n % table_size


def gcd(a, b):
    # Euclid's algorithm.
    while b != 0:
        a, b = b, a % b
    return a


class CouldntRehash(Exception):
    # Raised when repeated attempts to rebuild the table all fail.
    pass


def main():
    x = Cuck3(25)
    x['hi'] = 'bye'
    for ii in range(5):
        x[ii] = ii ** 3
        for jj in range(ii + 1):
            assert x[jj] == jj ** 3
    print(x['hi'])
    assert x['hi'] == 'bye'
    print(x[3])
    print(x._keys)
    try:
        # Overfill on purpose: 1000 keys can never fit in 75 buckets, so
        # this stress loop is expected to end in CouldntRehash.
        for ii in range(1000):
            x[ii] = ii ** 3
            for jj in range(ii + 1):
                print("put %s checking %s" % (ii, jj))
                assert x[jj] == jj ** 3
            print(x._keys)
            print(x[3])
    except CouldntRehash:
        print("CouldntRehash %s" % (x._keys,))
        kk = sorted(x.keys())
        print("%s %s" % (kk, len(kk)))
        print(x[3])


if __name__ == '__main__':
    main()