#!/usr/bin/python
# -*- coding: utf-8 -*-
"""I read that cuckoo hashing with three hash functions can hit 91% full.
That’s absolutely astounding.  Is it true?

The idea is that a given hash table has three hash functions associated
with it and some set of buckets.  To see if a key is contained in it, you
hash it with the three functions to give you three buckets, and check
those three buckets.  If it is not in one of them, it is not in the
table, period.  There is no further iterative process.  Deletion,
similarly, requires just blanking a single entry.

When inserting into such a hash table, it may be the case that all three
possible locations for the newly inserted key are full.  In that case you
need to move one of them to an alternative location, which may involve
moving another item, and so on.  If this process turns out not to
terminate, you may have to rebuild the hash table with different
functions, which may not help.

By using three separate tables instead of a single table with three hash
functions, you avoid the possibility that a single key’s various hashes
will collide with each other, slightly improving performance.

Mitzenmacher’s ESA 2009 paper says that random-walk cuckoo hashing for
more than two alternatives is still not well understood, while
breadth-first search is expensive, but experimentally the threshold where
insertion and rehashing will eventually succeed may still be 91%.

The key-loss/duplication bug this file originally shipped with was in
insert(): the list of candidate buckets dropped the association between a
bucket index and WHICH subtable it belonged to whenever a slot was
skipped because it was being relocated, so evictions could read and write
the wrong subtable.  Candidates are now (table, bucket) pairs, and
relocation marks are unwound on failure instead of leaking into later
inserts and rehashes.
"""

import itertools  # retained for compatibility; zip() now covers its old uses
import random


class Cuck3:
    # A 3-way cuckoo hash table built from three subtables, each with its
    # own hash function.  Lookup and deletion probe at most three slots;
    # insertion may recursively evict colliding keys, and falls back to
    # rebuilding everything with fresh hash functions when an eviction
    # chain fails.

    def __init__(self, size):
        # size: the number of buckets in EACH of the three subtables.
        self.size = size
        # Slots (table, bucket) currently being vacated by an in-progress
        # eviction chain; insert() must not read or reuse them, or the
        # evicted key would simply "find itself" again.
        self.relocating = set()
        self.rehash([], [])

    def rehash(self, keys, values, tries=16):
        # Rebuild all three subtables with fresh random hash functions and
        # reinsert every (key, value) pair.  None keys -- blank slots from
        # a flattened table -- are skipped.  Raises CouldntRehash after
        # `tries` consecutive failed attempts.
        print("rehashing %s" % (tries,))
        if tries == 0:
            raise CouldntRehash(len(keys), self.size)
        # Fix: discard relocation marks left over from a failed insert().
        # Stale marks made insert() skip valid slots of the *new* tables,
        # which lost and duplicated keys right after a rehash.
        self.relocating = set()
        self._keys = [[None for ii in range(self.size)] for jj in range(3)]
        self._values = [[None for ii in range(self.size)] for jj in range(3)]
        self.hashes = [random_hash(self.size) for jj in range(3)]
        for k, v in zip(keys, values):
            if k is None:
                continue  # blank slot from a flattened table
            if not self.insert(k, v):
                return self.rehash(keys, values, tries - 1)
        # Verify that rehashing left the table correct: every stored key
        # sits at its own hash position within its subtable.
        for h, kk in zip(self.hashes, self._keys):
            for ii, ki in enumerate(kk):
                if ki is not None:
                    assert h(ki) == ii, (ii, ki, h(ki))

    def items(self):
        # Yield (key, value) pairs in subtable-then-bucket order.
        for kk, vv in zip(self._keys, self._values):
            for ki, vi in zip(kk, vv):
                if ki is not None:
                    yield ki, vi

    def keys(self):
        return (ki for ki, vi in self.items())

    def values(self):
        return (vi for ki, vi in self.items())

    def __getitem__(self, k):
        # Probe the three candidate slots; absent from all three means
        # absent from the table, with no further searching.
        for ii, h in enumerate(self.hashes):
            index = h(k)
            if self._keys[ii][index] == k:
                return self._values[ii][index]
        raise KeyError(k)

    def __setitem__(self, k, v, tries=16):
        # Insert or update, rehashing up to `tries` times if eviction
        # chains keep failing.  If even rehashing fails, the previous
        # table state is restored before CouldntRehash propagates.
        orig = self._keys, self._values, self.hashes
        # key=repr so the invariant check also works with mixed key types
        # (e.g. ints and strings), which cannot be compared directly.
        orig_keys = sorted(self.keys(), key=repr)
        try:
            while tries > 0 and not self.insert(k, v):
                self.rehash([ki for ks in self._keys for ki in ks],
                            [vi for vs in self._values for vi in vs])
                tries -= 1
        except CouldntRehash:
            self._keys, self._values, self.hashes = orig
            raise
        # Invariant: a successful set adds exactly the new key (if it was
        # absent) and never perturbs the others.
        if k in orig_keys:
            assert orig_keys == sorted(self.keys(), key=repr)
        else:
            assert (sorted(orig_keys + [k], key=repr)
                    == sorted(self.keys(), key=repr))

    def insert(self, k, v, tries=128):
        "Try to insert without rehashing, returning True on success."
        if tries == 0:
            return False  # Eviction chain too deep; give up.
        candidates = []   # (table, bucket) slots where k may be stored
        # Key already present?
        for ii, h in enumerate(self.hashes):
            index = h(k)
            if (ii, index) in self.relocating:
                continue  # that slot is being vacated right now
            if self._keys[ii][index] == k:
                self._values[ii][index] = v
                return True
            candidates.append((ii, index))
        # Blank spot?
        for ii, index in candidates:
            if self._keys[ii][index] is None:
                self._keys[ii][index] = k
                self._values[ii][index] = v
                return True
        if not candidates:
            # Every candidate slot is already being vacated: the eviction
            # chain has looped back onto itself, so we're out.
            return False
        # Evict a randomly chosen colliding entry to make room.  Fix:
        # candidates holds (table, bucket) PAIRS.  The original kept bare
        # bucket indices, so once any slot had been skipped above, list
        # position no longer matched the subtable number and the eviction
        # read and wrote the wrong subtable -- losing and duplicating keys
        # precisely when the table was nearly full.
        victim, vi = candidates[random.randrange(len(candidates))]
        self.relocating.add((victim, vi))
        vk = self._keys[victim][vi]
        vv = self._values[victim][vi]
        if self.insert(vk, vv, tries=tries - 1):
            # The victim now lives elsewhere; claim its old slot for k.
            self._keys[victim][vi] = k
            self._values[victim][vi] = v
            self.relocating.discard((victim, vi))
            return True
        # Fix: unwind only our own mark (the original cleared the whole
        # set on success and leaked every mark on failure).
        self.relocating.discard((victim, vi))
        return False


def random_hash(table_size):
    """Generate a random hash function for table_size buckets.

    The family of hash functions used here is not super awesome but
    probably adequate.
    """
    while True:
        # *4 as a minimum to avoid cases where the first few buckets
        # are 50% or 100% more likely than the last few.  The reason
        # for GCD isn’t well thought out.
        n = random.randrange(table_size * 4 + 1, 2 ** 31)
        if gcd(n, table_size) == 1:
            return lambda k: hash(k) * n % table_size


def gcd(a, b):
    # Euclid's algorithm.
    while b != 0:
        a, b = b, a % b
    return a


class CouldntRehash(Exception):
    # Raised when repeated attempts to rebuild the table all fail.
    pass


def main():
    x = Cuck3(25)
    x['hi'] = 'bye'
    for ii in range(5):
        x[ii] = ii ** 3
        for jj in range(ii + 1):
            assert x[jj] == jj ** 3
    print(x['hi'])
    assert x['hi'] == 'bye'
    print(x[3])
    print(x._keys)
    try:
        # Overfill on purpose: 1000 keys can never fit in 75 buckets, so
        # this stress loop is expected to end in CouldntRehash.
        for ii in range(1000):
            x[ii] = ii ** 3
            for jj in range(ii + 1):
                print("put %s checking %s" % (ii, jj))
                assert x[jj] == jj ** 3
            print(x._keys)
            print(x[3])
    except CouldntRehash:
        print("CouldntRehash %s" % (x._keys,))
        kk = sorted(x.keys())
        print("%s %s" % (kk, len(kk)))
        print(x[3])


if __name__ == '__main__':
    main()