Hash Tables With Open Addressing
April 30, 2019
We start with a hashing function on strings, which are commonly used for symbol tables and other applications. We parameterize the function so it can be used for both hashes of the double hash algorithm:
(define (hash parm key)
(let loop ((ks (string->list key)) (h 0))
(if (null? ks) h
(loop (cdr ks) (+ (* h parm) (char->integer (car ks)))))))
We will use 31 and 37 as the parameters for the two hash functions. Thus:
> (hash 31 "hello") 99162322 > (hash 37 "hello") 200180656
A hash table is just a vector, with each item initially 'empty:
(define m 7) ; small for testing (define ht (make-vector m 'empty))
Now we look at linear probing:
(define (lookup ht eql? k)
(let loop ((h (modulo (hash 31 k) m)))
(let ((bucket (vector-ref ht h)))
(cond ((eq? bucket 'empty) (list))
((and (pair? bucket)
(eql? (car bucket) k)) bucket)
(else (loop (modulo (+ h 1) m)))))))
(define (insert! ht eql? k v)
(let loop ((h (modulo (hash 31 k) m)))
(let ((bucket (vector-ref ht h)))
(cond ((or (eq? bucket 'empty)
(eq? bucket 'deleted)
(eql? (car bucket) k))
(vector-set! ht h (cons k v)) ht)
(else (loop (modulo (+ h 1) m)))))))
(define (delete! ht eql? k)
(let loop ((h (modulo (hash 31 k) m)))
(let ((bucket (vector-ref ht h)))
(cond ((eq? bucket 'empty) ht)
((and (pair? bucket)
(eql? (car bucket) k))
(vector-set! ht h 'deleted) ht)
(else (loop (modulo (+ h 1) m)))))))
(define (enlist ht)
(let loop ((h 0) (xs (list)))
(if (= h m) xs
(let ((bucket (vector-ref ht h)))
(if (pair? bucket)
(loop (+ h 1) (cons bucket xs))
(loop (+ h 1) xs))))))
The code is straight forward but interesting. Notice how 'empty and 'delete appear or fail to appear in the various functions: lookup looks for 'empty or a key/value pair, insert! looks for 'empty, 'delete or a key/value pair, delete! looks for 'empty or a key/value pair, and enlist looks only for a key/value pair. Note also that insert! treats 'empty, 'deleted and a matching key/value pair as the same thing; all mark a place where the requested key/value pair is to appear.
Changing these functions to use double hashing is easy. The variable we have called h becomes h1, we add another call the hash to compute h2, which is the gap between successive probes, and the computation (+ h 1) becomes (+ h1 h2); the computation of h2 must be modified so it never returns zero. We won’t show the code here, but you can see it at https://ideone.com/6xFGsc. Here are some examples; they work the same way using either linear probing or double hashing:
> (set! ht (insert! ht string=? "a" 1))
> (set! ht (insert! ht string=? "c" 3))
> (set! ht (insert! ht string=? "e" 5))
> (set! ht (insert! ht string=? "f" 6))
> (set! ht (insert! ht string=? "g" 7))
> (set! ht (insert! ht string=? "h" 8))
> (length (enlist ht))
6
> (set! ht (delete! ht string=? "c"))
> (set! ht (delete! ht string=? "g"))
> (lookup ht string=? "a")
("a" . 1)
> (lookup ht string=? "c")
()
> ht
#(("h" . 8) deleted empty ("e" . 5) ("f" . 6) deleted
("a" . 1))
> (enlist ht)
(("a" . 1) ("f" . 6) ("e" . 5) ("h" . 8))
You can run the program at https://ideone.com/6xFGsc.
In general, double hashing is preferable to linear probing, which has the problem that multiple keys that hash to the same bucket become clustered; double hashing does a better job of spreading them out, so it permits a higher load factor.
In Python. The full code is on ideone.
from itertools import count, accumulate, chain, cycle from random import randrange, sample empty = ("_empty", "_empty", 0) deleted = ("_deleted", "_deleted", 0) def inc_linprob(hashval, M): """ increment for linear probing""" return 1 def inc_double_hashing(hashval, M): """ increment for double hashing - make sure it is in range [1, M-1]""" return hashval % (M-1) + 1 class dictlh(object): """ Store (key, value, hash(key)) """ def __init__(self, mode=inc_linprob, load_factor=0.5): self.M = 997 self.buckets = [empty] * self.M self.load_factor = load_factor self.max_load = load_factor * self.M self.n_elements = 0 self.n_deleted = 0 self._increment = mode self.searches = 0 def _find_insert_bucket(self, key): """ result: find the key OR find an empty bucket to insert """ hashval = hash(key) b, M = self.buckets, self.M inc = self._increment(hashval, M) h = hashval % M bh = b[h] while bh != empty and bh[0] != key: h = (h + inc) % M bh = b[h] self.searches += 1 return h def _insert(self, key, value, hashval=None): if hashval is None: hashval = hash(key) bi = self._find_insert_bucket(key) isempty = self.buckets[bi] == empty self.buckets[bi] = (key, value) if isempty: self.n_elements += 1 if self.n_elements > self.max_load: self._rehash() def _find_key(self, key): bi = self._find_insert_bucket(key) if self.buckets[bi][0] == key: return bi raise KeyError("key not found") def _lookup(self, key): h = self._find_key(key) return self.buckets[h][1] def _rehash(self): load = self.n_elements - self.n_deleted self.M = next_prime(4 * load) self.max_load = load_factor * self.M self.n_elements = self.n_deleted = 0 old_buckets = list(self.buckets) self.buckets = [empty] * self.M for item in old_buckets: if item not in (deleted, empty): self._insert(*item)