Pearson Hashing
May 25, 2018
We begin by defining a permutation table, using functions from the Standard Prelude:
(define t (list->vector (shuffle (range 256))))
Then it is easy to compute the 8-bit Pearson hash of a string; we use addition mod 256 rather than xor:
(define (pearson8 str)
(fold-left (lambda (n h) (vector-ref t (modulo (+ n h) 256)))
0 (map char->integer (string->list str))))
Here’s an example:
> (pearson8 "Programming Praxis") 213
And here is the 16-bit version of the algorithm:
(define (pearson16 str)
(let* ((cs1 (map char->integer (string->list str)))
(cs2 (cons (modulo (+ (car cs1) 1) 256) (cdr cs1))))
(let loop ((h1 0) (h2 0) (cs1 cs1) (cs2 cs2))
(if (null? cs1) (+ (* h1 256) h2)
(let ((h1 (vector-ref t (modulo (+ (car cs1) h1) 256)))
(h2 (vector-ref t (modulo (+ (car cs2) h2) 256))))
(loop h1 h2 (cdr cs1) (cdr cs2)))))))
> (pearson16 "Programming Praxis") 54602
You can run the program at https://ideone.com/hi4xWM.
Here is a solution in standard R7RS Scheme that uses the table from
the paper and provides 8 variants.
A curiosity: Using addition instead of xor produces collisions for the
sample strings in the 8-bit version. Replacing the original table
with a couple of randomly generated ones that I tried gets rid of the
collisions. I could also reproduce the collisions using
@programmingpraxis’s implementation (with the fixed table from the
paper).
(import (scheme base) (scheme write) (scheme file) (srfi 8) (only (srfi 151) bitwise-xor)) ;; Table I from Pearson's paper (p. 678) (define pearson-table #(1 87 49 12 176 178 102 166 121 193 6 84 249 230 44 163 14 197 213 181 161 85 218 80 64 239 24 226 236 142 38 200 110 177 104 103 141 253 255 50 77 101 81 18 45 96 31 222 25 107 190 70 86 237 240 34 72 242 20 214 244 227 149 235 97 234 57 22 60 250 82 175 208 5 127 199 111 62 135 248 174 169 211 58 66 154 106 195 245 171 17 187 182 179 0 243 132 56 148 75 128 133 158 100 130 126 91 13 153 246 216 219 119 68 223 78 83 88 201 99 122 11 92 32 136 114 52 10 138 30 48 183 156 35 61 26 143 74 251 94 129 162 63 152 170 7 115 167 241 206 3 150 55 59 151 220 90 53 23 131 125 173 15 238 79 95 89 16 105 137 225 224 217 160 37 123 118 73 2 157 46 116 9 145 134 228 207 212 202 215 69 229 27 188 67 124 168 252 42 4 29 108 21 247 19 205 39 203 233 40 186 147 198 192 155 33 164 191 98 204 165 180 117 76 140 36 210 172 41 54 159 8 185 232 113 196 231 47 146 120 51 65 28 144 254 221 93 189 194 139 112 43 71 109 184 209)) ;; Naively map a Unicode character to the range 0..255. ;; (map char-hash-byte (string->list "Hello, दुनिया!")) (define (char-hash-byte c) (let loop ((n (char->integer c))) (receive (q r) (floor/ n 256) (if (zero? q) r (loop (bitwise-xor r (loop q))))))) (define (make-pearson-hash table char-num-proc mix-proc) (lambda (str) (let ((h 0)) (string-for-each (lambda (c) (set! h (vector-ref table (mix-proc h (char-num-proc c))))) str) h))) (define sample-strings '("Hello" "Hello," "Hell,o" "Hel,lo" "Hello, World!" "Hell,o World!" "Hel,lo World!" "Hello, दुनिया!" "Hell,o दुनिया!" "Hel,lo दुनिया!")) (define (demo-hash hproc) (display (map hproc sample-strings)) (newline)) (demo-hash (make-pearson-hash pearson-table char->integer (lambda (a b) (modulo (+ a b) 256)))) (demo-hash (make-pearson-hash pearson-table char->integer (lambda (a b) (modulo (bitwise-xor a b) 256)))) (demo-hash (make-pearson-hash pearson-table char-hash-byte (lambda (a b) (modulo (+ a b) 256)))) (demo-hash (make-pearson-hash pearson-table char-hash-byte bitwise-xor)) (define (make-pearson-hash-2 table char-num-proc mix-proc) (let ((phash (make-pearson-hash table char-num-proc mix-proc)) (tsize (vector-length table))) (lambda (str) (if (string=? "" str) (phash str) (+ (* tsize (phash str)) (phash (string-append (string (integer->char (modulo (+ 1 (char-num-proc (string-ref str 0))) tsize))) (string-copy str 1)))))))) (demo-hash (make-pearson-hash-2 pearson-table char->integer (lambda (a b) (modulo (+ a b) 256)))) (demo-hash (make-pearson-hash-2 pearson-table char->integer (lambda (a b) (modulo (bitwise-xor a b) 256)))) (demo-hash (make-pearson-hash-2 pearson-table char-hash-byte (lambda (a b) (modulo (+ a b) 256)))) (demo-hash (make-pearson-hash-2 pearson-table char-hash-byte bitwise-xor))Here’s a solution in C.
#include <stdint.h> #include <stdio.h> #include <stdlib.h> static uint8_t T[] = { 1,87,49,12,176,178,102,166,121,193,6,84,249,230,44,163, 14,197,213,181,161,85,218,80,64,239,24,226,236,142,38,200, 110,177,104,103,141,253,255,50,77,101,81,18,45,96,31,222, 25,107,190,70,86,237,240,34,72,242,20,214,244,227,149,235, 97,234,57,22,60,250,82,175,208,5,127,199,111,62,135,248, 174,169,211,58,66,154,106,195,245,171,17,187,182,179,0,243, 132,56,148,75,128,133,158,100,130,126,91,13,153,246,216,219, 119,68,223,78,83,88,201,99,122,11,92,32,136,114,52,10, 138,30,48,183,156,35,61,26,143,74,251,94,129,162,63,152, 170,7,115,167,241,206,3,150,55,59,151,220,90,53,23,131, 125,173,15,238,79,95,89,16,105,137,225,224,217,160,37,123, 118,73,2,157,46,116,9,145,134,228,207,212,202,215,69,229, 27,188,67,124,168,252,42,4,29,108,21,247,19,205,39,203, 233,40,186,147,198,192,155,33,164,191,98,204,165,180,117,76, 140,36,210,172,41,54,159,8,185,232,113,196,231,47,146,120, 51,65,28,144,254,221,93,189,194,139,112,43,71,109,184,209 }; uint8_t pearson1(char* str) { uint8_t h = 0; while (*str) { h = T[h ^ *str]; ++str; } return h; } uint16_t pearson2(char* str) { uint8_t h1 = pearson1(str); uint8_t h2 = 0; if (*str) { h2 = T[h2 ^ (*str + 1)]; ++str; } while (*str) { h2 = T[h2 ^ *str]; ++str; } uint16_t h = h1; h <<= 8; h += h2; return h; } int main(int argc, char* argv[]) { if (argc != 2) { fprintf(stderr, "Usage: %s <STR>\n", argv[0]); return EXIT_FAILURE; } printf("%d\n", pearson1(argv[1])); printf("%d\n", pearson2(argv[1])); return EXIT_SUCCESS; }Example:
Solution in Ruby.
def pearson_hash(message, table, size_in_bytes) size_in_bytes.times.reduce("") do |memo, i| hash = 0 message[0] = ((message[0].ord + i) % 256).chr message.chars do |char| hash = table[(hash + char.ord) % 256] end memo += hash.to_s end end # use `(0..255).to_a.shuffle` to generate a new one table = [ 68, 6, 218, 45, 29, 132, 105, 22, 192, 155, 146, 66, 226, 231, 123, 25, 212, 64, 156, 52, 71, 220, 77, 194, 178, 36, 34, 153, 181, 130, 99, 39, 108, 215, 92, 119, 55, 216, 175, 72, 168, 169, 136, 185, 57, 53, 1, 233, 125, 237, 78, 184, 48, 61, 201, 161, 60, 217, 124, 173, 32, 15, 176, 172, 70, 9, 129, 100, 43, 142, 222, 207, 158, 189, 206, 82, 163, 44, 2, 17, 177, 20, 102, 174, 190, 126, 122, 109, 144, 252, 58, 121, 160, 127, 46, 221, 73, 112, 250, 4, 38, 18, 135, 209, 133, 69, 63, 152, 151, 115, 191, 110, 243, 230, 85, 167, 148, 246, 203, 74, 128, 238, 150, 204, 79, 81, 84, 180, 154, 104, 199, 208, 164, 182, 30, 143, 14, 241, 80, 42, 223, 249, 219, 3, 227, 239, 254, 186, 171, 232, 145, 98, 107, 157, 106, 75, 51, 251, 228, 26, 200, 89, 183, 83, 193, 210, 255, 67, 37, 211, 111, 59, 7, 35, 101, 47, 236, 234, 224, 137, 138, 159, 5, 134, 91, 13, 118, 139, 165, 240, 56, 149, 49, 33, 23, 86, 116, 179, 96, 16, 27, 21, 88, 93, 87, 90, 19, 54, 113, 50, 28, 120, 202, 62, 188, 245, 198, 195, 117, 94, 24, 170, 205, 244, 141, 0, 225, 103, 95, 214, 229, 11, 65, 97, 31, 248, 76, 114, 147, 41, 242, 196, 197, 235, 12, 247, 253, 140, 8, 10, 40, 162, 131, 213, 166, 187 ] # Test puts pearson_hash('Hola', table, 1) puts pearson_hash('Programming Praxis', table, 1) puts pearson_hash('Programming Praxis', table, 4)Outputs:
25
165
165121139220
Here are updated functions from my program above. I added casts to unsigned 8-bit ints for memory safety, to prevent reading outside array T’s boundary.
uint8_t pearson1(char* str) { uint8_t h = 0; while (*str) { h = T[h ^ (uint8_t)*str]; ++str; } return h; } uint16_t pearson2(char* str) { uint8_t h1 = pearson1(str); uint8_t h2 = 0; if (*str) { h2 = T[h2 ^ (uint8_t)((uint8_t)*str + 1)]; ++str; } while (*str) { h2 = T[h2 ^ (uint8_t)*str]; ++str; } uint16_t h = h1; h <<= 8; h += h2; return h; }@V, in your example that sets size_in_bytes=4, the result, 165121139220, is represented with 38 bits (more than 32 bits = 4 bytes).
Here’s a modification that uses bit-shifting, instead of concatenating strings between “0” and “255”.
def pearson_hash(message, table, size_in_bytes) output = 0 size_in_bytes.times do |i| hash = 0 message.each_char.with_index do |char, j| hash = table[(hash + char.ord + (j == 0 ? i : 0)) % 256] end output = (output << 8) + hash end output endI also modified the function so that the first letter of the message is non-destructively incremented by the index of each iteration (a opposed to being incremented by 0, 1, 3, then 6, over four iterations).
[…] the previous exercise we studied a simple cryptographic hash. Today we will use that hash to write an equally-simple […]
Bit surprised we haven’t seen a Python one yet.
def pearson_hash(message, byte_length=1, table=None, hash_type=1): # message is expected as bytes, bytearray or a utf-8 string if not table: table = [ 1,87,49,12,176,178,102,166,121,193,6,84,249,230,44,163, 14,197,213,181,161,85,218,80,64,239,24,226,236,142,38,200, 110,177,104,103,141,253,255,50,77,101,81,18,45,96,31,222, 25,107,190,70,86,237,240,34,72,242,20,214,244,227,149,235, 97,234,57,22,60,250,82,175,208,5,127,199,111,62,135,248, 174,169,211,58,66,154,106,195,245,171,17,187,182,179,0,243, 132,56,148,75,128,133,158,100,130,126,91,13,153,246,216,219, 119,68,223,78,83,88,201,99,122,11,92,32,136,114,52,10, 138,30,48,183,156,35,61,26,143,74,251,94,129,162,63,152, 170,7,115,167,241,206,3,150,55,59,151,220,90,53,23,131, 125,173,15,238,79,95,89,16,105,137,225,224,217,160,37,123, 118,73,2,157,46,116,9,145,134,228,207,212,202,215,69,229, 27,188,67,124,168,252,42,4,29,108,21,247,19,205,39,203, 233,40,186,147,198,192,155,33,164,191,98,204,165,180,117,76, 140,36,210,172,41,54,159,8,185,232,113,196,231,47,146,120, 51,65,28,144,254,221,93,189,194,139,112,43,71,109,184,209 ] try: message = bytearray(message) except TypeError: message = bytearray(message, encoding='utf8') output = 0 for cycle in range(byte_length): h = 0 message[0] = (message[0]+cycle) % 256 for byte in message: if hash_type == 1: h = table[(h+byte) % 256] elif hash_type == 2: h = table[h ^ byte] else: raise ValueError('Unknown hash type - try 1 or 2') output = (output << 8) + h return output