String Search: Rabin-Karp

September 1, 2009

Our function is quite straight forward:

(define (rk-search pat str . s)
  (define (hash s)
    (fold-left (lambda (x y) (+ (* 256 x) y)) 0
      (map char->integer (string->list s))))
  (let* ((q 1073741789)
         (plen (string-length pat))
         (slen (string-length str))
         (h (modulo (expt 256 (- plen 1)) q))
         (phash (modulo (hash pat) q))
         (s (if (null? s) 0 (car s)))
         (shash (modulo (hash (substring str s (+ s plen))) q)))
    (let loop ((s s) (shash shash))
      (cond ((and (= phash shash)
                  (string=? pat (substring str s (+ s plen)))) s)
            ((<= (- slen plen) s) #f)
            (else (loop (+ s 1)
                        (modulo (+ (* 256 (- shash (* h (char->integer (string-ref str s)))))
                                   (char->integer (string-ref str (+ s plen)))) q)))))))

Rabin-Karp is O(n) to search sequentially through the string, and O(m) to confirm each match. If each sub-string generates a false match, the total time will be O(mn), which is worse than the other string-searching algorithms. But Rabin-Karp excels when multiple strings are being sought; put all the hashes in a set or bloom filter, then look for them all in parallel (the details get a little tricky if the patterns can all have varying lengths).

Rk-search uses fold-left from the Standard Prelude. You can see the Rabin-Karp searcher in action, including a run of the standard test suite, at http://programmingpraxis.codepad.org/ybaO1twV.

Pages: 1 2

4 Responses to “String Search: Rabin-Karp”

  1. […] Praxis – String Search: Rabin-Karp By Remco Niemeijer Today’s Programming Praxis problem marks the end of the string search series. In this final entry, we have […]

  2. Remco Niemeijer said

    My Haskell solution (see http://bonsaicode.wordpress.com/2009/09/01/programming-praxis-string-search-rabin-karp/ for a version with comments):

    import Data.Char
    import Data.List
    
    asciis :: String -> [Integer]
    asciis = map (fromIntegral . ord)
    
    hash :: Num a => [a] -> a
    hash = sum . zipWith (*) (iterate (* 256) 1) . reverse
    
    hashes :: Int -> String -> [Integer]
    hashes p xs = scanl (\s (f,t) -> 256*s - 256^p*f + t) (hash $ take p ascs) .
                  zip ascs $ drop p ascs where ascs = asciis xs
    
    rabinKarp :: String -> Maybe Int -> String -> Maybe Int
    rabinKarp p s = lookup (hash $ asciis p) . drop (maybe 0 id s) .
                    flip zip [0..] . hashes (length p)
    
  3. Vikas Tandi said

    my implementation in C

    #include <string.h>
    
    static long double rolling_hash(long double hashval, long double val, char s, char e);
    static long double hash(char *s, int m);
    
    int string_index_4(char *s, int slen, char *p, int plen)
    {
    	int i;
    	long double hsub, hs, pow;
    
    	/* sanity check */
    	if(!s || !p || plen <= 0)
    		return -1;
    
    	for(i = 1, pow = 1; i <= (plen-1); i++)
    		pow = pow * 256;
    
    	/* calculate hash for sub-string and string */
    	hs = hash(s, plen);
    	hsub = hash(p, plen);
    
    	/* search sub-string */
    	for(i = 0; i <= (slen - plen); i++)
    	{
    		if(hs == hsub)
    			if(strncmp(&s[i], p, plen) == 0)
    				return i+1;
    
    		hs = rolling_hash(hs, pow, s[i], s[i+plen]);
    	}
    
    	return -1;
    }
    
    static long double hash(char *s, int m)
    {
    	int i;
    	long double hashval;
    	for(hashval = 0.0, i = 0; s[i] != '\0' && i < m; i++)
    		hashval = 256 * hashval + s[i];
    
    	return hashval;
    }
    
    static long double rolling_hash(long double hashval, long double val, char s, char e)
    {
    	hashval = hashval - s * val;
    	hashval = hashval * 256 + e;
    	return hashval;
    }
    

Leave a comment