String Search: Knuth-Morris-Pratt

August 25, 2009

Our kmp-search function works in two phases; the first loop pre-computes the skip vector, the second loop scans across the search string:

(define (kmp-search pat str . s)
  (let* ((plen (string-length pat))
         (slen (string-length str))
         (skip (make-vector plen 0)))
    (let loop ((i 1) (j 0))
      (cond ((= i plen))
            ((char=? (string-ref pat i) (string-ref pat j))
              (vector-set! skip i (+ j 1))
              (loop (+ i 1) (+ j 1)))
            ((< 0 j) (loop i (vector-ref skip (- j 1))))
            (else (vector-set! skip i 0)
                  (loop (+ i 1) j))))
    (let loop ((p 0) (s (if (null? s) 0 (car s))))
      (cond ((= s slen) #f)
            ((char=? (string-ref pat p) (string-ref str s))
              (if (= p (- plen 1))
                  (- s plen -1)
                  (loop (+ p 1) (+ s 1))))
            ((< 0 p) (loop (vector-ref skip (- p 1)) s))
            (else (loop p (+ s 1)))))))

If m is the number of characters in the pattern, and n is the number of characters in the search string, brute-force search takes O(mn) time, but Knuth-Morris-Pratt search takes O(m) time to build the skip array and O(n) time to scan the search string for a total time of O(m+n), which is a win for all but the smallest strings. That’s worst case; depending on the number of characters skipped, the search phase can be sub-linear. For general-purpose string searching, Knuth-Morris-Pratt is the algorithm of choice, and we add it to the Standard Prelude under the name string-find.

You can see the string searcher in action at http://programmingpraxis.codepad.org/sgwL8jFb, including its application to the test suite of the previous exercise.

Advertisement

Pages: 1 2

4 Responses to “String Search: Knuth-Morris-Pratt”

  1. Connochaetes said

    I remember this from college, I think the approach I took here is due to Sedgewick if my memory serves me. The initial -1 in the skip array is for the null-prefix of the pattern.

    Ruby:
    [sourcecode lang='ruby']
    def preprocess(pattern)
    skip = [-1]
    i, j = 0, -1
    while i = 0) and (pattern[i] != pattern[j])
    j = skip[j]
    end
    i, j = i+1, j+1
    skip[i] = j
    end
    return skip
    end

    def kmp_search(str, pattern)
    i = 0; j = 0
    skip = preprocess(pattern)
    while i =0) & (str[i] != pattern[j])
    j = skip[j]
    end
    i,j = i+1, j+1
    return i-j if j == pattern.length
    end
    return nil
    end
    [sourcecode]

  2. Connochaetes said

    I remember this from college, I think the approach I took here is due to Sedgewick if my memory serves me. The initial -1 in the skip array is for the null-prefix of the pattern.

    This is the correcty formatted version, please kill the previous post :)

    Ruby:

    def preprocess(pattern)
    skip = [-1]
    i, j = 0, -1
    while i = 0) and (pattern[i] != pattern[j])
    j = skip[j]
    end
    i, j = i+1, j+1
    skip[i] = j
    end
    return skip
    end
    
    def kmp_search(str, pattern)
    i = 0; j = 0
    skip = preprocess(pattern)
    while i =0) & (str[i] != pattern[j])
    j = skip[j]
    end
    i,j = i+1, j+1
    return i-j if j == pattern.length
    end
    return nil
    end
    
  3. Connochaetes said

    I lost tabs – oh well, I give up :) Time for bed.

  4. Vikas Tandi said

    My implementation in C

    #include "interface.h"
    #include <stdlib.h>
    #include <string.h>
    
    static void build_table(char *p, int *table);
    
    int string_index_2(char *s, char *p)
    {
    	int m, i;
    	int *table;
    
    	table = (int*)malloc(sizeof(int)*strlen(p));
    	if(table == NULL)
    		return -1;
    
    	/* precompute the skip table */
    	build_table(p, table);
    
    	/* search the pattern in string using skip table */
    	for(m = 0, i = 0; s[m+i] != '\0';)
    	{
    		for(; p[i] != '\0' && p[i] == s[m+i]; i++);
    		if(p[i] == '\0')
    		{
    			free(table);
    			return m+1;
    		}
    
    		/* set next values of m and i using skip table */
    		m = m + i - table[i];
    		if(table[i] < 0)
    			i = 0;
    		else
    			i = table[i];
    	}
    	free(table);
    	return -1;
    }
    
    static void build_table(char *p, int *table)
    {
    	int i, j;
    
    	table[0] = -1;
    	if(p[1] == '\0')
    		return;
    
    	table[1] = 0;
    
    	for(i = 0, j = 2; p[j] != '\0';)
    	{
    		if(p[i] == p[j-1])				/* the substring continues */
    			table[j++] = ++i;
    		else if(i > 0)					/* doesn't match but we can fallback */
    			i = table[i];
    		else
    			table[j++] = 0;
    	}
    }
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: