Levenshtein Distance

September 12, 2014

The recursive calculation is straight forward:

(define (exponential-levenshtein x y)
  (let loop ((xs (string->list x)) (ys (string->list y)))
    (cond ((null? xs) (length ys))
          ((null? ys) (length xs))
          ((char=? (car xs) (car ys))
            (loop (cdr xs) (cdr ys)))
          (else (+ 1 (min (loop (cdr xs) ys)
                          (loop xs (cdr ys))
                          (loop (cdr xs) (cdr ys))))))))

The first two cond clauses are the recursive bases. The third clause recurs, adding nothing to the Levenshtein distance, when two characters are identical. The fourth clause computes the cost of deleting from the first string, deleting from the second string, and deleting from both strings, then takes the minimum. Here are two examples:

> (time (exponential-levenshtein "persimmon" "cantaloupe"))
(time (exponential-levenshtein "persimmon" ...))
    3 collections
    359 ms elapsed cpu time, including 0 ms collecting
    346 ms elapsed real time, including 0 ms collecting
    21126272 bytes allocated, including 25252720 bytes reclaimed
10
> (time (exponential-levenshtein "persimmons" "cantaloupes"))
(time (exponential-levenshtein "persimmons" ...))
    9 collections
    1560 ms elapsed cpu time, including 15 ms collecting
    1557 ms elapsed real time, including 2 ms collecting
    75050576 bytes allocated, including 75850032 bytes reclaimed
10

We can clearly see the exponential time complexity of the algorithm: adding a single character quadruples the time required for the computation. The dynamic programming approach, by comparison, is O(mn) where m and n are the lengths of the two strings:

(define (dynamic-programming-levenshtein x y)
  (let* ((x-len (string-length x))
         (y-len (string-length y))
         (costs (make-matrix (+ x-len 1) (+ y-len 1) 0)))
    (do ((i 0 (+ i 1))) ((< x-len i))
      (do ((j 0 (+ j 1))) ((< y-len j))
        (matrix-set! costs i j (+ i j))))
    (do ((i 1 (+ i 1))) ((< x-len i))
      (do ((j 1 (+ j 1))) ((< y-len j))
        (let ((add-cost (+ (matrix-ref costs (- i 1) j) 1))
              (del-cost (+ (matrix-ref costs i (- j 1)) 1))
              (sub-cost (+ (matrix-ref costs (- i 1) (- j 1))
                           (if (char=? (string-ref x (- i 1))
                                       (string-ref y (- j 1)))
                               0 1))))
          (matrix-set! costs i j
            (min add-cost del-cost sub-cost)))))
    (matrix-ref costs x-len y-len)))

Here we build up an m by n matrix containing the Levenshtein distances between the initial m and n characters of the two input strings; since the substring distances are already known when they are needed for the calculation, there is no repeated calculation, and the Levenshtein distance is quickly computed. This algorithm is much faster than the first, and grows much more slowly:

> (time (dynamic-programming-levenshtein "persimmon" "cantaloupe"))
(time (dynamic-programming-levenshtein "persimmon" ...))
    no collections
    0 ms elapsed cpu time
    0 ms elapsed real time
    25824 bytes allocated
10
> (time (dynamic-programming-levenshtein "persimmons" "cantaloupes"))
(time (dynamic-programming-levenshtein "persimmons" ...))
    no collections
    0 ms elapsed cpu time
    0 ms elapsed real time
    30800 bytes allocated
10

This algorithm requires O(mn) space. That can be reduced to O(2m) by storing only the current row and prior row as the matrix is being built, but we won’t show that here. You can run the program at http://programmingpraxis.codepad.org/tmX4ta6y, which includes the matrix code from the Standard Prelude.

Advertisements

Pages: 1 2

3 Responses to “Levenshtein Distance”

  1. Nate said

    Very cool. Here’s my version in Swift — memoize caches the results by paired substrings for reuse:

    let levenshteinDistance = memoize {
        (levenshteinDistance: (String, String) -> Int, s1: String, s2: String) -> Int in
    
        if isEmpty(s1) { return countElements(s2) }
        if isEmpty(s2) { return countElements(s1) }
        
        // drop first letter of each string
        let s1Crop = s1[s1.startIndex.successor()..<s1.endIndex]
        let s2Crop = s2[s2.startIndex.successor()..<s2.endIndex]
        
        // if first characters are equal, continue with both cropped
        if s1[s1.startIndex] == s2[s2.startIndex] {
            return levenshteinDistance(s1Crop, s2Crop)
        }
        
        // otherwise find smallest of the three options
        let (c1, c2, c3) = (levenshteinDistance(s1Crop, s2), levenshteinDistance(s1, s2Crop), levenshteinDistance(s1Crop, s2Crop))
        return 1 + min(min(c1, c2), c3)
    }
    
    levenshteinDistance("hill", "hilt")
    // 1
    
  2. Nate said

    And here’s the memoize function:

    func memoize<T1: Hashable, T2: Hashable, U>(body: ((T1, T2) -> U, T1, T2) -> U) -> ((T1, T2) -> U) {
        var memo = [T1: [T2: U]]()
        var result: ((T1, T2) -> U)!
        result = {
            (value1: T1, value2: T2) -> U in
            if let cached = memo[value1]?[value2] { return cached }
        
            let toCache = body(result, value1, value2)
            if memo[value1] == nil { memo[value1] = [:] }
            memo[value1]![value2] = toCache
            return toCache
        }
        return result
    }
    
  3. Paul said

    Two versions in Python. A recursive version with memoization and an iterative version with 2 matrix rows.

    memo = {}
    def levenshtein_distance_recursive(a, b):
        if a == b:
            return 0
        if a == "":
            return len(b)
        if b == "":
            return len(a)
        if (a, b) not in memo:
            l1 = levenshtein_distance_recursive(a[1:], b) + 1
            l2 = levenshtein_distance_recursive(a, b[1:]) + 1
            l3 = levenshtein_distance_recursive(a[1:], b[1:]) + (a[0] != b[0])
            memo[(a,b)] = min(l1, l2, l3)
        return memo[(a,b)]
        
    def levenshtein_distance(s, t):
        if s == t:
            return 0
        if s == "":
            return len(t)
        if t == "":
            return len(s)
        v0 = range(len(t) + 1)
        v1 = [0] * len(v0)
        for i, si in enumerate(s):
            v1[0] = i + 1
            for j, tj in enumerate(t):
                v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + (si != tj))
            v0[:] = v1
        return v1[-1]
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: