Levenshtein Distance
September 12, 2014
The recursive calculation is straightforward:
(define (exponential-levenshtein x y)
  (let loop ((xs (string->list x)) (ys (string->list y)))
    (cond ((null? xs) (length ys))
          ((null? ys) (length xs))
          ((char=? (car xs) (car ys))
           (loop (cdr xs) (cdr ys)))
          (else (+ 1 (min (loop (cdr xs) ys)
                          (loop xs (cdr ys))
                          (loop (cdr xs) (cdr ys))))))))
The first two cond clauses are the base cases: when either string is exhausted, the distance is the length of the other string. The third clause recurs on the tails of both strings, adding nothing to the distance, when the two leading characters are identical. The fourth clause computes the cost of deleting from the first string, deleting from the second string, and deleting from both strings (a substitution), takes the minimum, and adds one for the edit. Here are two examples:
> (time (exponential-levenshtein "persimmon" "cantaloupe"))
(time (exponential-levenshtein "persimmon" ...))
3 collections
359 ms elapsed cpu time, including 0 ms collecting
346 ms elapsed real time, including 0 ms collecting
21126272 bytes allocated, including 25252720 bytes reclaimed
10
> (time (exponential-levenshtein "persimmons" "cantaloupes"))
(time (exponential-levenshtein "persimmons" ...))
9 collections
1560 ms elapsed cpu time, including 15 ms collecting
1557 ms elapsed real time, including 2 ms collecting
75050576 bytes allocated, including 75850032 bytes reclaimed
10
We can clearly see the exponential time complexity of the algorithm: each mismatched pair of characters spawns three recursive calls, the same sub-problems are solved over and over, and adding a single character to each string roughly quadruples the running time. The dynamic programming approach, by comparison, runs in O(mn) time, where m and n are the lengths of the two strings:
(define (dynamic-programming-levenshtein x y)
  (let* ((x-len (string-length x))
         (y-len (string-length y))
         (costs (make-matrix (+ x-len 1) (+ y-len 1) 0)))
    ; initialization: only row 0 and column 0 keep the value i+j;
    ; the interior cells are overwritten by the loop below
    (do ((i 0 (+ i 1))) ((< x-len i))
      (do ((j 0 (+ j 1))) ((< y-len j))
        (matrix-set! costs i j (+ i j))))
    (do ((i 1 (+ i 1))) ((< x-len i))
      (do ((j 1 (+ j 1))) ((< y-len j))
        (let ((add-cost (+ (matrix-ref costs (- i 1) j) 1))
              (del-cost (+ (matrix-ref costs i (- j 1)) 1))
              (sub-cost (+ (matrix-ref costs (- i 1) (- j 1))
                           (if (char=? (string-ref x (- i 1))
                                       (string-ref y (- j 1)))
                               0 1))))
          (matrix-set! costs i j
            (min add-cost del-cost sub-cost)))))
    (matrix-ref costs x-len y-len)))
Here we build up an (m+1) by (n+1) matrix in which entry (i, j) holds the Levenshtein distance between the first i characters of one string and the first j characters of the other; since each prefix distance is already in the matrix when it is needed, nothing is computed twice, and the Levenshtein distance is quickly computed. This algorithm is much faster than the first, and its running time grows much more slowly:
> (time (dynamic-programming-levenshtein "persimmon" "cantaloupe"))
(time (dynamic-programming-levenshtein "persimmon" ...))
no collections
0 ms elapsed cpu time
0 ms elapsed real time
25824 bytes allocated
10
> (time (dynamic-programming-levenshtein "persimmons" "cantaloupes"))
(time (dynamic-programming-levenshtein "persimmons" ...))
no collections
0 ms elapsed cpu time
0 ms elapsed real time
30800 bytes allocated
10
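As a quick sanity check, the classic textbook pair should give the familiar answer of three edits (change k to s, change e to i, and append a g):

(dynamic-programming-levenshtein "kitten" "sitting") ; => 3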
This algorithm requires O(mn) space. That can be reduced to just two rows of the matrix, linear rather than quadratic space, by storing only the current row and the prior row as the matrix is built; a rough sketch of that idea appears below. You can run the program at http://programmingpraxis.codepad.org/tmX4ta6y, which includes the matrix code from the Standard Prelude.
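For the curious, a two-row version might look something like the following sketch, which uses plain Scheme vectors instead of the Standard Prelude matrix routines; it illustrates the idea rather than reproducing the code behind the link above:

(define (two-row-levenshtein x y)
  (let* ((x-len (string-length x))
         (y-len (string-length y))
         (prev (make-vector (+ y-len 1) 0))  ; prior row of the cost matrix
         (curr (make-vector (+ y-len 1) 0))) ; current row of the cost matrix
    (do ((j 0 (+ j 1))) ((< y-len j))        ; row 0: distances from the empty prefix
      (vector-set! prev j j))
    (do ((i 1 (+ i 1))) ((< x-len i) (vector-ref prev y-len))
      (vector-set! curr 0 i)
      (do ((j 1 (+ j 1))) ((< y-len j))
        (vector-set! curr j
          (min (+ (vector-ref curr (- j 1)) 1)
               (+ (vector-ref prev j) 1)
               (+ (vector-ref prev (- j 1))
                  (if (char=? (string-ref x (- i 1))
                              (string-ref y (- j 1)))
                      0 1)))))
      (do ((j 0 (+ j 1))) ((< y-len j))      ; current row becomes the prior row
        (vector-set! prev j (vector-ref curr j))))))

It should give the same answers as the full-matrix version while allocating only two vectors of length n+1.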
Very cool. Here’s my version in Swift — memoize caches the results by paired substrings for reuse:
let levenshteinDistance = memoize { (levenshteinDistance: (String, String) -> Int, s1: String, s2: String) -> Int in
    if isEmpty(s1) { return countElements(s2) }
    if isEmpty(s2) { return countElements(s1) }

    // drop first letter of each string
    let s1Crop = s1[s1.startIndex.successor()..<s1.endIndex]
    let s2Crop = s2[s2.startIndex.successor()..<s2.endIndex]

    // if first characters are equal, continue with both cropped
    if s1[s1.startIndex] == s2[s2.startIndex] {
        return levenshteinDistance(s1Crop, s2Crop)
    }

    // otherwise find smallest of the three options
    let (c1, c2, c3) = (levenshteinDistance(s1Crop, s2),
                        levenshteinDistance(s1, s2Crop),
                        levenshteinDistance(s1Crop, s2Crop))
    return 1 + min(min(c1, c2), c3)
}

levenshteinDistance("hill", "hilt") // 1

And here's the memoize function:
func memoize<T1: Hashable, T2: Hashable, U>(body: ((T1, T2) -> U, T1, T2) -> U) -> ((T1, T2) -> U) {
    var memo = [T1: [T2: U]]()
    var result: ((T1, T2) -> U)!
    result = { (value1: T1, value2: T2) -> U in
        if let cached = memo[value1]?[value2] { return cached }
        let toCache = body(result, value1, value2)
        if memo[value1] == nil { memo[value1] = [:] }
        memo[value1]![value2] = toCache
        return toCache
    }
    return result
}

Two versions in Python: a recursive version with memoization and an iterative version that keeps only two matrix rows.
memo = {}

def levenshtein_distance_recursive(a, b):
    if a == b:
        return 0
    if a == "":
        return len(b)
    if b == "":
        return len(a)
    if (a, b) not in memo:
        l1 = levenshtein_distance_recursive(a[1:], b) + 1
        l2 = levenshtein_distance_recursive(a, b[1:]) + 1
        l3 = levenshtein_distance_recursive(a[1:], b[1:]) + (a[0] != b[0])
        memo[(a, b)] = min(l1, l2, l3)
    return memo[(a, b)]

def levenshtein_distance(s, t):
    if s == t:
        return 0
    if s == "":
        return len(t)
    if t == "":
        return len(s)
    v0 = list(range(len(t) + 1))  # prior row of the cost matrix
    v1 = [0] * len(v0)            # current row of the cost matrix
    for i, si in enumerate(s):
        v1[0] = i + 1
        for j, tj in enumerate(t):
            v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + (si != tj))
        v0[:] = v1
    return v1[-1]