Soundex
February 16, 2010
This task is harder than it looks; I will never admit how many wrong tries I made before I got a working result. Here’s my version:
(define (soundex str)
(define (code c)
; ABCDEFGHIJKLMNOPQRSTUVWXYZ
(string-ref "01230120022455012623010202"
(- (char->integer (char-upcase c)) 65)))
(define (finish zs)
(substring (list->string (reverse
(append (list #\0 #\0 #\0) zs))) 0 4))
(let* ((cs (cdr (map char-upcase (string->list str))))
(f (string-ref str 0)) (fx (code f))
(prev (if (char=? fx #\0) #\0 fx)))
(let loop ((cs cs) (zs (list f)) (prev prev))
(if (null? cs) (finish zs)
(let ((z (code (car cs))))
(cond ((char=? z prev) (loop (cdr cs) zs prev))
((char=? z #\0) (loop (cdr cs) zs #\0))
(else (loop (cdr cs) (cons z zs) z))))))))
The code function maps letters according to the soundex rules, with zero representing an unused character. The finish function extends or truncates the final result to four characters. The heart of the algorithm is in the last three lines, where the current character is skipped if it maps to the same soundex code as the previous character, or if it maps to zero, or otherwise is added to the accumulating output, and the previous character is updated.
This test is due to Knuth; calling (test) should produce no output:
(define (test)
(let ((names1 (list "Euler" "Gauss" "Hilbert"
"Knuth" "Lloyd" "Lukasiewicz"))
(names2 (list "Ellery" "Ghosh" "Heilbronn"
"Kant" "Ladd" "Lissajous"))
(result (list "E460" "G200" "H416"
"K530" "L300" "L222")))
(assert (map soundex names1) result)
(assert (map soundex names2) result)))
We used the assert macro from the Standard Prelude. You can run the program at http://programmingpraxis.codepad.org/i0n0uIS7.
My Haskell solution (see http://bonsaicode.wordpress.com/2010/02/16/programming-praxis-soundex/ for a version with comments):
import Data.Char
import Data.List
soundex :: String -> String
soundex = f . map head . group . map toUpper where
f [] = []
f (x:xs) = x : take 3 [toNum c | c <- xs ++ repeat ‘0’,
notElem c "AEHIOUWY"]
toNum c = maybe ‘0’ snd . find (elem c . fst) $
zip (words "BFPV CGJKQSXZ DT L MN R") [‘1’..]
Hm. I think I may have accidentally used src instead of css for the language in the post above. Here’s the correct version.
import Data.Char import Data.List soundex :: String -> String soundex = f . map head . group . map toUpper where f [] = [] f (x:xs) = x : take 3 [toNum c | c <- xs ++ repeat '0', notElem c "AEHIOUWY"] toNum c = maybe '0' snd . find (elem c . fst) $ zip (words "BFPV CGJKQSXZ DT L MN R") ['1'..]It’s doesn’t return the same results for ‘lloyd’ and ‘Lukasiewicz’ but it’s close enough
/* soundex.c - by harry moreno */ #include <stdio.h> #include <string.h> void removechars(char string[]); void encode(char string[]); void trim(char string[]); main(void){ char buffer[100]; int i; printf("Enter name:\n"); scanf("%s", buffer); // make string lowercase for(i=0; buffer[i] != '\0'; i++){ buffer[i] = tolower(buffer[i]); } removechars(buffer); printf("trimmed name:%s\n", buffer); encode(buffer); printf("encoded name:%s\n", buffer); trim(buffer); printf("Code produced: %s\n", buffer); } void removechars(char string[]){ int i,j; for(i=1; string[i] != '\0'; i++){ if(string[i] == string[i+1] || string[i] == 'a' || string[i] == 'e' || string[i] == 'h' || string[i] == 'i' || string[i] == 'o' || string[i] == 'u' || string[i] == 'w' || string[i] == 'y'){ for(j=i; string[j] != '\0'; j++) string[j] = string[j+1]; i--; } } } void trim(char string[]){ int i,j; // remove dupes for(i=1; string[i] != '\0'; i++){ if(string[i] == string[i+1]){ for(j=i; string[j] != '\0'; j++) string[j] = string[j+1]; i--; } } if(strlen(string)>4) string[4] = '\0'; else { for(i=1; i<4; i++){ if(string[i] == '\0'){ for(j=i; j<4; j++) string[j] = '0'; } } string[4] = '\0'; } } void encode(char string[]){ int i; for(i=1; string[i] != '\0'; i++){ if(string[i] == 'b' || string[i] == 'f' || string[i] == 'p' || string[i] == 'v') string[i] = '1'; if(string[i] == 'c' || string[i] == 'g' || string[i] == 'j' || string[i] == 'k' || string[i] == 'q' || string[i] == 's' || string[i] == 'x' || string[i] == 'z') string[i] = '2'; if(string[i] == 'd' || string[i] == 't') string[i] = '3'; if(string[i] == 'l') string[i] = '4'; if(string[i] == 'm' || string[i] == 'n') string[i] = '5'; if(string[i] == 'r') string[i] = '6'; } }[…] Praxis – Soundex By Remco Niemeijer In today’s Programming Praxis exercise we have to implement the soundex algorithm for encoding people’s […]
class Soundex def self.parse(str) input = str.dup.gsub(/(\w)(\1)/i, '\1').gsub(/(?!\A)[a|e|i|o|u|w|y|h]/i, '') input[0].chr + Array.new(3) { |i| input[1..-1][i] || 48 }.map do |ch| %w(0 bfpv cgjkqsxz dt l mn r).each_with_index.detect { |s| s.first.include? ch.chr }.last end.join end endThat Haskell up there makes me want to coo :)
http://pastebin.com/bgDF5adL
C#, I’m sure it could be faster but it works nicely.
using System;
using System.Collections.Generic;
namespace Soundex
{
class Program
{
static void Main()
{
string str = “Soundex”;
Console.WriteLine(str.Soundex());
Console.ReadKey();
}
}
static class Extensions
{
public static string Soundex(this string str)
{
char f = ”;
char last = ”;
List num;
var chr = new List(
str.ToLower().ToCharArray()
).FindAll(c =>
c >= ‘a’ && c <= 'z' && c != 'a' && c != 'e' && c != 'h' &&
c != 'i' && c != 'o' && c != 'u' && c != 'w' && c != 'y'
);
f = chr[0];
chr.RemoveAt(0);
num = new List(chr.Count);
chr.ForEach(
delegate(char c)
{
if (num.Count < 3 && c != last)
{
switch (c)
{
case 'b': case 'f':
case 'p': case 'v':
num.Add('1');
break;
case 'c': case 'g':
case 'j': case 'k':
case 'q': case 's':
case 'x': case 'z':
num.Add('2');
break;
case 'd': case 't':
num.Add('3');
break;
case 'l':
num.Add('4');
break;
case 'm': case 'n':
num.Add('5');
break;
default:
num.Add('6');
break;
}
}
last = c;
}
);
return ((f < 95) ? f : (char)(f – 32)) + new string(num.ToArray()).PadRight(3, '0');
}
}
}
Python version.
_code is a mapping of letter to soundex code or “” for letters in the first group.
The algorithm doesn’t say how to handle punctuation (spaces, hyphens, apostrophes), so I strip them.
groupby processes its first argument to find groups of items having the same key, as determined by the second argument. _code.get does a dictionary lookup, returning the soundex code. So, groupby returns (k,v)-pairs, where k is a soundex code, and v is a group of adjacent letters having that code. The join concatentates the soundex codes, skipping the first.
from itertools import groupby _groups = "AEHIOUWY BFPV CGJKQSXZ DT L MN R".split() _code = dict( (ltr, str(n or '')) for n,s in enumerate(_groups) for ltr in s ) def soundex( name ): uname = name.upper().replace(" '-", "") tail = ''.join( [ k for k,_ in groupby(uname, _code.get) ][1:] ) return (uname[0] + tail + '000')[:4] #test from knuth namelists = ["Euler Gauss Hilbert Knuth Lloyd Lukasiewicz", "Ellery Ghosh Heilbronn Kant Ladd Lissajous" ] result = "E460 G200 H416 K530 L300 L222".split() for names in namelists: assert map( soundex, names.split() ) == resultMy implementation in C
#include <string.h> #include <ctype.h> static void squeeze(char *s, char *t); static void assign_code(char *s); static void remove_adjacent_code(char *s); static void remove_initial_letters(char *s); void soundex_code(char *name) { int len; char *s = "hw"; char *q = "aeiouy"; remove_initial_letters(name); /* convert to code */ assign_code(name+1); /* drop all occurrences of h, w except first letter */ squeeze(name+1, s); /* remove adjacent code */ remove_adjacent_code(name); /* drop all occurrences of a, e, i, o, u, y except first letter */ squeeze(name+1, q); if((len = strlen(name)) > 4) name[4] = '\0'; if(len < 4) { while(len < 4) name[len++] = '0'; name[len] = '\0'; } } static void remove_initial_letters(char *s) { char *p; for(p = s+1; tolower(*p) == tolower(*s); p++); for(; *p != '\0'; p++) *++s = *p; *++s = '\0'; } static void assign_code(char *s) { char c; for(; *s != '\0'; s++) { c = tolower(*s); if(c == 'b' || c == 'f' || c == 'p' || c == 'v') *s = '1'; else if(c == 'c' || c == 'g' || c == 'j' || c == 'k' || c == 'q' || c == 's' || c == 'x' || c == 'z') *s = '2'; else if(c == 'd' || c == 't') *s = '3'; else if(c == 'l') *s = '4'; else if(c == 'm' || c == 'n') *s = '5'; else if(c == 'r') *s = '6'; } } static void remove_adjacent_code(char *s) { char *p; for(p = s+1; *p != '\0'; p++) { if(tolower(*s) == tolower(*p)) continue; *++s = *p; } *++s = '\0'; } static void squeeze(char *s, char *t) { char *p, *q; for(p = s; *p != '\0'; p++) { for(q = t; *q != '\0' && tolower(*p) != tolower(*q); q++); if(*q == '\0') *s++ = *p; } *s = '\0'; }