Decoding Text-Speak

July 2, 2013

Sm ppl cmprs txt msgs by rtnng only ths vwls tht bgn a wrd and by rplcng dbld ltrs wth sngl ltrs.

With a proper dictionary, it is possible to expand all the possibilities for a word. For instance, the “Sm” that starts the sentence above is properly translated “Some” but these other words are possible: same, sam, sum, seem, seam, sumo, and others.

Your task is to write a program that, given a sentence in text-speak, returns a list of all possibilities for each word. When you are finished, you are welcome to read or run a suggested solution, or to post your own solution or discuss the exercise in the comments below.

Posted by programmingpraxis

Filed in Exercises

7 Comments »

7 Responses to “Decoding Text-Speak”

Paul said

July 2, 2013 at 12:12 PM

Here an attempt in Python. I follow a simple approach. First dump the english dictionary in a trie. Then take the signature word and try to find all words in the trie by expanding with double consonants, vowels and the characters from the signature word.

_end = '_end_'
VOWELS = "aeiou".upper()

def make_trie(*words):
    root = dict()
    for word in words:
        current_dict = root
        for letter in word:
            current_dict = current_dict.setdefault(letter, {})
        current_dict = current_dict.setdefault(_end, _end)
    return root
    
def search_words(trie, signature):
    """ expand the sign word with all possible vowels
    """
    Q = [(signature[0], signature[1:], trie[signature[0]])]
    while Q:
        matched, signature, trie = Q.pop()
        if not signature:
            if _end in trie:
                yield matched
        last_matched = matched[-1]
        if last_matched not in VOWELS and last_matched in trie: #try repeat
            Q.append((matched + last_matched, signature, trie[last_matched]))
        for v in VOWELS:
            if v in trie:
                Q.append((matched + v, signature, trie[v]))
        if signature and signature[0] in trie:
            Q.append((matched + signature[0], signature[1:], trie[signature[0]]))
            
words = [w[1:-1] for w in open(FNAME).read().split(",")]
root = make_trie(*words)
sentence = "Sm ppl cmprs txt msgs by rtnng only ths vwls tht bgn a wrd and by rplcng dbld ltrs wth sngl ltrs".upper()
for s in sentence.split():
    print "{:10s}:".format(s), 
    for word in search_words(root, s):
        print word,
    print

Paul said

July 2, 2013 at 3:41 PM

Another Python version kore in line with the Programming Praxis solution. This one is much faster and much shorter.

from collections import defaultdict
sign = defaultdict(list)
for w in (w[:-1].upper() for w in open(FNAME).readlines()):
    short = [w[0]]
    for c in w[1:]:
        if not (c in VOWELS or c == short[-1]):
            short.append(c)
    sign["".join(short)].append(w)
            
sentence = "Sm ppl cmprs txt msgs by rtnng only ths vwls tht bgn a wrd and by rplcng dbld ltrs wth sngl ltrs".upper()
for s in sentence.split():
    print "\n{:10s}:".format(s), 
    for word in sign[s]:
        print word,

Colin said

July 2, 2013 at 7:09 PM

In clojure:

(ns txtmsg
  (:require [clojure.string :as cstr]))

; "compress" words by "squeezing" repeats and eliminating non-initial vowels

(def vowels #{\a \e \i \o \u})
(defn unvowel [w]  (lazy-cat (take-while vowels w) (remove vowels w)))
(defn undouble [s]  (map first (partition-by identity s)))

; non-idempotent: "people" -> "ppl" -> "pl"
(defn compress [w]  (->> w cstr/lower-case undouble unvowel cstr/join))

(defn make-table [words]  (group-by compress words))
(defn translate [table word]  (get table (cstr/lower-case word) [word]))

Mike said

July 5, 2013 at 7:13 AM

from collections import defaultdict

def encode(text):
    t = text.lower()
    return ''.join(t[n] for n in range(len(t)) if n == 0 or t[n] not in 'aeiou' and t[n] != t[n-1])
    
def decode(txt):
    words = code[txt.lower()]
    if txt[0].isupper():
        words = [w.title() for w in words]
    return words if words else {txt}

# build lookup table from word list.
# using 2of12inf.txt from 12dicts, because it includes inflections, e.g., plurals, -ed, -ing, etc.  
# Some words in the list are marked with a "%", I strip the "%" but keep the word.
code = defaultdict(set)
with open('/12dicts/2of12inf.txt', 'rt') as f:
    for line in f:
        word = line.replace('%','').strip().lower()
        code[encode(word)].add(word)

if __name__ == "__main__":
    import re
    sentence = "Sm ppl cmprs txt msgs by rtnng only ths vwls tht bgn a wrd and by rplcng dbld ltrs wth sngl ltrs."

    for word in re.findall(r'\w+', sentence):
	if word[0].isalpha():
		print(word, decode(word))

@Paul – I don’t think your second version compresses words correctly if the have two of the same consonant separated by vowels, e.g., “people” should compress to “ppl”, but yours compresses to “pl”.

Paul said
July 5, 2013 at 8:49 PM
@Mike. That is correct. I saw it already, but did not post a correction. Thanks for pointing out the word lists from 12dicts.
cacaegg said
August 2, 2013 at 3:18 AM
“””
Try to match all the words in encrypt text
“””
import re

ENCRYPT_TEXT = “Sm ppl cmprs txt msgs by rtnng only ths vwls tht bgn ”
ENCRYPT_TEXT += “a wrd and by rplcng dbld ltrs wth sngl ltrs”
ENCRYPT_TEXT = ENCRYPT_TEXT.split()

def main():
“””
The main function
“””
word_dict = {}
answer = {}

# Build the dictionary by first letter
for capital in range(ord(‘a’), ord(‘z’)+1):
word_dict[chr(capital)] = []

with open(“/usr/share/dict/words”, “r”) as dict_file:
for word in dict_file.readlines():
word = word.strip()
if len(word) > 0:
word_dict[word[0].lower()].append(word)

for capital in word_dict.keys():
word_dict[capital] = “\n”.join(word_dict[capital])

# For each word, try to find it in dictionary
for word in ENCRYPT_TEXT:
reg_re = word[0] + “”.join([“.*”+c for c in word[1:]]) + “.*”
reg_re = reg_re.lower()
answer[word] = re.findall(reg_re, word_dict[reg_re[0]])

print answer

main()
cacaegg said
August 2, 2013 at 3:19 AM
http://pastebin.com/rkfZwvsL

S	M	T	W	T	F	S
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31

Programming Praxis