Word Breaks

August 12, 2011

The obvious solution uses recursive backtracking to pick off a prefix of the input that is in the dictionary, then recur on the remainder of the string, stopping when the entire input string is in the dictionary:

(define (segments str) ; exponential (if (member str dict) str (let ((len (string-length str))) (let loop ((i 1)) (if (= i len) "" (let ((prefix (substring str 0 i))) (if (not (member prefix dict)) (loop (+ i 1)) (let ((suffix (segments (substring str i len)))) (if (string=? suffix "") (loop (+ i 1)) (string-append prefix " " suffix))))))))))

That has exponential time complexity, but memoizing the function reduces the time complexity to quadratic:

(define cache (make-hash string-hash string=? #f 97))

(define (segments str) ; quadratic (if (member str dict) str (if (cache 'lookup str) (cache 'lookup str) (let ((len (string-length str))) (let loop ((i 1)) (if (= i len) "" (let ((prefix (substring str 0 i))) (if (not (member prefix dict)) (loop (+ i 1)) (let ((suffix (segments (substring str i len)))) (if (string=? suffix "") (loop (+ i 1)) (let ((output (string-append prefix " " suffix))) (cache 'insert str output) output))))))))))))

Here are some examples:

> (define dict '("a" "aa" "aaa" "ab" "apple" "apricot" "is" "pie" "test" "this")) > (segments "applepie") "apple pie" > (segments "thisisatest") "this is a test" > (segments "aaab") "a a ab"

We used hash tables from the Standard Prelude. You can run the program at http://programmingpraxis.codepad.org/6fUMlr92.

Posted by programmingpraxis

Filed in Exercises

15 Comments »

15 Responses to “Word Breaks”

Daniel Tunkelang said
August 12, 2011 at 1:25 PM
A solution in Scheme. Nice!

Adolfo said

August 12, 2011 at 2:48 PM

My straightforward (and naïve) solutions in both Racket and Python:

#lang racket

(define dictionary
  (set "a" "brown" "apple" "pie"))

(define (in-prefixes str)
  (define (pos->element i)
    (values (substring str 0 (+ 1 i)) (substring str (+ 1 i))))
  (define (next-pos i)
    (+ i 1))
  (define initial-position 0)
  (define (contains-index? i)
    (< i (string-length str)))
  (define (contains-value? prefix rest)
    #t)
  (define (contains-index-and-value? i prefix rest)
    #t)
  (make-do-sequence
   (lambda ()
     (values pos->element
             next-pos
             initial-position
             contains-index?
             contains-value?
             contains-index-and-value?))))

(define (string-empty? str)
  (zero? (string-length str)))

(define (word-break dictionary word)
  (for/first (((prefix remaining) (in-prefixes word))
              #:when (set-member? dictionary prefix)
              (rest (in-value (if (string-empty? remaining)
                                  '()
                                  (word-break dictionary remaining))))
              #:when rest)
    (cons prefix rest)))

#! /usr/bin/env python

DICTIONARY = {'a', 'apple', 'pie', 'brown'}

def word_break(dictionary, word):
    def break_string(s):
        return [] if s == '' else word_break(dictionary, s)

    for split_point in range(len(word) + 1):
        prefix = word[:split_point]

        if prefix in dictionary:
            rest = break_string(word[split_point:])

            if rest is not None:
                return [prefix] + rest

Graham said

August 12, 2011 at 3:00 PM

A Python solution:

#!/usr/bin/env python

def segments(s, d):
    segs = []
    while s:
        if s in d:
            segs.append(s)
            s = ""
        for i in xrange(1, len(s)):
            if s[:i] in d:
                segs.append(s[:i])
                s = s[i:]
                break
            if i == len(s) - 1:    # already tested full string s
                s = ""
    return " ".join(segs)

if __name__ == "__main__":
    d = set(["a", "aa", "aaa", "ab", "apple", "apricot", "is", "pie", "test", "this"])
    print segments("thisisatest", d)
    print segments("aaab", d)

Not pretty, to be sure, but it seems to get the job done.

Mike said

August 12, 2011 at 11:54 PM

Another Python solution.

I looked at a histogram of word length for the dictionary. In order of decreasing frequency, the word lengths are [8, 7, 9, 6, …]. gen_split() tries to split off a prefix of the input string using the word lengths in order of decreasing frequency.

gen_split() generates different splits of the input string. split() returns just the first one.

from collections import Counter
from itertools import chain, count

with open("12dicts/5desk.txt", "rt") as f:
    words = {line.strip() for line in f}


# word lengths in order of decreasing frequency
lengths = [t[0] for t in Counter(map(len, words)).most_common()]

def gen_split(string, dictionary):
    if string:
        for n in lengths:
            if n > len(string): continue
            
            if string[:n] in dictionary:
                for rest in gen_split(string[n:], dictionary):
                    yield [string[:n]] + rest

    else:
        yield []


def split(string, dictionary):
    words = next(gen_split(string, dictionary), None)
    return ' '.join(words) if words else None


# testing
from random import sample

for n in range(2,7):
    test = ''.join(sample(words, n))
    print('split("{}") ->\n\t"{}"\n'.format(test, split(test, words)))

# sample output
#split("cobbleddruggist") ->
#	"cobbled druggist"
#
#split("phospholipidoasisdeterment") ->
#	"phospholipid oasis determent"
#
#split("remuneratorunmelodiouspancreaticnonviable") ->
#	"remunerator unmelodious pancreatic nonviable"
#
#split("sweatinesssignatorycampsitereflowerpapaw") ->
#	"sweatiness signatory campsite reflower papaw"
#
#split("awestruckmonasticismBarclayshamKampalapressured") ->
#	"awestruck monastic ism Barclay sham Kampala pressure d"

Kevin Hostelley said
August 13, 2011 at 2:56 AM
My clojure solution
Kevin Hostelley said
August 13, 2011 at 3:01 AM
Sorry, let’s try this again.

My clojure solution
Kevin Hostelley said
August 14, 2011 at 2:11 AM
Maybe this will be better:

Jussi Piitulainen said

August 14, 2011 at 6:46 PM

Consider a procedure, (for-each-partition proc s word?), that walks
proc over all partitions of the given string s into words recognized
by the given predicate, word?, like this:

> (for-each-partition write "frukosten" word?)
("frukosten")("fru" "kosten")("fru" "ko" "sten")>

The problem statement asks for joining the parts into a single string;
that would be easy to do. The example uses this dictionary predicate:

(define (word? s)
  (member s '("fru" "kost" "kosten" "frukost" "ost"                                                                          
              "frukosten" "ko" "sten")))

Then an escape procedure can be used to receive the first solution
found by for-each-partition, or #f if there are none, as follows:

> (call-with-current-continuation
     (lambda (k) (for-each-partition k "frukosten" word?) #f))
("frukosten")

It is now possible to add conditions. For example, the first partition
into three words:

> (call-with-current-continuation
     (lambda (k) (for-each-partition (lambda (p)                                                                             
                                        (if (= 3 (length p)) (k p)))
                    "frukosten" word?)
                 #f))
("fru" "ko" "sten")

(The indentations look all wrong to me after cut-and-paste and I see no
revied button. Let us see.)

Jussi Piitulainen said

August 14, 2011 at 6:56 PM

Ok, the indentations look right in the published version, within the
sourcode brackets. Below is an implementation of for-each-partition.
It works on an agenda of reversed position sequences, so that the
first index in an agenda task is a position where the next word needs
to be found. It memoizes the end positions of the recognized words at
those start positions where it needs to find more ways forward.

(define (for-each-partition proc s word?)
  (let* ((n (string-length s))
         (memory (make-vector n #f))
         (agenda '((0))))
    (let do-agenda ()
      (if (and (pair? agenda) (positive? n))
          (let* ((task (car agenda)) (b (car task)))
            (set! agenda (cdr agenda))
            (if (not (vector-ref memory b))
                (vector-set! memory b
                             (do ((e (+ b 1) (+ e 1))
                                  (es '() (if (word? (substring s b e))
                                              (cons e es)
                                              es)))
                                 ((> e n) es))))
            (for-each (lambda (e)
			(let ((next (cons e task)))
                          (if (= e n)
                              (proc (substrings s (reverse next)))
                              (set! agenda (cons next agenda)))))
                      (vector-ref memory b))
            (do-agenda))))))

(define (substrings s ix)
  (map (lambda (b e) (substring s b e))
       (reverse (cdr (reverse ix)))
       (cdr ix)))

Jussi Piitulainen said
August 14, 2011 at 7:03 PM
(I have no idea what happened to the indentation of line 17 above. It
is right in my editor window. There are tabs, but there are tabs on
many other lines that did not break.)
Ben Mabey said
August 15, 2011 at 6:01 PM
Here are my Clojure and Ruby solutions: http://benmabey.com/2011/08/14/word-break-in-clojure-and-ruby-and-laziness-in-ruby.html
bablu said
September 14, 2011 at 9:06 AM
static void Main(string[] args)
{
string[] lookupTable = { “apple”, “Bat”, “Candle”, “Donkey”, “Eat”, “Sat” };
string str = string.Empty;
Console.WriteLine(“Please enter a string”);
str = Console.ReadLine();
string strToSearch = string.Empty;
for (int i = 0; i < str.Length; i++)
{
if (' ' == str[i])
continue;
else
{
strToSearch += str[i];
for (int j = 0; j < lookupTable.Length; j++)
{
if (strToSearch.ToUpper() == lookupTable[j].ToUpper())
{
int pos = str.IndexOf(strToSearch) + strToSearch.Length;
str = str.Insert(pos, " ");
strToSearch = string.Empty;
}
}
}

}
Console.WriteLine(str);
}
Sleiman jneidi said
October 6, 2011 at 9:01 PM
This is my C# Code

string[] dic = { “a”, “brown” ,”apple”, “pie”};
string input = “abrownapplepie”;

StringBuilder sb = new StringBuilder();
Regex[] regexes = dic.Select(c => new Regex(c)).ToArray();
foreach (var reg in regexes)
{
sb.Append(reg.Match(input).Value + ” “);

}
Console.WriteLine(sb.ToString().Trim());

Juan Ignacio Saba said

January 8, 2014 at 6:24 PM

My 2 cents, in Perl.

In this implementation, I try to find the longest prefix in the input that exists in the dictionary, and then do a recursive call for the rest of the input.

In order to test it, and just for fun, I built myself a small dictionary file (using the 187 most used words in the english dictionary), taken from here:
http://www.manythings.org/vocabulary/lists/l/

And I tested it like this:

juansaba-macbook-pro:word_breaks juansaba$ ./word_break.pl everymanshouldgo dict.txt 
every man should go
juansaba-macbook-pro:word_breaks juansaba$ ./word_break.pl nowornever dict.txt 
now or never
juansaba-macbook-pro:word_breaks juansaba$ ./word_break.pl because dict.txt 
because
juansaba-macbook-pro:word_breaks juansaba$ ./word_break.pl forthewin dict.txt                                (the word 'win' was just not among the 187 most used words :( ...)
NO_SOLUTION
juansaba-macbook-pro:word_breaks juansaba$ ./word_break.pl youdonotknowjusthowlong dict.txt 
you do not know just how long

And here’s the source:

#!/usr/bin/perl -w

use strict;
use warnings;

my ($string, $dictionary_path) = @ARGV;

# Read the dictionary
my $dictionary = {};
open my $dfh, "<", $dictionary_path or die "Could not open '$dictionary_path' for reading: $!";
while(<$dfh>) { chomp; $dictionary->{lc($_)} = 1 }
close $dfh;

# Find the first phrase
my $phrase = findPhrase($string, $dictionary);
print "".(defined $phrase ? $phrase : "NO_SOLUTION")."\n";

sub findPhrase {
    my $string = shift;
    my $dictionary = shift;

    for(my $prefix_length = length($string); $prefix_length >= 1; $prefix_length--) {
        my $prefix = substr($string, 0, $prefix_length);
        my $rest = substr($string, $prefix_length);
        if(defined $dictionary->{$prefix}) {
            return $prefix if $rest eq "";

            my $rec_phrase = findPhrase($rest, $dictionary);
            return $prefix." ".$rec_phrase if defined $rec_phrase;
        }
    }

    return undef;
}

davedhaval87 said
January 4, 2015 at 5:42 PM
DP and Recursive Solution with working code at http://www.gohired.in/2014/12/word-break-problem.html

S	M	T	W	T	F	S
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31

Programming Praxis

Word Breaks

August 12, 2011

15 Responses to “Word Breaks”

Leave a comment

Categories

Archives

Archives

Programming Praxis

Word Breaks

August 12, 2011

Share this:

Related

15 Responses to “Word Breaks”

Leave a comment

Categories

Archives

Archives