Longest Duplicated Substring

December 14, 2010

In a previous exercise, we looked at the problem of finding the longest palindrome in a string. In today’s exercise, we look at a similar problem, finding the longest duplicated substring in a string.

The algorithm for this task is well known. First build a suffix list that has all the substring suffices of the input string. For instance, if the input is “banana”, the suffix list is “a”, “na”, “ana”, “nana”, “anana”, “banana”. Then sort the suffix list, bringing suffices with common beginnings together. Finally, scan the sorted suffix list, computing the common lengths of adjacent pairs, and report the longest.

Your task is to write a program to find the longest duplicated substring within a string. When you are finished, you are welcome to read or run a suggested solution, or to post your own solution or discuss the exercise in the comments below.

About these ads

Pages: 1 2

7 Responses to “Longest Duplicated Substring”

  1. [...] today’s Programming Praxis exercise, our task is to implement the algorithm to find the longest duplicated [...]

  2. My Haskell solution (see http://bonsaicode.wordpress.com/2010/12/14/programming-praxis-longest-duplicated-substring/ for a version with comments):

    import Data.List
    import Data.List.HT (mapAdjacent)
    import qualified Data.List.Key as K
    
    lds :: Ord a => [a] -> [a]
    lds = K.maximum length . mapAdjacent lcp . sort . tails where
        lcp (x:xs) (y:ys) | x == y = x : lcp xs ys
        lcp _      _               = []
    
  3. Khanh Nguyen said

    Mine in F#

    let rec suffixs (s:string) = 
        match s.Length with
        | 0 -> []
        | _ -> s :: (suffixs (s.Remove(0,1)))
    
    let rec count_common_length (s1:string) (s2:string) =
        if s1.StartsWith(s2) then s2.Length
        elif s2.StartsWith(s1) then s1.Length
        else 0
    
    let longest_duplicated (s:string) = 
        let suffix = suffixs s |> List.sort
        let pos = suffix               
                  |> Seq.ofList 
                  |> Seq.pairwise 
                  |> Seq.map (fun (x,y) -> count_common_length x y)
        let maxPos = Seq.findIndex (fun x -> x = Seq.max pos) pos              
        suffix.[maxPos]
    
    longest_duplicated "banana"
    
    
  4. Khanh Nguyen said

    Correction,

    let rec suffixs (s:string) = 
        match s.Length with
        | 0 -> []
        | _ -> s :: (suffixs (s.Remove(0,1)))
    
    let rec count_common_length_charlist (ss1: char list) (ss2: char list) =     
        match ss1, ss2 with 
        | [], _ -> 0
        | _, [] -> 0
        | x::xs, y::ys -> if x = y then 1 + (count_common_length_charlist xs ys) 
                          else (count_common_length_charlist xs ys)
    
    
    let rec count_common_length (s1:string) (s2:string) =
        let s1_list = s1.ToCharArray() |> List.ofArray
        let s2_list = s2.ToCharArray() |> List.ofArray
        count_common_length_charlist s1_list s2_list
    
    
    let longest_duplicated (s:string) = 
        let suffix = suffixs s |> List.sort
        let pos = suffix               
                  |> Seq.ofList 
                  |> Seq.pairwise 
                  |> Seq.map (fun (x,y) -> count_common_length x y)
        let maxPos = Seq.findIndex (fun x -> x = Seq.max pos) pos              
        suffix.[maxPos]
    
    longest_duplicated "banana"
    
    
  5. Deepak said

    need solution in Core Java

  6. Mike said

    My Python version. It turns out to be very similar to Remco’s Haskell code. The combination of starmap and pairwise corresponds to “mapAdjacent” or and the ‘suffices’ generator expression corresponds to “tail”.

    from itertools import imap, izip, starmap, takewhile, tee
    from operator import eq, truth
    
    
    def pairwise(iterable):
        """copied from recipies in itertools docs"""
        
        a,b = tee(iterable)
        next(b,None)
        return izip(a,b)
    
    
    def prefix(a,b):
        """returns longest common prefix of strings 'a' and 'b'."""
        
        return a[:sum(takewhile(truth,imap(eq,a,b)))]
    
    
    def longest_duplicate(s):
        """return longest duplicated sequence of characters from s."""
    
        suffices = (s[n:] for n in range(len(s)))
        return max(starmap(prefix, pairwise(sorted(suffices))), key=len)
    
  7. a little bit late:
    my commented solution in OCaml (http://www.gleocadie.net/?p=543&lang=en) using OCaml Batteries

    open BatString;;
    
    let suffixes =
      let rec aux r = function
      | s when is_empty s -> r
      | s -> aux (s::r) (lchop s)
    in aux []
    ;;
    
    let lc s1 s2 =
      let rec aux res = function
      | s, _ when is_empty s -> res
      | _, s when is_empty s -> res
      | s1, s2 when s1.[0] <> s2.[0] -> res
      | s1, s2 -> aux (s1.[0]::res) (lchop s1,lchop s2)
    in List.rev (aux [] (s1,s2)) |> of_list
    ;;
    
    let lcs =
      let rec lcs' res = function
      | [] | [_] -> res
      | l1::l2::t ->
          let res' = lc l1 l2 in
          lcs' (if String.length res > String.length res'
                then res else res') (l2::t)
    in lcs'
    ;;
    
    let longest s =
      let module M = Map.Make(struct type t = char let compare = Pervasives.compare end) in
        M.fold (fun _ u v -> lcs v (List.fast_sort Pervasives.compare u))
               (List.fold_left
                  (fun y x -> let k = x.[0] in
                              M.add k (x::(try M.find k y with _ -> [])) y)
                   M.empty (suffixes s))
               ""
    ;;
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 616 other followers

%d bloggers like this: