String Subsets

November 23, 2010

This exercise is part of our on-going series of interview questions:

Given two strings, determine if all the characters in the second string appear in the first string; thus, DA is a subset of ABCD. Counts matter, so DAD is not a subset of ABCD, since there are two D in the second string but only one D in the first string. You may assume that the second string is no longer than the first string.

Your task is to write a function to determine if one string is a subset of another string. You should work as you would in a programming interview; if you find one solution, search for a better solution. When you are finished, you are welcome to read or run a suggested solution, or to post your own solution or discuss the exercise in the comments below.

Pages: 1 2

22 Responses to “String Subsets”

  1. […] today’s Programming Praxis exercise,we have to write functions to determine if one string is a subset of […]

  2. My Haskell solution (see http://bonsaicode.wordpress.com/2010/11/23/programming-praxis-string-subsets/ for comments and the preceding versions):

    import qualified Data.IntMap as I
    
    subsetOf4 :: Enum a => [a] -> [a] -> Bool
    subsetOf4 xs ys = I.null $ I.differenceWith
        (\x y -> if x <= y then Nothing else Just x) (f xs) (f ys)
        where f = I.fromListWith (+) . map (flip (,) 1 . fromEnum)
    
  3. here is a commented version http://www.gleocadie.net/?p=355&lang=en#more-355

    #include
    #include
    #include

    int *freq_table(char *s)
    {
    int i;
    int *ft = (int*)calloc(256, sizeof(int));

    for (i = 0; i < strlen(s); i++)
    ft[(int)s[i]]++;

    return ft;
    }

    int is_subset(int *ft, char *s)
    {
    int i;

    for (i = 0; i < strlen(s); i++){
    int off = (int)s[i];
    if (ft[off] == 0)
    return -42;
    ft[off]–;
    }
    return 42;
    }

    int main(int argc, char** argv)
    {
    int *ft;

    if (argc != 3) {
    printf("You should provide 2 arguments %d\n", argc);
    return -1;
    }
    ft = freq_table(argv[1]);
    if (is_subset(ft, argv[2]) == 42)
    fprintf(stdout, "%s is subset of %s\n", argv[2], argv[1]);
    else
    fprintf(stdout, "%s is not subset of %s\n", argv[2], argv[1]);

    return 0;
    }

  4. slabounty said

    A simple ruby solution

  5. slabounty said

    Hmmm … lost the code somehow …

    class String
        def subset_of?(s)
            self.chars do |c|
                return false if !s.include? c
                s.sub!(c, "")
            end
            true
        end
    end
    
    puts "DA".subset_of?("ABCD")
    puts "DXA".subset_of?("ABCD")
    puts "DAD".subset_of?("ABCD")
    
  6. Bogdan Popa said

    Felt like using dict comprehensions.

    #!/usr/bin/env python3
    """usage: subset.py string second_string
    """
    import sys
    
    def count(s):
        l = []
        for i in range(len(s)):
            c, n = s[i], 1
            if c in l:
                continue
            for k in s[i + 1:]:
                if k == c:
                    n += 1
            yield c, n
            l.append(c)
    
    def is_subset(s, s2):
        ap = {c: n for c, n in count(s)}
        ap2 = {c: n for c, n in count(s2)}
        for key in ap2.keys():
            try:
                if ap2[key] > ap[key]:
                    return False
            except KeyError:
                return False
        return True
    
    def main(args):
        try:
            print(is_subset(args[1], args[2]))
        except IndexError:
            print(__doc__, end="")
            return 1
        return 0
    
    if __name__ == "__main__":
        sys.exit(main(sys.argv))
    
  7. My PHP version

    function subset_of($needle,$haystack) {
        foreach(str_split($needle) as $i) {
            $pos = strpos($haystack,$i);
            if ($pos === FALSE) return FALSE;
            $haystack[$pos] = '';
        }
        return TRUE;
    }
    

    subset_of(‘DA’,’ABCD’); // TRUE
    subset_of(‘DAD’,’ABCD’); // FALSE

  8. In Python 2.7 using the Counter class:

    from collections import Counter

    def contains_all(a, b):
    counts = Counter(a)
    counts.subtract(b)
    return min(counts.values()) >= 0

  9. Graham said

    Python solution, with comments about expected running time to give justification of design choices:

    
    #!/usr/bin/env python2.6
    
    def is_str_subset(str1, str2):
        d = {}      # dict of letter --> count for letter in str2
        for letter in str2:
    # theta(m) where m = |str2|; most expensive part here, but cheaper on its own
    # than sorting m first to quickly search [theta(m lg m)] and same as linearly
    # searching unsorted str2
            try:
                d[letter] += 1
            except KeyError:
                d[letter] = 1
        for letter in str1:
    # O(n) where n = |str1|; n < m, so total work is theta(m)
            try:
                if d[letter] == 0:
                    return False
                else:
                    d[letter] -= 1
            except KeyError:
                return False
        return True
    
    
    if __name__ == '__main__':
        str1 = "DA"
        str2 = "ABCD"
        str3 = "DAD"
        print "Is str1 a string subset of str2?\t%s" % is_str_subset(str1, str2)
        print "Is str3 a string subset of str2?\t%s" % is_str_subset(str3, str2)
    
  10. Clever said

    Overly verbose yes – but quick and easy to understand I hope.

    # First string is the hopeful subset, second string is the set
    def stringSubset (str1, str2):
    	# Contains a count for each letter in the strings
    	count1 = {}
    	count2 = {}
    	
    	# Walk through the first string counting characters
    	for c in str1:
    		if not count1.has_key(c):
    			count1[c] = 1
    		else:
    			count1[c] = count1[c] + 1
    			
    	# Walk through the second string counting characters
    	for c in str2:
    		if not count2.has_key(c):
    			count2[c] = 1
    		else:
    			count2[c] = count2[c] + 1
    			
    	# Walk through subset dictionary
    	# Return false if any character count doesn't match
    	for char in count1.keys():
    		if not count1[char] == count2[char]:
    			return False
    			
    	# False was not returned above therefore we can return true
    	return True
    
    # Test
    print(stringSubset("DA","ABCD"))
    print(stringSubset("DA","DAD"))
    
  11. Axio said

    ;; Use a better one next time, please.
    (define (char-sort l)
      (if (null? l)
        l
        (let ((h (car l))
              (t (cdr l)))
          (append
            (char-sort (filter (lambda (x) (char<=? x h)) t))
            (list h)
            (char-sort (filter (lambda (x) (char>? x h)) t))))))

    (define (string-subset? s2 s1)
      (define (magic set1 set2)
        (cond
          ((null? set1) #t)
          ((null? set2) #f)
          ((char=? (car set1) (car set2)) (magic (cdr set1) (cdr set2)))
          ((char<? (car set1) (car set2)) #f)
          (else (magic set1 (cdr set2)))))
      (and (<= (string-length s1) (string-length s2))
           (magic (char-sort (string->list s1))
                  (char-sort (string->list s2)))))

  12. Clever:
    you code is clear, but you miss that “DA” is subset of “ABCDA”.
    The guilty part is:

    for char in count1.keys():
      if not count1[char] == count2[char]:
        return False
    

    you would have had used:

    for char in count1.keys():
      if not count1[char] <= count2[char]:
        return False
    

    and those tests would have passed:

    # Test
    print(stringSubset("DA","ABCD"))
    print(stringSubset("DAD","ABCD"))
    print(stringSubset("DA","DAD"))
    print(stringSubset("DA","ABCDA"))
    ----
    True
    False
    True
    True
    
  13. Clever said

    Je vous remercie, Grégory! I completely glossed over that with my insufficient testing.

  14. Kevin said

    Some of the details definitely not necessary…just for readability. Feedback please! I’m trying to learn :D

    #include <iostream>
    #include <string>
    #include <algorithm>
    using namespace std;
    
    int main( int argc, char* argv[] )
    {
    	cout << "Give me two strings:" << endl;
    	string str1, str2, str1bak, str2bak;
    	cin >> str1;
    	cin >> str2;
    	str1bak = str1;
    	str2bak = str2;
    
    	if(str1.length() > str2.length())
    	{
    		cout << "1st string is larger than the 2nd so it cannot be a string subset!" << endl;
    		return 0;
    	}
    
    	sort(str1.begin(), str1.end());
    	sort(str2.begin(), str2.end());
    
    	if(str1.length() == str2.length() && str1 == str2)
    	{
    		cout << "These two strings are set equivalent!" << endl;
    		return 0;
    	}
    	else if(str1.length() != str2.length())
    	{
    		int i=0;
    		while(str1 != "")
    		{
    			if(str1[0] == str2[i])
    			{
    				str1 = str1.substr(1);
    			}
    			else if(str1[0] < str2[i])
    			{
    				cout << str1bak << " is not a string subset of " << str2bak << endl;
    				return 0;
    			}
    			i++;
    		}
    		cout << str1bak << " is a string subset of " << str2bak << endl;
    	}
    
    	return 0;
    }
    
  15. Steve Valentine said

    In the following algorithm, if you get to the end of the proposed subset string, it is subset else it is not.

    #include
    #include
    #include

    int main () {

    using namespace std;
    using namespace boost::assign;
    typedef multiset MUL_ST;
    typedef multiset::iterator MS_I;

    MUL_ST target;
    MUL_ST subset;

    target += ‘a’, ‘a’, ‘a’, ‘b’, ‘c’, ‘c’, ‘c’, ‘d’, ‘u’, ‘z’, ‘b’;
    target += ‘b’, ‘b’, ‘c’, ‘d’, ‘u’, ‘z’, ‘v’;
    subset += ‘b’, ‘b’, ‘c’, ‘d’, ‘u’, ‘v’, ‘v’;

    cout << "\n"; BOOST_FOREACH(char c, subset) cout << c << " "; cout << "\n";

    MS_I si = subset.begin(); MS_I ti = target.begin();

    while ( si != subset.end() && ti != target.end() ) {
    if ( *si *ti ) {
    ++ti;
    if ( ti == target.end() ) goto judgement;
    }
    if ( *si != *ti ) goto judgement;
    ++si; ++ti;
    }

    judgement:
    if ( si == subset.end() ) cout << "is a subset of" << "\n";
    else cout << "is NOT a subset of" << "\n";

    BOOST_FOREACH(char c, target) cout << c << " "; cout << "\n";

    return 0;
    }

  16. lojze said

    Language: Clojure
    (defn subset-of [s1 s2]
    (every? #(<= (count (filter #{%} s1)) (count (filter #{%} s2))) s1))

  17. Khanh Nguyen said

    In F#

    let rec remove_first (c:char) (l:char list) =
        match l with
        | [] -> []
        | x::xs -> if x = c then xs else x :: (remove_first c xs)
    
    let rec charlist_subsets (a:char list) (b:char list) = 
        match a, b with 
        | _, [] -> true
        | a, x::xs -> let t = remove_first x a 
                      not(t = a) && (charlist_subsets t xs)
    
    let string_subsets (a:string) (b:string) = 
        charlist_subsets (a.ToCharArray() |> List.ofArray) (b.ToCharArray() |> List.ofArray)
    
    string_subsets "ABCD" "DA"
    string_subsets "ABCD" "DAD"
    
    
  18. Guillaume said
    ; Language: scheme (Gambit)
    ; - do not sort any string,
    ; - count characters only on the target substring, because it is smaller,
    ; - stop walking the longer string as soon as possible (success or failure).
    
    (define (string-subset? s sub)
      (if (< (string-length s) (string-length sub))
          #f
          (let* ((tc (table-count sub)) ; count characters on the target substring
                 (n (table-length tc))  ; number of unique characters
                 (nfound 0))
            
            (let loop ((rest (string->list s)))
              
              (cond ((= n nfound) #t)  ; success -> stop walking the longer string
                    ((null? rest) #f)  ; failure -> stop walking the longer string
                    (else
                     (let* ((first (car rest))
                            (v (- (table-ref tc first) 1)))
                       (table-set! tc (- v 1))
                       (and (= v 0) ; done for this type of character
                            (set! nfound (+ nfound 1)))
                       (loop (cdr rest)))))))))
            
    (define (table-count s)
      (let ((ret (make-table init: 0)))
    
        (let loop ((rest (string->list s)))
    
          (if (null? rest)
              ret
              (let* ((first (car rest))
                     (next (cdr rest)))
                (table-set! ret first (+ (table-ref ret first) 1))
                (if (null? next)
                    ret
                    (loop next)))))))
    
    (string-subset? "ABCD" "")
    ; #t
    (string-subset? "ABCD" "DA")
    ; #t
    (string-subset? "ABCD" "DAD")
    ; #f
    
  19. Jebb said

    Three Python solutions, similar to the ones you proposed:

    #!/usr/bin/python2.7
    
    def is_subset(string, substring):
    # This runs in O(n2)
    	L = list(string)
    	for letter in substring:
    		if letter in L:
    			L.remove(letter)
    		else:
    			return False	
    	return True
    
    def is_subset_faster(string, substring):
    # This runs in the same order as the sorting function (O(n log n)?)
    	lstring = sorted(list(string))
    	lsubstring = sorted(list(substring))
    	i = 0
    	for letter in lsubstring:
    		if i == len(lstring):
    			return False
    		while letter > lstring[i]:
    			if i < len(lstring):
    				i += 1
    			else:
    				return False
    		if letter == lstring[i]:
    			i += 1
    		else:	#letter < lstring[i]: letter in substring not in string
    			return False
    	return True
    
    def is_subset_fastest(string, substring):
    # This runs in O(n)
    	sLetters = {}
    	subLetters = {}
    	for letter in string:
    		if letter not in sLetters:
    			sLetters[letter] = 1
    		else:
    			sLetters[letter] += 1
    	for letter in substring:
    		if letter not in subLetters:
    			subLetters[letter] = 1
    		else:
    			subLetters[letter] += 1
    	for letter in subLetters.keys():
    		print letter
    		if (letter not in sLetters.keys()) or (sLetters[letter] < subLetters[letter]):
    			return False
    	return True
    

    I was delighted to see that I came up with the same two O(n2) and O(nlogn) solutions that you proposed… too bad I didn’t see the O(n) solution (that one was a “D’oh” moment, really) before reading the comments. Guess I won’t quit my day job yet :-)

  20. Jonathan said

    A common lisp version…

    (defun string-subset (substring total-set)
      "Returns true if the substring only contains characters that are in total-set."
      (let ((n (length substring))
            (m (length substring))
            (freq (make-hash-table))
            (uniques 0))
        (and (<= n m)
          (progn
            (iter (for c in-vector substring)
              (if (gethash c freq)
                (incf (gethash c freq))
                (progn
                  (setf (gethash c freq) 1)
                  (incf uniques))))
            (iter (for c in-vector total-set)
                  (for count = (gethash c freq))
              (when count
                (cond
                  ((= count 1)
                    (progn
                      (decf (gethash c freq))
                      (decf uniques)
                      (when (zerop uniques)
                        (return-from string-subset t))))
                  ((> count 1)
                    (decf (gethash c freq)))))
              (finally (return-from string-subset nil)))))))
    
  21. Sasha B said

    A much shorter solution involves removing the word from the subset one letter at a time.

    def isSubset(word, subset):
        """
        Determines whether all of the characters in subset appear in word.
        The second string cannot be longer than the first string.
        >>> isSubset("ABCD", "DA")
        True
        >>> isSubset("ABCD", "DAD")
        False
        """
        assert len(subset) <= len(word), "The second string cannot be longer than the first string."
        # Go along the subset and try and remove corresponding letters from the word
        for letter in subset:
            if word.replace(letter, '') != word: # it was replaced correctly
                word = word.replace(letter, '') # remove the letter and continue
            else:
                return False # couldn't find the letter to replace, so its not there
        return True # all letters exist in the word
    
  22. This works out.

    def issubstring(src, strfind):
        if len(strfind) > len(src):
            return False
        iIndex = 0
        iLocation = 0
        for s in src:
            if iLocation == len(strfind):
                return True
            if s == strfind[iLocation]:
                iLocation+=1
            else:
                iLocation = 0
            iIndex+=1
         
        return iLocation == len(strfind) - 1
    

Leave a comment