Sum Distinct Items

August 20, 2019

There are two obvious answers to this task, one using sorting and the other using searching, though to many people one of the answers will be obvious and the other will be obscure. Sorting first:

(define (sum-distinct xs)
  (let loop ((xs (sort < xs)) (sum 0))
    (cond ((null? xs) sum)
          ((and (pair? (cdr xs)) (= (car xs) (cadr xs)))
            (loop (drop-while (lambda (x) (= x (car xs))) xs) sum))
          (else (loop (cdr xs) (+ sum (car xs)))))))

> (sum-distinct '(4 2 3 1 7 4 2 7 1 7 5))
8

Searching seems more natural to me, though your taste may vary. We use lists to track once and more, but if the input is large it would be faster to use hash tables or some sort of balanced tree:

(define (sum-distinct xs)
  (let loop ((xs xs) (once (list)) (more (list)) (sum 0))
    (cond ((null? xs) sum)
          ((member (car xs) more)
            (loop (cdr xs) once more sum))
          ((member (car xs) once)
            (loop (cdr xs) once (cons (car xs) more) (- sum (car xs))))
          (else (loop (cdr xs) (cons (car xs) once) more (+ sum (car xs)))))))

> (sum-distinct '(4 2 3 1 7 4 2 7 1 7 5))
8

Every integer is added to the sum the first time it is seen. If it is seen a second time, it is subtracted from the sum (since it was improperly added the first time). Additional appearances after the second are ignored.

Note the time-for-space trade-off here. The sorting solution is O(n log n). The searching solution is O(n), assuming we use hash tables with O(1) search-time to store the once and more sets, but requires space to store the sets.

This makes an interesting interview question. Watch the candidate subconsciously select either the sorting solution or the searching solution. Challenge the candidate to find the other solution. Then have the candidate talk about the trade-offs between the two solutions.

You can run the program at https://ideone.com/8AKBAY.

Advertisements

Pages: 1 2

13 Responses to “Sum Distinct Items”

  1. Paul said

    In Python it is easy using a Counter.

    from collections import Counter
    
    seq = [4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5]
    c = Counter(seq)
    print(sum(key for key, value in c.items() if value==1))
    
  2. chaw said

    Here is a solution in standard R7RS Scheme with hash-tables of SRFI
    69. It runs in linear time (modulo usual hashing caveats).

    (import (scheme base)
            (scheme write)
            (only (srfi 69)
                  make-hash-table hash-table-update!/default hash-table-values))
    
    (define (sum-distinct-items nums)
      (let ((ht (make-hash-table =)))
        (for-each (lambda (n)
                    (hash-table-update!/default ht
                                                n
                                                (lambda (v) (if v 0 n))
                                                #f))
                  nums)
        (apply + (hash-table-values ht))))
    
    (display (sum-distinct-items '(4 2 3 1 7 4 2 7 1 7 5)))
    (newline)
    

    Output:

    8
    

  3. Klong version

    This gave me a change to work through the problem one step at a time.  Thanks!
            =[4 2 3 1 7 4 2 7 1 7 5]
    [[0 5] [1 6] [2] [3 8] [4 7 9] [10]]
            {2>#x}'=[4 2 3 1 7 4 2 7 1 7 5]
    [0 0 1 0 0 1]
            &{2>#x}'=[4 2 3 1 7 4 2 7 1 7 5]
    [2 5]
            l@&{2>#x}'=l::[4 2 3 1 7 4 2 7 1 7 5]
    [3 4]
            l
    [4 2 3 1 7 4 2 7 1 7 5]
            l@&{2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    [3 4]
            {2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    [0 0 1 0 0 1]
            &{2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    [2 5]
            l2@&{2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    [[2] [10]]
            l@l2@&{2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    [[3] [5]]
            l2@&{2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    [[2] [10]]
            ,/l2@&{2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    [2 10]
            l@,/l2@&{2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    [3 5]
            +/l@,/l2@&{2>#x}'l2::=l::[4 2 3 1 7 4 2 7 1 7 5]
    8
    
  4. Bill Wood said

    Rust generic version, no hash table required:

    /* Author Bill Wood
       Generic function count_n, finds sum of values that appear "n" times
       */
    
    fn count_n<T: Ord + std::ops::Add + num::Zero + Copy>(x: &[T], n: usize) -> T {
        let mut x = x.to_vec();
        x.sort();
        let mut sum = T::zero();
        let mut count = 1;
        for i in 0..x.len() {
            if i + 1 == x.len() || x[i] != x[i + 1] {
                if count == n {
                    sum = sum + x[i];
                }
                count = 1;
            } else {
                count += 1;
            }
        }
        sum
    }
    
    fn main() {
        let x = vec!(4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5, );
        println!("{}", count_n(&x, 1));
    }
    

    Playground: https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=ace1791e4d6532b0bb39c4d5f4b3eb99

  5. Bill Wood said

    Rust generic version, no hash table required:

    /* Author Bill Wood
       Generic function count_n, finds sum of values that appear "n" times
       */
    
    fn count_n<T: Ord + std::ops::Add + num::Zero + Copy>(x: &[T], n: usize) -> T {
        let mut x = x.to_vec();
        x.sort();
        let mut sum = T::zero();
        let mut count = 1;
        for i in 0..x.len() {
            if i + 1 == x.len() || x[i] != x[i + 1] {
                if count == n {
                    sum = sum + x[i];
                }
                count = 1;
            } else {
                count += 1;
            }
        }
        sum
    }
    
    fn main() {
        let x = vec!(4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5, );
        println!("{}", count_n(&x, 1));
    }
    
  6. Bill said

    Rust generic version, no hash table required:

    /* Author Bill Wood
       Generic function count_n, finds sum of values that appear "n" times
       */
    
    fn count_n<T: Ord + std::ops::Add + num::Zero + Copy>(x: &[T], n: usize) -> T {
        let mut x = x.to_vec();
        x.sort();
        let mut sum = T::zero();
        let mut count = 1;
        for i in 0..x.len() {
            if i + 1 == x.len() || x[i] != x[i + 1] {
                if count == n {
                    sum = sum + x[i];
                }
                count = 1;
            } else {
                count += 1;
            }
        }
        sum
    }
    
    fn main() {
        let x = vec!(4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5, );
        println!("{}", count_n(&x, 1));
    }
    

    Playground: https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=ace1791e4d6532b0bb39c4d5f4b3eb99

  7. Mark Henderson said

    Paul beat me to it:

    from collections import Counter
    d = Counter([4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5])
    print(sum(n for n in d if d[n]==1))
    
  8. Globules said

    Here’s a Haskell version. We’ll say the empty list sums to 0.

    import Data.List (group, sort)
    
    sumDistinct :: (Num a, Ord a) => [a] -> a
    sumDistinct = sum . concat . filter ((== 1) . length) . group . sort
    
    main :: IO ()
    main = do
      print $ sumDistinct []
      print $ sumDistinct [4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5]
    
    $ ./sumdistinct
    0
    8
    
  9. matthew said

    Here’s some C++, using a hash table (‘unordered_map’) to store element counts, and a sorting solution, both use the “increment on first occurrence, decrement on second, subsequently ignore” approach, though for the sorting solution it’s probably better to just count the elements, as in Bill’s solution above.

    Interestingly, for a million items, the sorting method is significantly faster than the hash table method (about 160ms, vs. 500ms or so).

    #include <iostream>
    #include <unordered_map>
    #include <vector>
    #include <random>
    #include <algorithm>
    
    std::vector<int> randvector(int l) {
      // Generate n random values in range 1..n
      std::default_random_engine e1; // default seed
      std::uniform_int_distribution<int> uniform_dist(1,l);
      std::vector<int> a(l);
      for (auto &n: a) n = uniform_dist(e1);
      return a;
    }
    
    long analyse1(const std::vector<int> &a) {
      // Scan, collecting item counts
      std::unordered_map<int,int> counts;
      long sum = 0;
      for (auto n : a) {
        // First occurrence increments sum,
        // second occurrent decrement sum.
        int c = counts[n]++;
        if (c == 0) sum += n;
        else if (c == 1) sum -= n;
      }
      return sum;
    }
    
    long analyse2(std::vector<int> a) {
      // Sort, then scan for repeated values.
      std::sort(a.begin(),a.end());
      long sum = 0;
      int count = 0, last = 0;
      for (auto n : a) {
        // Reset count on value change
        // Initial value of 'last' is immaterial
        if (n != last) count = 0;
        // As above, first occurrence increments, second decrements
        if (count == 0) sum += n;
        else if (count == 1) sum -= n;
        count++; last = n;
      }
      return sum;
    }
    
    int main() {
      std::vector<int> a{ 4,2,3,1,7,4,2,7,1,7,5 };
      //std::vector<int> a(randvector(1000000));
      std::cout << analyse1(a) << "\n";
      std::cout << analyse2(a) << "\n";
    }
    
  10. Zack said

    Here is my take on this, without using any auxiliary functions from packages, in Julia 1.1:

    function main(x::Array{Int64, 1})
    s = 0
    n = length(x)

    for i = 1:n
        z = x[i]
        if !((z in x[1:(i-1)])|(z in x[i+1:n])); s += z; end
    end
    
    return s
    

    end

    Cheers!

  11. Daniel said

    Here’s a solution in C.

    #include <stdio.h>
    #include <stdlib.h>
    
    long sum_distinct(int* array, int n) {
      int nb = n * 10 / 7;  // Number of buckets. Max load factor 0.7.
      int* table = malloc(sizeof(table[0]) * nb);
      int* count = calloc(nb, sizeof(count[0]));
      for (int i = 0; i < n; ++i) {
        int value = array[i];
        int hash = value % n;
        int j = hash;
        while (count[j] != 0 && table[j] != value)
          j = (j + 1) % n;
        table[j] = value;
        ++(count[j]);
      }
      long sum = 0;
      for (int i = 0; i < nb; ++i) {
        if (count[i] == 1) sum += table[i];
      }
      free(table);
      free(count);
      return sum;
    }
    
    int main(void) {
      int array[] = {4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5};
      int n = sizeof(array) / sizeof(array[0]);
      long sum = sum_distinct(array, n);
      printf("%ld\n", sum);
      return EXIT_SUCCESS;
    }
    

    Output:

    8
    
  12. Daniel said

    My last solution should work, but the load factor of the hash table could reach 1. This was from mistakenly using the wrong variable on lines 10 and 13. Here’s the fixed solution, which limits the load factor to 0.7.

    #include <stdio.h>
    #include <stdlib.h>
    
    long sum_distinct(int* array, int n) {
      int nb = n * 10 / 7;  // Number of buckets. Max load factor 0.7.
      int* table = malloc(sizeof(table[0]) * nb);
      int* count = calloc(nb, sizeof(count[0]));
      for (int i = 0; i < n; ++i) {
        int value = array[i];
        int hash = value % nb;
        int j = hash;
        while (count[j] != 0 && table[j] != value)
          j = (j + 1) % nb;
        table[j] = value;
        ++(count[j]);
      }
      long sum = 0;
      for (int i = 0; i < nb; ++i) {
        if (count[i] == 1) sum += table[i];
      }
      free(table);
      free(count);
      return sum;
    }
    
    int main(void) {
      int array[] = {4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5};
      int n = sizeof(array) / sizeof(array[0]);
      long sum = sum_distinct(array, n);
      printf("%ld\n", sum);
      return EXIT_SUCCESS;
    }
    
  13. Andy said

    Java version:

        Stream.of(4, 2, 3, 1, 7, 4, 2, 7, 1, 7, 5)
                .collect(Collectors.groupingBy(i -> i, Collectors.counting()))
                .entrySet().stream().filter(e -> e.getValue() == 1)
                .mapToInt(e -> e.getKey()).sum();
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: