Maximum K Items Out Of N

August 30, 2016

Today’s exercise is to write a program that, given n integers, outputs the list of k integers whose sum is maximum. That looks easy, but there is a catch: You must provide three solutions, one that runs in O(n log n) time, one that runs in O(n log k) time, and one that runs in O(n) time.

Your task is to write three programs as described above. When you are finished, you are welcome to read or run a suggested solution, or to post your own solution or discuss the exercise in the comments below.

Pages: 1 2

14 Responses to “Maximum K Items Out Of N”

  1. Rutger said

    In Python. Although bucketsort is not guaranteed linear time, perhaps I should have used that magic selection algorithm with the partitioning into 5 medians of medians.

    import heapq
    
    def onlogn(n, k):
    	return sorted(l)[-k:]
    
    
    def onlogk(n, k):
    	h = heapq.heapify(n)
    	return heapq.nlargest(k,n)
    
    
    def bucket_sort(l):
        buckets = []
        minimum = min(l) / 10
        maximum = max(l) / 10 + 1
        for i in range(minimum, maximum):
        	buckets.append([])
        for number in l:
            buckets[number / 10].append(number)
        for index, bucket in enumerate(buckets):
            buckets[index] = sorted(bucket)
        return [v for i in range(minimum, maximum) for v in buckets[i]]
    
    
    def on(n,k):
    	l = bucket_sort(n)
    	return l[-k:]
    
    
    
    l = [-1,4,-6,-19,12,7]
    k = 4
    
    print onlogn(l, k)
    print onlogk(l, k)
    print on(l,k)
    
  2. Paul said

    @Rutger. If you use heapq.nlargest, it is not necessary to heapify first. This only costs time and achieves nothing. Also, heapify does not return anything.

  3. My 2 cents, in Common Lisp :

    (defun max-k-items (k items)
    (subseq (sort (copy-list items) #’>) 0 k))

  4. Daniel said

    Here’s a solution in C++11.

    MaxItems1 sorts the list and gets the top k items. O(n log n).
    MaxItems2 uses a heap of k items, with the smallest at the head. O(n log k).
    MaxItems3 uses std::nth_element. I used C++ for this problem particularly for that function. However I realized in the documentation that it’s O(n) average case, not O(n) worst case.
    MaxItems4 uses an implementation of median of medians. I converted the pseudocode on Wikipedia to C++. This is O(n) worst case.

    #include <algorithm>
    #include <cmath>
    #include <functional>
    #include <iostream>
    #include <iterator>
    #include <queue>
    #include <string>
    #include <type_traits>
    #include <vector>
    
    using std::cout;
    using std::endl;
    using std::priority_queue;
    using std::string;
    using std::vector;
    
    // O(n log n)
    vector<int> MaxItems1(const vector<int>& integers, int k) {
      k = std::min(k, static_cast<int>(integers.size()));
      vector<int> sorted = integers;
      std::sort(sorted.begin(), sorted.end(), std::greater<int>());
      vector<int> output;
      for (int i = 0; i < k; ++i) output.push_back(sorted[i]);
      return output;
    }
    
    // O(n log k)
    vector<int> MaxItems2(const vector<int>& integers, int k) {
      k = std::min(k, static_cast<int>(integers.size()));
      std::priority_queue<int, std::vector<int>, std::greater<int>> pq;
      for (int i = 0; i < integers.size(); ++i) {
        int next = integers[i];
        if (pq.size() < k) {
          pq.push(next);
          continue;
        }
        int head = pq.top();
        if (next > head) {
          pq.pop();
          pq.push(next);
        }
      }
      vector<int> output(k);
      for (int i = k - 1; i >= 0; --i) {
        output[i] = pq.top();
        pq.pop();
      }
      return output;
    }
    
    // O(n) on average.
    vector<int> MaxItems3(const vector<int>& integers, int k) {
      k = std::min(k, static_cast<int>(integers.size()));
      vector<int> partially_sorted = integers;
      std::nth_element(partially_sorted.begin(), partially_sorted.begin() + k - 1,
                       partially_sorted.end(), std::greater<int>());
      vector<int> output;
      int kth = partially_sorted[k - 1];
      for (int integer : integers) if (integer > kth) output.push_back(integer);
      for (int i = output.size(); i < k; ++i) output.push_back(kth);
      return output;
    }
    
    // ************************************************************
    // * Median of Medians
    // ************************************************************
    
    // Code is a conversion to C++ of pseudocode from Wikipedia.
    // https://en.wikipedia.org/wiki/Median_of_medians#Algorithm
    
    // Indexing starts at 0. For smallest element, n=0.
    
    int partition(vector<int>* v, int left, int right, int pivot_index) {
      int pivot_value = (*v)[pivot_index];
      std::swap((*v)[pivot_index], (*v)[right]);
      int store_index = left;
      for (int i = left; i < right; ++i) {
        if ((*v)[i] < pivot_value) {
          std::swap((*v)[store_index], (*v)[i]);
          ++store_index;
        }
      }
      std::swap((*v)[right], (*v)[store_index]);
      return store_index;
    }
    
    // forward declaration for mutual recursion.
    int pivot(vector<int>* v, int left, int right);
    
    int select(vector<int>* v, int left, int right, int n) {
      while (true) {
        if (left == right) return left;
        int pivot_index = pivot(v, left, right);
        pivot_index = partition(v, left, right, pivot_index);
        if (n == pivot_index) {
          return n;
        } else if (n < pivot_index) {
          right = pivot_index - 1;
        } else {
          left = pivot_index + 1;
        }
      }
    }
    
    int partition5(vector<int>* v, int left, int right) {
      int n = right - left + 1;
      std::nth_element(
          v->begin() + left, v->begin() + left + n/2, v->begin() + right + 1);
      int median = (*v)[left + n/2];
      for (int i = left; i <= right; ++i) {
        if ((*v)[i] == median) return partition(v, left, right, i);
      }
      return -1; // error
    }
    
    int pivot(vector<int>* v, int left, int right) {
      if (right - left < 5) return partition5(v, left, right);
      for (int i = left; i <= right; i += 5) {
        int sub_right = i + 4;
        if (sub_right > right) sub_right = right;
        int median5 = partition5(v, i, sub_right);
        std::swap((*v)[median5], (*v)[left + std::floor((i-left)/5)]);
      }
      return select(v,
                    left,
                    left + std::ceil((right - left) / 5) - 1,
                    left + (right - left)/10);
    }
    
    // ************************************************************
    
    // O(n) worst-case.
    vector<int> MaxItems4(const vector<int>& integers, int k) {
      k = std::min(k, static_cast<int>(integers.size()));
      vector<int> selected(integers);
      int length = selected.size();
      select(&selected, 0, length - 1, length - k);
      vector<int> output;
      int kth = selected[length - k];
      for (int integer : integers) if (integer > kth) output.push_back(integer);
      for (int i = output.size(); i < k; ++i) output.push_back(kth);
      return output;
    }
    
    string VectorToString(const vector<int>& v) {
      string output;
      output += "[";
      for (int i = 0; i < v.size(); ++i) {
        if (i > 0) output += ", ";
        output += std::to_string(v[i]);
      }
      output += "]";
      return output;
    }
    
    int main(void) {
      std::vector<int> integers{19, 81, 56, 59, 27, 67, 54, 96, 97, 8, 96, 19};
      int k = 4;
    
      cout << "Integers:" << endl;
      cout << VectorToString(integers) << endl << endl;
    
      cout << "k=" << k << endl << endl;
    
      vector<int> max_items1 = MaxItems1(integers, k);
      cout << "MaxItems1:" << endl;
      cout << VectorToString(max_items1) << endl << endl;
    
      vector<int> max_items2 = MaxItems2(integers, k);
      cout << "MaxItems2:" << endl;
      cout << VectorToString(max_items2) << endl << endl;
    
      vector<int> max_items3 = MaxItems3(integers, k);
      cout << "MaxItems3:" << endl;
      cout << VectorToString(max_items3) << endl << endl;
    
      vector<int> max_items4 = MaxItems4(integers, k);
      cout << "MaxItems4:" << endl;
      cout << VectorToString(max_items4) << endl << endl;
    
      return 0;
    }
    

    Output:

    Integers:
    [19, 81, 56, 59, 27, 67, 54, 96, 97, 8, 96, 19]
    
    k=4
    
    MaxItems1:
    [97, 96, 96, 81]
    
    MaxItems2:
    [97, 96, 96, 81]
    
    MaxItems3:
    [96, 97, 96, 81]
    
    MaxItems4:
    [96, 97, 96, 81]
    
  5. Daniel said

    MaxItems4 in my last post worked and was O(n), but was doing more work than necessary.

    The updated version below of the partition5 function (used for the median of medians algorithm) fixes the issue.

    int partition5(vector<int>* v, int left, int right) {
      int n = right - left + 1;
      std::nth_element(
          v->begin() + left, v->begin() + left + n/2, v->begin() + right + 1);
      return partition(v, left, right, left + n/2);
    }
    
  6. Paul said

    I do not think, that a solution based on a heap is nlogk. The time complexity for heapify is O(n) (see Introduction to Algorithms of Cormen et al.) I created a solution using a heap of k items and a solution using a full heap of all data. I could not see a significant difference between these two. They were also giving the same timings as heapq.nlargest. All timings were consistent with O(n). All this in Python. Do other languages behave differently?

  7. Daniel said

    Hi Paul. I used a k-element heap for my second solution (MaxItems2). I think the approach is O(n log k) because it 1) loops over the n integers, then for each iteration of the loop it 2) checks the minimum value in the heap and possibly 3a) removes the head of the heap and possibly 3b) inserts a new value into the heap. 1 is O(n), 2 is O(1), 3a is O(log k), and 3b is O(log k), making the entire algorithm O(n log k).

    > “The time complexity for heapify is O(n)”

    Can you elaborate on this comment? I believe you’re referring to an optimal way of building a heap that’s an alternative to successive insertions. For a k-element heap, the optimal approach would be O(k) and the approach using successive insertions would be O(k log k).

    I used successive insertions, so I believe the relevant step to consider is reheapify. Both reheapification upward for insert and reheapification downward for remove are O(log k). For example, in a k-element heap, the height of the tree is log(k). Then for reheapify upward, a new leaf node is swapped with a parent iteratively until it’s in the correct place, which is O(log k). This corresponds to steps 3a and 3b that I described above.

  8. programmingpraxis said

    Paul: Heapify is indeed O(n), but you’re not performing heapify on a heap of size n. The correct analysis is that there are n heap operations, each taking log k time, so a total time of O(n log k).

    Daniel: Simple analysis of heapify shows n heap operations that each take log n, for a total time complexity of O(n log n). But that bound is not tight. I’m looking at CLRS3e 6.3, but any algorithms text will give a similar analysis. The half of the nodes that are leaf nodes are themselves trivial heaps, and take no time at all. Building the heap above them considers the height h of the half-heap, each heap operation takes O(log h), and there are n / 2 ** (h+1) heap operations, which works out to O(n) if you do the math.

  9. Paul said

    @PP and @ Daniel. I posted some code and timings on ideone. I understand all arguments that you both gave, but the timings do not show the logk dependence really.

  10. Daniel said

    Hi programmingpraxis. The “successive insertions” method I was referring to is the same as Williams’ method described here: https://en.wikipedia.org/wiki/Binary_heap#Building_a_heap

    Here’s another solution to the problem. It builds a heap with all n elements, which is O(n), and then gets the top k elements from the heap, which is O(k log n).

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>
    
    using std::cout;
    using std::endl;
    using std::string;
    using std::vector;
    
    // O(n + k log n)
    vector<int> MaxItems5(const vector<int>& integers, int k) {
      k = std::min(k, static_cast<int>(integers.size()));
      vector<int> heap(integers);
      std::make_heap(heap.begin(), heap.end());
      vector<int> output;
      for (int i = 0; i < k; ++i) {
        output.push_back(heap.front());
        std::pop_heap(heap.begin(), heap.end());
        heap.pop_back();
      }
      return output;
    }
    
    string VectorToString(const vector<int>& v) {
      string output;
      output += "[";
      for (int i = 0; i < v.size(); ++i) {
        if (i > 0) output += ", ";
        output += std::to_string(v[i]);
      }
      output += "]";
      return output;
    }
    
    int main(void) {
      std::vector<int> integers{19, 81, 56, 59, 27, 67, 54, 96, 97, 8, 96, 99};
      int k = 4;
    
      cout << "Integers:" << endl;
      cout << VectorToString(integers) << endl << endl;
    
      cout << "k=" << k << endl << endl;
    
      vector<int> max_items5 = MaxItems5(integers, k);
      cout << "MaxItems5:" << endl;
      cout << VectorToString(max_items5) << endl << endl;
    
      return 0;
    }
    

    Output:

    Integers:
    [19, 81, 56, 59, 27, 67, 54, 96, 97, 8, 96, 99]
    
    k=4
    
    MaxItems5:
    [99, 97, 96, 96]
    
  11. Daniel said

    Hi Paul, I believe the dependence becomes clearer after changing n to 6, and also looping through a wider range of k.
    https://ideone.com/kqHpJX

    That won’t run due to a time limit being exceeded. The only change is that n was increased from 5 to 6, and k loops over (1, 10, 100, 1000, 10000, 100000, 1000000).

  12. Daniel said

    > “I understand all arguments that you both gave, but the timings do not show the logk dependence really.”
    – Paul @ September 1, 2016 at 3:11 PM

    In my last post, I was using “dependence” in reference to your quote above.

  13. Daniel said

    Please ignore my last two posts. I’m not sure I understand the issue.

  14. Daniel said

    Hi Paul,

    I wrote earlier that my heap-based solution, MaxItems2, was O(n log k). My implementation was doing more work than necessary, as it was emptying the priority queue in sorted order, which required k iterations, each O(log k). I fixed that below, as the problem does not specify a specific ordering of the output. The array as-ordered underlying the heap is now used as the output.

    > “Do other languages behave differently?”
    – Paul @ September 1, 2016 at 1:18 PM

    The following code sets n to 16,777,216. It then calls MaxItems2 for varying values of k, and reports the elapsed clock ticks. Compiler optimizations were disabled. With n fixed, the complexity is O(log k).

    #include <algorithm>
    #include <cstdio>
    #include <cstdlib>
    #include <ctime>
    #include <functional>
    #include <vector>
    
    using std::vector;
    
    // O(n log k)
    vector<int> MaxItems2(const vector<int>& integers, int k) {
      k = std::min(k, static_cast<int>(integers.size()));
      std::greater<int> comp;
      vector<int> output;
      for (int i = 0; i < integers.size(); ++i) {
        int next = integers[i];
        if (output.size() < k) {
          output.push_back(next);
          std::push_heap(output.begin(), output.end(), comp);
          continue;
        }
        int head = output[0];
        if (next > head) {
          std::pop_heap(output.begin(), output.end(), comp);
          output.pop_back();
          output.push_back(next);
          std::push_heap(output.begin(), output.end(), comp);
        }
      }
      return output;
    }
    
    int main(void) {
      std::vector<int> integers;
      int n = 16777216;
      for (int i = 0; i < n; ++i) {
        integers.push_back(rand());
      }
      int iterations = 10;
      for (int k = 1; k <= n; k = k * 2) {
        long total_clock = 0;
        for (int i = 0; i < iterations; ++i) {
          long tic = clock();
          MaxItems2(integers, k);
          long toc = clock();
          total_clock += toc - tic;
        }
        printf("k=%d, %.0f\n", k, static_cast<float>(total_clock)/iterations);
      }
      return 0;
    }
    

    Output:

    k=1, 117345
    k=2, 120307
    k=4, 121310
    k=8, 117755
    k=16, 129240
    k=32, 139834
    k=64, 119326
    k=128, 117839
    k=256, 123654
    k=512, 119142
    k=1024, 123452
    k=2048, 127856
    k=4096, 137765
    k=8192, 152986
    k=16384, 181573
    k=32768, 236961
    k=65536, 348400
    k=131072, 558266
    k=262144, 1016972
    k=524288, 1645668
    k=1048576, 2704670
    k=2097152, 4399897
    k=4194304, 6340358
    k=8388608, 7356941
    k=16777216, 1984617
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: