Finding The Median

September 11, 2015

Since there are only 256 possible values in the array, the easiest way to solve this exercise is to create an array of 256 counters, all initially zero, index through the input array incrementing the value that corresponds to the current element; when the input is exhausted two pointers approach from opposite ends of the counts array, accumulating until the counts equal or exceed half the length of the array, when the median is calculated:

(define (median xs)
  (let* ((counts (make-vector 256 0))
         (len (do ((xs xs (cdr xs)) (len 0 (+ len 1))) ((null? xs) len)
                (vector-set! counts (car xs) (+ (vector-ref counts (car xs)) 1)))))
    (let loop ((lo 0) (lo-count 0) (hi 255) (hi-count 0))
      (cond ((< lo-count (/ len 2)) (loop (+ lo 1) (+ lo-count (vector-ref counts lo)) hi hi-count))
            ((< hi-count (/ len 2)) (loop lo lo-count (- hi 1) (+ hi-count (vector-ref counts hi))))
            (else (/ (+ lo hi) 2))))))

That’s clearly linear time, as it touches each item in the input once, and constant space, since the size of the counts array is independent of the input. Here are some examples:

> (median '(2 4 5 7 3 6 1))
4
> (median '(5 2 1 6 3 4))
7/2

If you don’t mind scrambling the input, another algorithm that works in linear time and constant space uses a “fat pivot” quicksort, recurring only on the partition that contains the median index.

You can run the program at http://ideone.com/paTvbD.

Advertisements

Pages: 1 2

10 Responses to “Finding The Median”

  1. mcmillhj said

    Similar solution in Perl:

    #!/usr/bin/env perl
    
    use strict;
    use warnings;
    
    use feature qw/say/;
    
    sub median {
       my @array = @_;
       my @freqs = (0) x 256;
    
       foreach my $elem ( @array ) {
          $freqs[$elem]++;
       }
    
       my ($low, $high) = (0, 255);
       my ($low_count, $high_count) = (0, 0);
       my $len = @array;
       while ( 1 ) {
          if ( $low_count < ($len / 2) ) {
             $low_count += $freqs[$low++];
          }
          elsif ( $high_count < ($len / 2) ) {
             $high_count += $freqs[$high--];
          }
          else {
             last;
          }
       }
       
       return ($low + $high) / 2;
    }
    
    say median(qw(2 4 5 7 3 6 1)); # 4
    say median(qw(5 2 1 6 3 4));   # 3.5
    
  2. graham said

    Clever! My knee-jerk reaction was to ignore the 8-bit integer part and just go for the slower generic solution.

    I’ve used this as an opportunity to learn some Rust. I like that it made me think about the empty array case.

    fn median(v : &Vec<i8>) -> Option<f32> {
        let len = v.len();
        match len {
            0 => None,
            _ => {
                let mut counts = vec![0; 256];
                for x in v {
                    counts[*x as usize] += 1;
                }
                let mut lo = 0;   let mut lo_count = 0;
                let mut hi = 255; let mut hi_count = 0;
                loop {
                    if lo_count < (len / 2) {
                        lo += 1; lo_count += counts[lo];
                    } else if hi_count < (len / 2) {
                        hi -= 1; hi_count += counts[hi];
                    } else {
                        break;
                    }
                }
                Some((lo as f32 + hi as f32) / 2.0)
            },
        }
    }
    
    fn main() {
        for v in vec![vec![],
                      vec![2, 4, 5, 7, 3, 6, 1],
                      vec![5, 2, 1, 6, 3, 4]] {
            println!("median({:?}) = {:?}", &v, median(&v));
        }
    }
    
  3. Mike said

    Python solution.

    def median(lst):
        m = len(lst)
        if m:
            m /= 2
            count = [0]*256
            for x in lst:
                count[x] += 1
                
            lo, lo_count = 0, 0
            while lo_count < m:
                    lo_count += count[lo]
                    lo += 1
                    
            hi, hi_count = 255, 0
            while hi_count < m:
                    hi_count +=  count[hi]
                    hi -= 1
                    
            return (lo + hi) / 2
    

    @graham, I think you have an “off by one error”. In the loop ‘lo’ is incremented before use, so the value of count[0] is never added to lo_count. Similarly, ‘hi’ is pre-decremented, so count[255] is never added to hi_count.

  4. Jussi Piitulainen said
    ;;; Praxis exercise: median of given bytes in O(n) time, O(1) space.                                             
    ;;; Plan: binary search for the middle points in the cumulative                                                  
    ;;; frequency distribution in O(n) + O(1) + 2 * O(log n) = O(n) time.                                            
    ;;; To do: the conditions to extract quartiles, minimum, and maximum.                                            
    
    (define (distribution bytes) ; new cumulative vector of counts                                                   
      (let ((u (make-vector 256 0))
            (n (vector-length bytes)))
        (do ((k 0 (+ k 1)))
            ((= k n) (do ((p 0 (+ p 1))
                          (k 1 (+ k 1)))
                         ((>= k 256) u)
                       (vector-set! u k (+ (vector-ref u p)
                                           (vector-ref u k)))))
          (let ((o (vector-ref bytes k)))
    	(vector-set! u o (+ 1 (vector-ref u o)))))))
    
    (define (index u lo? hi?)
      ;; an index k of vector u for which neither (lo? u k) nor (hi? u k)                                            
      (let seek ((b 0) (e (vector-length u))) ; b <= e                                                               
        (if (= b e) (raise 'no-such-index))
        (let ((m (quotient (+ b e) 2))) ; b <= m < e; b < m + 1 <= e                                                 
          (cond
           ((lo? u m) (seek (+ m 1) e))   ; e - (m + 1) < e - b                                                      
           ((hi? u m) (seek b m))         ; m - b < e - b                                                            
           (else m)))))
    
    ;;; The conditions were tricky to work out, so there are auxiliaries.                                            
    
    (define (half u) ; half of the number of observations                                                            
      (/ (vector-ref u 255) 2))
    
    (define (below k u) ; how many observations are strictly below k                                                 
      (if (= k 0) 0 (at-most (- k 1) u)))
    
    (define (at-most k u) ; how many observations are at most k                                                      
      (vector-ref u k))
    
    (define (at-least k u) ; how many observations are at least k                                                    
      (if (= k 0) (vector-ref u 255) (above (- k 1) u)))
    
    (define (above k u) ; how many observations are strictly above k                                                 
      (- (vector-ref u 255) (vector-ref u k)))
    
    ;;; "sub-median" is the lower-ranking candidate when there are two;                                              
    ;;; "super-median" is the higher-ranking candidate; the following                                                
    ;;; predicates test whether an index is too low or too high to be one                                            
    ;;; of these in a given cumulative byte frequency distribution.                                                  
    
    (define (sub-median-lo? u k)
      ;; whether less than half are at most k                                                                        
      (< (at-most k u) (half u)))
    
    (define (sub-median-hi? u k)
      ;; whether at least half are below and at most half are above k                                                
      (<= (above k u) (half u) (below k u)))
    
    (define (super-median-lo? u k)
      ;; whether at most half are below and at least half are above k                                                
      (<= (below k u) (half u) (above k u))) ;                                                                       
    
    (define (super-median-hi? u k)
      ;; whether less than half are at least k                                                                       
      (< (at-least k u) (half u)))
    
    ;;; --- elision of tests used to get the conditions right ---
    
    (define (median observations)
      (let* ((u (distribution observations))
             (b (index u sub-median-lo? sub-median-hi?))
             (p (index u super-median-lo? super-median-hi?)))
        (values b p (/ (+ b p) 2))))
    
    ;;; Simpler test output *after* getting the conditions right.                                                    
    
    (for-each
     (lambda (data)
       (display data) (display " => ")
       (call-with-values
           (lambda ()
             (median data))
         (lambda (b p m)
           (display b) (display " ") (display p)
           (display " => ") (display m) (newline))))
     '( #(3)
        #(3 3)
        #(3 5)
        #(1 3 5)
        #(1 3 3 5)
        #(1 3 5 5)
        #(3 3 3 5 7)
        #(3 3 5 7 11)
        #(1 3 3 3 3 3)
        #(1 3 3 5 5 5)
        #(1 3 3 3 5 7)
        #(1 3 3 5 7 11)
        #(2 4 5 7 3 6 1)
        #(5 2 1 6 3 4) ))
     
  5. Jussi Piitulainen said
    ;;; Oops. My binary searches are O(1) time, better than O(log n). Even a linear search
    ;;; of a constant-size array (256 elements) would be O(1). Silly me.
    
  6. graham said

    @Mike: good eye! Sorry for the errors, all.

  7. […] Programming Praxis problem. The problem is nicely constrained by limiting it to 8-bit […]

  8. graham said

    It looks like my solution (along with the issue @Mike caught) doesn’t correctly handle the case of a single element array. Apologies!

  9. shafi47 said

    #!/usr/bin/perl -w
    use strict;

    my @array = qw/1 2 3 4 9 10 11/;

    print median(@array);

    sub median
    {
    my @vals = sort {$a $b} @_;
    my $len = @vals;
    if($len%2) #odd?
    {
    return $vals[int($len/2)];
    }
    else #even
    {
    return ($vals[int($len/2)-1] + $vals[int($len/2)])/2;
    }
    }

  10. Mayur Lokare said

    #include
    int main()
    {
    int a[10],i,j,n;
    printf(“Enter the no of events\n”);
    scanf(“%d”,&n);
    printf(“Input\n”);
    for(i=0;i<n;i++)
    scanf("%d",a+i);
    for(i=1;i<n;i++)
    for(j=0;ja[i])
    a[j] = a[j]*a[i]/(a[i]=a[j]);

    printf(“Output\n”);
    for(i=0;i %d”,a[n/2]);
    else
    printf(“Median is -> %d”,(a[n/2]+a[n/2 -1])/2);

    }

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: