Statistics

September 27, 2011

These are all straight forward:

(define (mean xs) (/ (sum xs) (length xs)))

(define (std-dev xs)
  (let* ((n (length xs)) (x-bar (/ (sum xs) n)))
    (define (diff x) (- x x-bar))
    (sqrt (/ (sum (map square (map diff xs))) (- n 1)))))

(define (linear-regression xs ys)
  (let* ((n (length xs))
         (x (sum xs)) (y (sum ys))
         (xx (sum (map square xs)))
         (xy (sum (map * xs ys)))
         (yy (sum (map square ys)))
         (slope (/ (- (* n xy) (* x y)) (- (* n xx) (* x x))))
         (intercept (- (/ y n) (* slope (/ n) x))))
    (values slope intercept)))

(define (correlation xs ys)
  (let* ((n (length xs)) (x-bar (mean xs)) (y-bar (mean ys)))
    (define (x-diff x) (- x x-bar)) (define (y-diff y) (- y y-bar))
    (/ (sum (map * (map x-diff xs) (map y-diff ys)))
       (- n 1) (std-dev xs) (std-dev ys))))

We used sum and square from the Standard Prelude. Here is a simple example:

> (define xs '(1.47 1.50 1.52 1.55 1.57 1.60 1.63 1.65 1.68 1.70 1.73 1.75 1.78 1.80 1.83))
> (define ys '(52.21 53.12 54.48 55.84 57.20 58.57 59.93 61.29 63.11 64.47 66.28 68.10 69.92 72.19 74.46))
> (mean xs)
1.6506666666666665
> (mean ys)
62.078
> (std-dev xs)
0.11423451233985206
> (std-dev ys)
7.037514983490772
> (linear-regression xs ys)
61.272186542107434
-39.06195591883866
> (correlation xs ys)
0.9945837935768895

In 1973 statistician F. J. Anscombe constructed four datasets that have remarkably different shapes, shown at right, but that share common mean x=9 and y=7.5, standard deviation x=3.32 and y=2.03, slope 0.5, intercept 3.0 and correlation 0.816:

> (define xs-1 '(10 8 13 9 11 14 6 4 12 7 5))
> (define ys-1 '(8.04 6.95 7.58 8.81 8.33 9.96 7.24 4.26 10.84 4.82 5.68))
> (define xs-2 '(10 8 13 9 11 14 6 4 12 7 5))
> (define ys-2 '(9.14 8.14 8.74 8.77 9.26 8.10 6.13 3.10 9.13 7.26 4.74))
> (define xs-3 '(10 8 13 9 11 14 6 4 12 7 5))
> (define ys-3 '(7.46 6.77 12.74 7.11 7.81 8.84 6.08 5.39 8.15 6.42 5.73))
> (define xs-4 '(8 8 8 8 8 8 8 19 8 8 8))
> (define ys-4 '(6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.50 5.56 7.91 6.89))

You can run the program at http://programmingpraxis.codepad.org/8WJpBVc9.

About these ads

Pages: 1 2

8 Responses to “Statistics”

  1. DGel said

    The implementation of your standard deviation (and thus correlation) is wrong, given the definitions on page 1. Your definition says to divide by N, you divide by N – 1…

    My implementation in Go:


    package main
    
    import (
    	"fmt"
    	"math"
    )
    
    func mean(data []float64) float64 {
    	var sum float64
    	for _, x := range data {sum += x}
    	return sum / float64(len(data))
    }
    
    func sd(data []float64) float64 {
    	mn := mean(data)
    	var sd float64
    	for _, x := range data {diff := x - mn; sd += diff * diff}
    	return math.Sqrt(sd / float64(len(data)))
    }
    
    func regress(xs, ys []float64) (slope, intercept float64) {
    	var sum_xs, sum_xs2, sum_ys, sum_xs_ys, n float64
    	for i, _ := range xs {
    		sum_xs += xs[i]
    		sum_xs2 += xs[i] * xs[i]
    		sum_ys += ys[i]
    		sum_xs_ys += xs[i]*ys[i]
    	}
    	n = float64(len(xs))
    	slope = (n*sum_xs_ys - sum_xs*sum_ys) / (n*sum_xs2 - sum_xs*sum_xs)
    	intercept = sum_ys/n - slope*sum_xs/n
    	return
    }
    
    func correlation(xs, ys []float64) float64 {
    	x_bar := mean(xs)
    	y_bar := mean(ys)
    	var cor float64
    	for i, _ := range xs {cor += (xs[i] - x_bar) * (ys[i] - y_bar)}
    	return cor / (float64(len(xs) - 1)*sd(xs)*sd(ys))
    }
    
    func main() {
    	xs := []float64{1.47, 1.50, 1.52, 1.55, 1.57, 1.60, 1.63, 1.65, 1.68, 1.70, 1.73, 1.75, 1.78, 1.80, 1.83}
    	ys := []float64{52.21, 53.12, 54.48, 55.84, 57.20, 58.57, 59.93, 61.29, 63.11, 64.47, 66.28, 68.10, 69.92, 72.19, 74.46}
    	fmt.Println(mean(xs), mean(ys))
    	fmt.Println(sd(xs), sd(ys))
    	fmt.Println(regress(xs, ys))
    	fmt.Println(correlation(xs, ys))
    	return
    }
    

  2. Jussi Piitulainen said

    I think I was taught to divide by n – 1 when the deviation from the (unknown) population mean is wanted but the (known) sample mean is used instead in the formula. The sample values are said to lose one “degree of freedom” because they can not all deviate freely from their own mean.

  3. Paul Hofstra said

    See: http://en.wikipedia.org/wiki/Standard_deviation
    If you divide by n, the standard deviation is biased. Dividing by n-1 gives an unbiased standard deviation.

  4. Jussi Piitulainen said

    By way of conversation, here is an approach I find much fun. I lift constants to be vecs (indexed sequences) so that everything is uniform, and then I map binary or unary operations on these vecs. Like in the language of R but more rigidly and in Scheme. The goal is a special language that allows to explore descriptions like “the mean square deviation from the mean” in the code itself. Someone should write The Structure and Interpretation of Statistical, er, Something.

    Ok, I get carried away. A variation on the theme anyway. I’ve included one of Anscombe’s cases.

    (define (vec . args) (case-lambda ((k) (list-ref args k))
                                      (() (length args))))
    
    (define fun (case-lambda
                 ((f u w) (case-lambda ((k) (f (u k) (w k))) (() (u))))
                 ((f v) (case-lambda ((k) (f (v k))) (() (v))))))
    I
    (define (con v c) (case-lambda ((k) c) (() (v))))
    
    (define (sum v) (con v (do ((k 0 (+ k 1)) (s 0 (+ s (v k))))
                               ((= k (v)) s))))
    (define (mean v) (fun / (sum v) (con v (v))))
    (define (mean1 v) (fun / (sum v) (con v (- (v) 1))))
    
    (define (var1 v) (let* ((d (fun - v (mean v)))) (mean1 (fun * d d))))
    (define (stddev1 v) (fun sqrt (var1 v)))
    
    (define (std1 v) (fun / (fun - v (mean v)) (stddev1 v)))
    (define (cor1 u w) (mean1 (fun * (std1 u) (std1 w))))
    
    (define xs-1 (vec 10 8 13 9 11 14 6 4 12 7 5))
    (define ys-1 (vec 8.04 6.95 7.58 8.81 8.33 9.96 7.24 4.26 10.84 4.82 5.68))
    
    (define (test)
      `((means ,((mean xs-1) 0) ,((mean ys-1) 0))
        (devs ,((stddev1 xs-1) 0) ,((stddev1 ys-1) 0))
        (corr ,((cor1 xs-1 ys-1) 0))))
    
    ;((means 9 7.500909090909093)                                                                                                
    ;  (devs 3.3166247903554 2.031568135925815)                                                                                  
    ;  (corr 0.8164205163448399))                                                                                                
    
  5. DGel said

    @Jussi Piitulainen, Paul Hofstra:

    Yes, but that’s not how he defined standard deviation on page 1. Thus the confusion..

  6. Jussi Piitulainen said

    @DGel: Yes. It may be better to deviate from the definition on page 1, especially when even the model implementation does so.

  7. Axio said

    How ugly :)

    let sum = List.fold_left (+.) 0.;;
    
    let mean =
      function 
        | [] -> 0.
        | l -> sum l /. (float_of_int (List.length l));;
    
    let std =
      function
        | [] -> 0.
        | l ->
            let mu = mean l in
            let le = float_of_int (List.length l) in
            sqrt
            ((List.fold_left (fun r e -> r +. ((e -. mu) *. (e -. mu))) 0. l)
            /. le);;
    
    let slope xs ys =
      let le = float_of_int (List.length xs) in
      let sumxs = sum xs in
      let sumys = sum ys in
      ((le *.
        List.fold_left2 (fun r xi yi -> r +. (xi *. yi)) 0. xs ys)
        -.
       (sumxs *.  sumys))
      /.
      ((le *.
        (List.fold_left (fun r xi -> r +. (xi *. xi)) 0. xs))
        -. (sumxs *. sumxs));;
    
    let intercept xs ys =
      let le = float_of_int (List.length xs) in
      ((sum ys) /. le) -.
      (((slope xs ys) *. (sum xs)) /. le);;
    
    let lin_reg xs ys = ((slope xs ys), (intercept xs ys));;
    
    let correlation xs ys =
      let le = float_of_int (List.length xs) in
      let mx = mean xs in
      let my = mean ys in
      let sx = std xs in
      let sy = std ys in
      (List.fold_left2 (fun r xi yi -> r +. ((xi -. mx) *.(yi -. my))) 0. xs ys)
      /.
      ((le -. 1.) *. sx *. sy)
    ;;
    
    let xs = [1.47;1.50;1.52;1.55;1.57;1.60;1.63;1.65;1.68;1.70;1.73;1.75;1.78;1.80;1.83];;
    let ys = [52.21;53.12;54.48;55.84;57.20;58.57;59.93;61.29;63.11;64.47;66.28;68.10;69.92;72.19;74.46];;
    let wrap point v = let eps = 0.001 in point -. eps < v && v < point +. eps;;
    assert (wrap 1.6506666666666665 (mean xs));;
    assert (wrap 62.078 (mean ys));;
    assert (wrap 0.110 (std xs));;
    assert (wrap 6.798 (std ys));;
    assert (wrap 61.272186542107434 (slope xs ys));;
    assert (wrap (-39.06195591883866) (intercept xs ys));;
    assert (wrap 1.06562549311809596 (correlation xs ys));;
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 575 other followers

%d bloggers like this: