Benford’s Law

October 26, 2010

When Shriram first posted this exercise, my solution used uniq-c, compose and digits from the Standard Prelude, plus the non-standard but very useful printf function of Chez Scheme:

(define (benford xs)
  (for-each
    (lambda (x) (printf "~a ~f~%" (car x) (/ (cdr x) (length xs))))
    (uniq-c = (sort < (map (compose car digits) xs)))))

Shriram’s comment was “My word — that’s lovely.” Later in the thread, Joe Marshall presented a Lisp function to compute the first digit of a number, to which I say “My word — that’s lovely:”

(defun leftmost-digit (n base)
 (if (> base n)
     n
     (do* ((i base next)
           (next (* base base) (* i i)))
       ((> next n) (leftmost-digit (floor n i) base)))))

Here is my translation of that function to Scheme; making base default to 10 makes the function a bit longer than the original:

(define (first-digit n . base)
  (let* ((b (if (null? base) 10 (car base)))
         (b2 (* b b)))
    (let loop ((n n) (i b) (k b2))
      (cond ((< n b) n)
            ((< n k)
              (loop (quotient n i) b b2))
            (else (loop n k (* i i)))))))

Shriram’s second challenge was to extract the first-digit proportions from a CSV file, such as the previous page. The text-file database functions of the two previous exercises make that simple. First, we write a predicate to keep useful records and discard the various headers, “N/A” values, and other useless records:

(define (keep? xs) (string->number (list-ref xs 3)))

Then we use read-csv-record, filter-port and map-reduce-port to calculate the first-digit percentages:

> (let* ((fds (with-input-from-file "mn-lakes.csv"
                (lambda ()
                  (map-reduce-port
                    (filter-port read-csv-record keep?)
                    (lambda (x)
                      (values
                        (first-digit (floor (string->number (list-ref x 3))))
                        1))
                    (lambda (k v1 v2) (+ v1 v2))
                    <))))
         (fds-count (apply + (map cdr fds))))
    (for-each (lambda (x)
                (printf "~a ~f~%"
                  (car x)
                  (/ (cdr x) fds-count 0.01)))
              fds))
1 32.05699020480855
2 19.768477292965272
3 12.733748886910062
4 9.082813891362422
5 6.678539626001781
6 6.144256455921639
7 5.253784505788068
8 4.452359750667854
9 3.8290293855743545

During testing, I wrote the following awk program to compare my results:

awk -F, '
$4>0 { c[substr($4,1,1)]++; t++ }
END { for (i=1; i<10; i++) print i, c[i]/t*100 }
' mn-lakes.csv

Running that program produced results that differed from the Scheme program; one record for which the Scheme program calculated a first digit of 1 was calculated as a first digit of 4 by the awk program. The problem was the record

"Gull, Upper","Nisswa",422,154,54,7.5

The simplistic awk program saw “Gull, Upper” as two fields when it is, in fact, a single field, proving again that it is hard to parse CSV files.

You can run the program at http://programmingpraxis.codepad.org/7l3dRSAn.

Pages: 1 2 3

10 Responses to “Benford’s Law”

  1. [...] today’s Programming Praxis exercise, our task is to see if Benford’s law (lower digits are more [...]

  2. My Haskell solution (see http://bonsaicode.wordpress.com/2010/10/26/programming-praxis-benford%E2%80%99s-law/ for a version with comments):

    import Data.List
    import Text.Printf
    
    firstDigits :: [Float] -> [(Char, Double)]
    firstDigits xs = map (\ds -> (head ds, 100 * toEnum (length ds) /
        toEnum (length xs))) . group . sort $ map (head . show) xs
    
    shriram :: [[String]] -> [(Char, Double)]
    shriram xs = firstDigits [n | [(n,_)] <- map (reads . (!! 3)) xs]
    
  3. Graham said

    I skipped straight to reading in values from a csv file and then performing the analysis. My solution in Python:

    #!/usr/bin/env python
    
    import csv
    
    def benford(csv_file, numerical_position):
        """
        Calculates the percentages of first digit occurrence in numerical_position
        in the csv_file in question.
        """
        r = csv.reader(open(csv_file))
        d = dict(zip(range(1,10), [0] * 9))
        for row in r:
            try:
                d[int(row[numerical_position][0])] += 1
            except:
                pass
        total = float(sum(d.values()))
        for k in d:
            print "%s:\t%.4s%%" % (k, 100 * d[k] / total)
        return
    
    if __name__ == '__main__':
        benford('lakes.csv', 3)
    

    When run on the given csv file (which I named lakes.csv), this yields:

    $ ./benford.py
    1: 32.0%
    2: 19.7%
    3: 12.7%
    4: 9.08%
    5: 6.67%
    6: 6.14%
    7: 5.25%
    8: 4.45%
    9: 3.82%

  4. Graham said

    Sorry about the messed up posting of my output!

  5. Gambiteer said

    Similar to Joe Marshall’s:

    (define (first-digit n base)
    ;; returns first base base digit of n
    (let ((base^2 (* base base)))
    (cond ((< n base) n)
    ((< n base^2) (quotient n base))
    (else
    (first-digit (first-digit n base^2) base)))))

  6. slabounty said

    A ruby version also going straight to CSV calculation …

    require 'csv'
    
    def benford(csv_file, position)
        first_digits = Array.new(10, 0)
        CSV.foreach(csv_file) do |row|
            digit = row[position].to_s[0]
            first_digits[digit.to_i] += 1  if digit =~ /[0-9]/
        end
        first_digits
    end
    
    first_digits = benford("lakes.csv", 3)
    total = first_digits.inject(0) { |sum, v| sum + v }
    first_digits.each_with_index { |v, i| puts "Percentage for #{i} #{ (v.to_f / total.to_f) * 100.0} " if i != 0 }
    
  7. Axio said

    ;; natural -> digit
    (define (head-of-num n)
      (let loop ((n n)) (if (< n 10) n (loop (quotient n 10)))))

    ;; list of naturals -> digit
    (define (benford l)
      (let ((res (make-vector 10 0)) ;; store the results
            (count 0))
        (map (lambda (n) ;; increment each seen digit’s counter
               (let ((i (head-of-num n)))
                 (vector-set! res i (+ 1 (vector-ref res i))))
               (set! count (+ 1 count)))
             l)
        (map (lambda (i) ;; divide by number of elements
               (let ((val (/ (vector-ref res i) count)))
                 (vector-set! res i (list i val (exact->inexact val)))))
             (iota 0 9))
        res))

    (define (test data)
      (benford data))

  8. Axio said

    ;; I like this version more.
    (define (head-of-num n)
      (let loop ((n n)) (if (< n 10) n (loop (quotient n 10)))))

    (define (benford l)
      (let* ((count 0)
             (res (fold-left
                    (lambda (n state)
                      (let ((i (head-of-num n)))
                        (vector-set! state i (+ 1 (vector-ref state i)))
                        (set! count (+ 1 count))
                        state))
                    (make-vector 10 0)
                    l)))
        (map (lambda (x) (exact->inexact (/ x count))) (vector->list res))))

    (define (test data)
      (benford data))

  9. Khanh Nguyen said

    My F# codes

    //partition list of single digits into bins
    let split l =
        let rec split_aux ll i =
            if (i = 9) then
                [ll]
            else
                let tt = List.partition (fun x -> x = i) ll
                (fst tt) :: (split_aux (snd tt) (i+1))
        split_aux l 0
    
    let rec first_digit x =
        if x < 10 then x else first_digit (x / 10)
    
    let benford l =
        let first_digit_list = List.map first_digit l
        let digits_counts = List.map List.length (split first_digit_list)
        //compute the percentage
        List.map (fun x -> (float x) / (float) (List.sum digits_counts)) digits_counts
    
  10. KNguyen said

    My F# implementation:

    //partition list of single digits into bins
    let split l =
        let rec split_aux ll i =
            if (i = 9) then
                [ll]
            else
                let tt = List.partition (fun x -> x = i) ll
                (fst tt) :: (split_aux (snd tt) (i+1))
        split_aux l 0
    
    let rec first_digit x =
        if x < 10 then x else first_digit (x / 10)
    
    let benford l =
        let first_digit_list = List.map first_digit l
        let digits_counts = List.map List.length (split first_digit_list)
        //compute the percentage
        List.map (fun x -> (float x) / (float) (List.sum digits_counts)) digits_counts
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 136 other followers