Word Count

December 8, 2009

Here’s our version:

#! /usr/bin/scheme --script

;; Output selectors: each flag is #t while the corresponding count (lines,
;; words, characters) should be printed.  All three start enabled;
;; update-flags clears the ones that were not requested.
(define l-flag #t)
(define w-flag #t)
(define c-flag #t)

;; Clear every output flag whose option letter is missing from fs, the list
;; of characters that followed the dash.  Flags are only ever switched off
;; here, never back on.
(define (update-flags fs)
  (unless (memv #\l fs) (set! l-flag #f))
  (unless (memv #\w fs) (set! w-flag #f))
  (unless (memv #\c fs) (set! c-flag #f)))

;; Write the number n to the current output port, right-justified in a field
;; of at least width characters (left-padded with spaces).
;;
;; When n needs more than width characters it is written in full with no
;; padding; the pad length is clamped at zero because make-string rejects a
;; negative length.
(define (put-dec n width)
  (let* ((n-str (number->string n))
         (pad (max 0 (- width (string-length n-str)))))
    (display (make-string pad #\space))
    (display n-str)))

;; Read the current input port to end-of-file and return three values:
;; the number of newline characters, the number of words, and the number
;; of characters.  A word is a maximal run of characters other than space,
;; tab and newline; in-word? remembers whether the previous character
;; belonged to such a run.
(define (wc)
  (let scan ((ch (read-char)) (in-word? #f) (lines 0) (words 0) (chars 0))
    (if (eof-object? ch)
        (values lines words chars)
        (let ((chars (add1 chars)))
          (case ch
            ((#\newline)
             (scan (read-char) #f (add1 lines) words chars))
            ((#\space #\tab)
             (scan (read-char) #f lines words chars))
            (else
             ;; Only the first character of a run starts a new word.
             (scan (read-char) #t lines
                   (if in-word? words (add1 words))
                   chars)))))))

;; Program driver.  args is the list of command-line strings after the
;; script name.
;;
;; If the first argument starts with a dash, its remaining characters pick
;; the counts to print (l, w, c) and the argument is consumed.  With no file
;; names left, the current input port is counted and the selected counts are
;; printed separated by spaces.  Otherwise each named file is counted and
;; printed in 12-character columns followed by its name, and a final
;; " total" row sums the columns.
(define (main args)
  ;; The length test keeps an empty-string argument from reaching string-ref.
  (when (and (pair? args)
             (positive? (string-length (car args)))
             (char=? (string-ref (car args) 0) #\-))
    (update-flags (cdr (string->list (car args))))
    (set! args (cdr args)))
  (if (null? args)
      (let-values (((ls ws cs) (wc)))
        (when l-flag (display ls) (display " "))
        (when w-flag (display ws) (display " "))
        (when c-flag (display cs) (display " "))
        (newline))
      (let loop ((args args) (l-tot 0) (w-tot 0) (c-tot 0))
        (if (null? args)
            (begin (when l-flag (put-dec l-tot 12))
                   (when w-flag (put-dec w-tot 12))
                   (when c-flag (put-dec c-tot 12))
                   ;; Label and terminate the totals row like the file rows.
                   (display " total") (newline))
            ;; Count inside with-input-from-file but recurse outside it, so
            ;; each file is finished with before the next one is opened.
            (let-values (((ls ws cs) (with-input-from-file (car args) wc)))
              (when l-flag (put-dec ls 12))
              (when w-flag (put-dec ws 12))
              (when c-flag (put-dec cs 12))
              (display " ") (display (car args)) (newline)
              (loop (cdr args) (+ l-tot ls) (+ w-tot ws) (+ c-tot cs)))))))

;; Script entry point: hand main every command-line element after the first.
(main (cdr (command-line)))

The code for handling the optional flags and filenames is tedious but straightforward; the shebang line is specific to Chez Scheme, as is the command-line procedure, but most Scheme systems have something similar. The wc function performs the actual counting: each input character adds 1 to cs, each newline character adds 1 to ls, and ws is incremented each time the inword variable, which is #t if and only if the current character is not a blank, tab or newline, goes from #f to #t.

We use add1 from the Standard Prelude. You can see the collected code at http://programmingpraxis.codepad.org/ZxPjiEvw.

Pages: 1 2

11 Responses to “Word Count”

  1. […] Praxis – Word Count By Remco Niemeijer In today’s Programming Praxis exercise, we have to implement the Unix wc command line utility. Let’s get […]

  2. Remco Niemeijer said

    My Haskell solution (see http://bonsaicode.wordpress.com/2009/12/08/programming-praxis-word-count/ for a version with comments):

    import System.Environment
    import Text.Printf
    
    -- | Split the argument list into the three output switches (lines, words,
    -- characters, in that order) and the remaining arguments.  A first
    -- argument beginning with '-' enables exactly the letters it contains;
    -- otherwise all three switches are on and no argument is consumed.
    parseOpts :: [String] -> ([Bool], [String])
    parseOpts argv = case argv of
        (('-' : letters) : rest) -> ([c `elem` letters | c <- "lwc"], rest)
        _                        -> ([True, True, True], argv)
    
    -- | Build one report line per (name, contents) pair.  Each enabled count
    -- (lines, words, characters) is right-aligned in an 8-character column;
    -- a disabled count is shown as "-".  The name follows after one space.
    count :: [Bool] -> [(String, String)] -> [String]
    count opts = map report
      where
        measures :: [String -> Int]
        measures = [length . lines, length . words, length]

        cell :: Bool -> Int -> String
        cell wanted n = printf "%8s" (if wanted then show n else "-")

        report (name, text) =
            concat (zipWith cell opts (map ($ text) measures)) ++ " " ++ name
    
    -- | Parse the arguments, gather (name, contents) pairs from the named
    -- files (or from standard input under an empty name when no file is
    -- given), and print one report line per input.
    main :: IO ()
    main = do
        (opts, files) <- fmap parseOpts getArgs
        inputs <-
            if null files
                then do
                    text <- getContents
                    return [("", text)]
                else do
                    texts <- mapM readFile files
                    return (zip files texts)
        mapM_ putStrLn (count opts inputs)
    
  3. #
    # returns, for the given file, an array with the number of lines,
    # words and characters, in that order
    #
    # file - path of the file to read
    #
    # File.open is used rather than Kernel#open so that a name starting
    # with "|" is treated as a path and never run as a command.
    def word_count file

      lines = 0
      chars = 0
      words = 0

      File.open(file) do |f|

        f.each do |line|
          lines += 1
          chars += line.length
          # split with no argument breaks on runs of whitespace
          words += line.split.length
        end

      end

      [lines, words, chars]

    end
    
    # retrieve command line options
    options = ['w', 'l', 'c']

    # The pattern is anchored at both ends so that only an argument made up
    # entirely of a dash plus l/w/c letters counts as options; a file name
    # that merely contains "-l" (e.g. "my-list.txt") is left in ARGV.
    if ARGV[0] =~ /\A-([lwc]+)\z/
      options = $1.chars
      ARGV.shift
    end

    unless ARGV[0]
      abort("Usage: #{$0} file1 file2 ...")
    end

    # file names and a total row are only printed for more than one file
    cumulate = ARGV.length > 1

    # process each file and output the count
    total_lines = 0
    total_chars = 0
    total_words = 0

    ARGV.each do |file|

      unless File.exist?(file)
        $stderr.puts "File not found: #{file}"
        next
      end

      lines, words, chars = word_count(file)

      print "#{'%7d' % lines}\t" if options.include?('l')
      print "#{'%7d' % words}\t" if options.include?('w')
      print "#{'%7d' % chars}\t" if options.include?('c')
      print "\t#{file}" if cumulate
      puts

      total_lines += lines
      total_chars += chars
      total_words += words

    end

    if cumulate
      print "#{'%7d' % total_lines}\t" if options.include?('l')
      print "#{'%7d' % total_words}\t" if options.include?('w')
      print "#{'%7d' % total_chars}\t" if options.include?('c')
      puts "total"
    end
    
  4. John Cowan said

    GNU wc has three major improvements on the naive algorithm:

    1) It does its own block buffering rather than using stdio buffering, which means that characters are counted a block at a time.

    2) It has multiple inner loops, deciding on the basis of the options which one to run (thus the inword logic is not executed at all if -w is not specified).

    3) If -c is the only option, then it attempts to fstat() files rather than reading them, being careful to make sure the file is a regular file (not a device file) and taking into account the possibility that stdin may be a file that isn’t positioned at its beginning. This allows O(1) behavior in favorable circumstances.

    It also extends classic wc semantics by being able to count bytes with -b and possibly-multibyte characters with -c, though if characters are known to be single-byte in the current encoding it will treat -c and -b the same (optimization #3 above really applies to -b). It also provides -L which returns the length of the longest line and -W to return the count of words, which are obvious and useful extensions.

  5. Miguel Valadas said

    # Init Some Variables
    count_lines = true
    count_words = true
    count_chars = true

    line_count = 0
    word_count = 0
    char_count = 0

    filename = ARGV[0]

    # Check Arguments: a first argument starting with "-" selects which
    # counts to print, and the file name is then the second argument.
    if filename =~ /^-.+$/
      count_lines = ARGV[0].include?('l')
      count_words = ARGV[0].include?('w')
      count_chars = ARGV[0].include?('c')
      filename = ARGV[1]
    end

    abort("Usage: #{$0} [-lwc] file") unless filename

    # Count with Regexp
    # Block-form, read-only open: works on files without write permission
    # and closes the file when the block ends.
    File.open(filename, 'r') do |file|
      file.each_line do |line|
        line_count += 1
        word_count += line.scan(/[^ \n\t]+/).size
        char_count += line.size
      end
    end

    puts "Line Count = #{line_count}" if count_lines
    puts "Word Count = #{word_count}" if count_words
    puts "Character Count = #{char_count}" if count_chars

  6. Miguel Valadas said

    Sorry about the previous post:

    # Init Some Variables
    count_lines = true
    count_words = true
    count_chars = true

    line_count = 0
    word_count = 0
    char_count = 0

    filename = ARGV[0]

    # Check Arguments: a first argument starting with "-" selects which
    # counts to print, and the file name is then the second argument.
    if filename =~ /^-.+$/
      count_lines = ARGV[0].include?('l')
      count_words = ARGV[0].include?('w')
      count_chars = ARGV[0].include?('c')
      filename = ARGV[1]
    end

    abort("Usage: #{$0} [-lwc] file") unless filename

    # Count with Regexp
    # Block-form, read-only open: works on files without write permission
    # and closes the file when the block ends.
    File.open(filename, 'r') do |file|
      file.each_line do |line|
        line_count += 1
        word_count += line.scan(/[^ \n\t]+/).size
        char_count += line.size
      end
    end

    puts "Line Count = #{line_count}" if count_lines
    puts "Word Count = #{word_count}" if count_words
    puts "Character Count = #{char_count}" if count_chars
    
  7. Frank Gleason said

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/fcntl.h>
    #include <unistd.h>

    /* Which counts to print; all three are enabled when none is requested. */
    int lflag = 0;
    int wflag = 0;
    int cflag = 0;
    /* Name printed after the counts; NULL while counting standard input. */
    char *fname = NULL;
    /* Read buffer filled by wc(). */
    char buf[BUFSIZ];

    int wc(int fd);

    /*
     * wc [-lwc] [name...]
     * Counts each named file, or standard input when no name is given.
     */
    int main(int argc, char *argv[])
    {
        const char *prog = argv[0];   /* saved before argv is shifted */
        const char *opt;
        int i, fd;

        while (argc > 1 && argv[1][0] == '-') {
            opt = argv[1] + 1;
            /* A bare "-" names no option; report it like an unknown one. */
            if (*opt == '\0') {
                fprintf(stderr, "usage: wc [-lwc] [name...]\n");
                exit(1);
            }
            /* Accept clustered letters such as -lw. */
            for (; *opt != '\0'; opt++) {
                switch (*opt) {
                case 'l': lflag = 1; break;
                case 'w': wflag = 1; break;
                case 'c': cflag = 1; break;
                default:
                    fprintf(stderr, "usage: wc [-lwc] [name...]\n");
                    exit(1);
                }
            }
            argc--;
            argv++;
        }
        if (lflag == 0 && wflag == 0 && cflag == 0)
            lflag = wflag = cflag = 1;
        if (argc == 1)
            wc(STDIN_FILENO);
        else
            for (i = 1; i < argc; i++) {
                if ((fd = open(argv[i], O_RDONLY)) == -1) {
                    fprintf(stderr, "%s: can not open %s, errno=%d\n",
                            prog, argv[i], errno);
                    exit(1);
                }
                fname = argv[i];
                wc(fd);
                close(fd);
            }
        exit(0);
    }

    /*
     * Count newlines, words and bytes read from fd and print the enabled
     * counts followed by fname (when set).  Returns 0, or -1 if read() fails.
     */
    int wc(int fd)
    {
        ssize_t n;
        long l = 0, w = 0, t = 0;
        int ws = 1;                   /* 1 while the previous byte was white space */
        char *cp, c;

        /* "> 0" stops on end of file (0) and on error (-1) alike. */
        while ((n = read(fd, buf, BUFSIZ)) > 0) {
            t += n;
            for (cp = buf; cp != (buf + n); cp++) {
                c = *cp;
                if (c == '\n') {
                    l++;
                    ws = 1;
                } else if (c != ' ' && c != '\t') {
                    if (ws) {
                        ws = 0;
                        w++;
                    }
                } else
                    ws = 1;
            }
        }
        if (n < 0) {
            perror("read");
            return -1;
        }
        if (lflag)
            printf("%ld ", l);
        if (wflag)
            printf("%ld ", w);
        if (cflag)
            printf("%ld ", t);
        if (fname)
            printf("%s", fname);
        printf("\n");
        return 0;
    }

  8. Frank Gleason said

    I too posted before reading the instructions. Sorry about that.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/fcntl.h>
    #include <unistd.h>
    
    /* Which counts to print; main enables all three when none is requested. */
    int lflag = 0;
    int wflag = 0;
    int cflag = 0;
    /* Name printed after the counts; NULL while counting standard input. */
    char *fname = NULL;
    /* Read buffer filled by wc(). */
    char buf[BUFSIZ];
    /* errno comes from <errno.h>; it must not be declared by hand. */
    
    /* Defined after main; declared here so the calls below are type-checked. */
    int wc(int fd);

    /*
     * wc [-lwc] [name...]
     *
     * Leading arguments that start with '-' select the counts to print; every
     * letter in such an argument is examined, so -lw works as well as -l -w.
     * With no selection all three counts are printed.  Each remaining
     * argument is opened and counted; with none, standard input is counted.
     * Exits 1 on a bad option or a file that cannot be opened, 0 otherwise.
     */
    int main(int argc, char *argv[])
    {
        const char *prog = argv[0];   /* saved before argv is shifted below */
        const char *opt;
        int i, fd;

        while (argc > 1 && argv[1][0] == '-') {
            opt = argv[1] + 1;
            /* A bare "-" names no option; report it like an unknown one. */
            if (*opt == '\0') {
                fprintf(stderr, "usage: wc [-lwc] [name...]\n");
                exit(1);
            }
            for (; *opt != '\0'; opt++) {
                switch (*opt) {
                case 'l': lflag = 1; break;
                case 'w': wflag = 1; break;
                case 'c': cflag = 1; break;
                default:
                    fprintf(stderr, "usage: wc [-lwc] [name...]\n");
                    exit(1);
                }
            }
            argc--;
            argv++;
        }
        if (lflag == 0 && wflag == 0 && cflag == 0)
            lflag = wflag = cflag = 1;
        if (argc == 1)
            wc(STDIN_FILENO);
        else
            for (i = 1; i < argc; i++) {
                if ((fd = open(argv[i], O_RDONLY)) == -1) {
                    fprintf(stderr, "%s: can not open %s, errno=%d\n",
                            prog, argv[i], errno);
                    exit(1);
                }
                fname = argv[i];
                wc(fd);
                close(fd);
            }
        exit(0);
    }
    
    /*
     * Count newlines, words and bytes read from the descriptor fd, then print
     * the counts enabled by lflag/wflag/cflag followed by fname (when it is
     * set) and a newline.  A word is a maximal run of bytes other than space,
     * tab and newline.
     *
     * Returns 0 on success, or -1 after reporting a read() failure; nothing
     * is printed to stdout in that case.
     */
    int wc(int fd)
    {
        ssize_t n;
        long l = 0, w = 0, t = 0;     /* long: int overflows on large inputs */
        int ws = 1;                   /* 1 while the previous byte was white space */
        char *cp, c;

        /* "> 0" stops on end of file (0) and on error (-1) alike; treating
           -1 as a byte count would corrupt t and walk outside buf. */
        while ((n = read(fd, buf, BUFSIZ)) > 0) {
            t += n;
            for (cp = buf; cp != (buf + n); cp++) {
                c = *cp;
                if (c == '\n') {
                    l++;
                    ws = 1;
                } else if (c != ' ' && c != '\t') {
                    if (ws) {
                        ws = 0;
                        w++;
                    }
                } else
                    ws = 1;
            }
        }
        if (n < 0) {
            perror("read");
            return -1;
        }
        if (lflag)
            printf("%ld ", l);
        if (wflag)
            printf("%ld ", w);
        if (cflag)
            printf("%ld ", t);
        if (fname)
            printf("%s", fname);
        printf("\n");
        return 0;
    }
    
  9. […] to the RSS feed or email list for updates on this topic.The goal for this Programming Praxis was to implement the Unix wc function. This one took me a couple days (I haven’t had a lot of time recently) to complete, but I […]

  10. slabounty said

    A Ruby version that (a) doesn’t look all that Rubyish and (b) doesn’t format the output nicely, but does work more or less like the Unix version:

    require 'getoptlong'
    
    # Command-line switches: -w/--words, -c/--chars and -l/--lines select the
    # counts to print; -v/--verbose only sets $verbose.
    opts = GetoptLong.new(
        ["--words", "-w", GetoptLong::NO_ARGUMENT],
        ["--chars", "-c", GetoptLong::NO_ARGUMENT],
        ["--lines", "-l", GetoptLong::NO_ARGUMENT],
        ["--verbose", "-v", GetoptLong::NO_ARGUMENT]
        )

    words = false
    lines = false
    chars = false
    $verbose = false

    begin
        opts.each do | opt, arg|
            case opt
            when "--words"
                words = true
            when "--chars"
                chars = true
            when "--lines"
                lines = true
            when "--verbose"
                $verbose = true
            end
        end
    rescue GetoptLong::Error
        puts "Illegal command line option."
        # a bad option is a failure, so do not exit with status 0
        exit 1
    end

    # With no count selected, print all three (as wc does) instead of
    # printing nothing but the file name.
    words = lines = chars = true unless words || lines || chars

    # Totals are only kept and printed when more than one file is given.
    accumulate = false
    if ARGV.length > 1
        accumulate = true
        wcl_totals = Hash.new(0)
    end

    ARGV.each do |file_name|
        File.open(file_name) do | file |

            wcl = Hash.new(0)

            while line = file.gets
                wcl[:words] += line.split.length
                wcl[:chars] += line.length
                wcl[:lines] += 1
            end

            if accumulate
                wcl_totals[:words] += wcl[:words]
                wcl_totals[:chars] += wcl[:chars]
                wcl_totals[:lines] += wcl[:lines]
            end

            puts "#{wcl[:words] if words} #{wcl[:lines] if lines} #{wcl[:chars] if chars} #{file_name}"
        end
    end

    puts "#{wcl_totals[:words] if words} #{wcl_totals[:lines] if lines} #{wcl_totals[:chars] if chars} Total" if accumulate
    
  11. swaraj said

    A Ruby solution that only takes a filename (http://codepad.org/CC98XPaa); it returns the line, character and word counts, in that order.

Leave a comment