Word Count

December 8, 2009

Here’s our version:

#! /usr/bin/scheme --script

;; Output-selection flags: a count is printed only while its flag is #t.
;; All three start as #t; update-flags sets to #f the ones whose letter
;; is missing from the option characters it is given.
(define l-flag #t) (define w-flag #t) (define c-flag #t)

;; FS is a list of option characters.  Each of the three output flags is
;; switched off when its letter (l, w or c) does not occur in FS; flags
;; whose letter is present are left untouched.
(define (update-flags fs)
  (unless (memv #\l fs) (set! l-flag #f))
  (unless (memv #\w fs) (set! w-flag #f))
  (unless (memv #\c fs) (set! c-flag #f)))

;; Display the number N right-justified in a field of WIDTH characters.
;; When the printed form of N is longer than WIDTH it is shown in full
;; with no padding; the pad length is clamped at zero because make-string
;; rejects a negative length.
(define (put-dec n width)
  (let* ((n-str (number->string n))
         (pad (max 0 (- width (string-length n-str)))))
    (display (make-string pad #\space))
    (display n-str)))

;; Read characters from the current input port until end of file and
;; return three values: the number of newline characters, the number of
;; words, and the number of characters read.  A word starts at every
;; character other than space, tab or newline that does not directly
;; follow another such character.
(define (wc)
  (let scan ((ch (read-char)) (in-word? #f) (lines 0) (words 0) (chars 0))
    (if (eof-object? ch)
        (values lines words chars)
        (let ((blank? (memv ch '(#\space #\newline #\tab))))
          (scan (read-char)
                (not blank?)
                (if (char=? ch #\newline) (add1 lines) lines)
                ;; only the first character of a word bumps the count
                (if (or blank? in-word?) words (add1 words))
                (add1 chars))))))

;; Entry point.  ARGS is the list of command-line arguments.
;;
;; If the first argument starts with a dash, the characters after the
;; dash are passed to update-flags and the argument is dropped.  With no
;; remaining arguments the current input port is counted and the selected
;; counts are printed separated by single spaces.  Otherwise each
;; remaining argument is treated as a file name: one row of 12-wide
;; columns is printed per file, followed by a row of totals.
;;
;; Changes from the earlier version:
;;  - each file is closed before the next one is opened (the recursion
;;    used to happen inside with-input-from-file, so every file stayed
;;    open until the last one had been processed);
;;  - the totals row is labelled " total" and ends with a newline;
;;  - an empty-string first argument no longer makes string-ref fail.
(define (main args)
  (when (and (pair? args)
             (positive? (string-length (car args)))
             (char=? (string-ref (car args) 0) #\-))
    (update-flags (cdr (string->list (car args))))
    (set! args (cdr args)))
  (if (null? args)
      ;; No file names: count the current input port.
      (let-values (((ls ws cs) (wc)))
        (when l-flag (display ls) (display " "))
        (when w-flag (display ws) (display " "))
        (when c-flag (display cs) (display " "))
        (newline))
      (let loop ((args args) (l-tot 0) (w-tot 0) (c-tot 0))
        (if (null? args)
            (begin
              (when l-flag (put-dec l-tot 12))
              (when w-flag (put-dec w-tot 12))
              (when c-flag (put-dec c-tot 12))
              (display " total")
              (newline))
            ;; Only the counting runs with the file as current input;
            ;; printing and the recursive step happen after it returns.
            (let-values (((ls ws cs) (with-input-from-file (car args) wc)))
              (when l-flag (put-dec ls 12))
              (when w-flag (put-dec ws 12))
              (when c-flag (put-dec cs 12))
              (display " ")
              (display (car args))
              (newline)
              (loop (cdr args) (+ l-tot ls) (+ w-tot ws) (+ c-tot cs)))))))

;; Start the program: pass main everything in (command-line) except its
;; first element.
(main (cdr (command-line)))

The code for handling the optional flags and filenames is tedious but straightforward; the shebang line is specific to Chez Scheme, as is the command-line procedure, but most Scheme systems have something similar. The wc function performs the actual counting: each input character adds 1 to cs, each newline character adds 1 to ls, and ws is incremented each time the inword variable, which is #t if and only if the current character is not a blank, tab or newline, goes from #f to #t.

We use add1 from the Standard Prelude. You can see the collected code at http://programmingpraxis.codepad.org/ZxPjiEvw.

Posted by programmingpraxis

Filed in Exercises

11 Comments »

11 Responses to “Word Count”

Programming Praxis – Word Count « Bonsai Code said
December 8, 2009 at 1:39 PM
[…] Praxis – Word Count By Remco Niemeijer In today’s Programming Praxis exercise, we have to implement the Unix wc command line utility. Let’s get […]

Remco Niemeijer said

December 8, 2009 at 1:39 PM

My Haskell solution (see http://bonsaicode.wordpress.com/2009/12/08/programming-praxis-word-count/ for a version with comments):

import System.Environment
import Text.Printf

-- | Split the command line into three switches and the remaining arguments.
-- When the first argument begins with a dash, the switches say whether
-- 'l', 'w' and 'c' (in that order) occur after the dash, and that argument
-- is removed.  Otherwise all three switches are on and the arguments are
-- returned unchanged.
parseOpts :: [String] -> ([Bool], [String])
parseOpts argv = case argv of
    (('-' : letters) : rest) -> ([c `elem` letters | c <- "lwc"], rest)
    _                        -> ([True, True, True], argv)

-- | Produce one output row per (name, text) pair.  A row has three
-- right-justified columns of width 8 (line, word and character counts),
-- then a space and the name.  A column whose switch in @opts@ is off
-- shows "-" instead of a number.
count :: [Bool] -> [(String, String)] -> [String]
count opts = map row
  where
    -- one counting function per column, paired positionally with opts
    counters :: [String -> [String]]
    counters = [lines, words, map (: [])]

    row :: (String, String) -> String
    row (name, text) = concatMap column (zip counters opts) ++ " " ++ name
      where
        column :: (String -> [String], Bool) -> String
        column (f, enabled)
          | enabled   = printf "%8s" (show (length (f text)))
          | otherwise = printf "%8s" "-"

-- | Parse the command line, gather the inputs (standard input under an
-- empty name when no files are named, otherwise each named file) and
-- print one row per input.
main :: IO ()
main = do
    (opts, files) <- fmap parseOpts getArgs
    inputs <- if null files
        then do
            text <- getContents
            return [("", text)]
        else do
            texts <- mapM readFile files
            return (zip files texts)
    mapM_ putStrLn (count opts inputs)

Jean Lazarou said

December 8, 2009 at 7:23 PM

#
# Returns, for the given file, an array with the number of lines,
# words and characters, in that order: [lines, words, chars].
#
# The file is opened with File.open rather than Kernel#open: Kernel#open
# treats a name beginning with "|" as a command to run, which is unsafe
# for names taken straight from the command line.
#
def word_count file

  lines = 0
  chars = 0
  words = 0

  File.open(file) do |f|

    f.each do |line|
      lines += 1
      chars += line.length
      # split with no argument separates on runs of whitespace
      words += line.split.length
    end

  end

  [lines, words, chars]

end

# retrieve command line options
options = ['w', 'l', 'c']

# The pattern is anchored at both ends so that only an argument made up of
# a dash followed by one to three of the letters l, w, c is taken as the
# option group.  Unanchored, a file name such as "my-list.txt" matched
# ("-l") and was silently dropped from the list of files.
if ARGV[0] =~ /\A-([lwc])([lwc])?([lwc])?\z/
  options = [$1, $2, $3]
  ARGV.shift
end

unless ARGV[0]
  abort("Usage: #{$0} file1 file2 ...")
end

# a total row (and the file name column) is printed only for several files
cumulate = ARGV.length > 1

# process each file and output the count
total_lines = 0
total_chars = 0
total_words = 0

ARGV.each do |file|

  # missing files are reported on stderr and skipped
  unless File.exist?(file)
    $stderr.puts "File not found: #{file}"
    next
  end

  lines, words, chars = word_count(file)

  print "#{'%7d' % lines}\t" if options.include?('l')
  print "#{'%7d' % words}\t" if options.include?('w')
  print "#{'%7d' % chars}\t" if options.include?('c')
  print "\t#{file}" if cumulate
  puts

  total_lines += lines
  total_chars += chars
  total_words += words

end

if cumulate
  print "#{'%7d' % total_lines}\t" if options.include?('l')
  print "#{'%7d' % total_words}\t" if options.include?('w')
  print "#{'%7d' % total_chars}\t" if options.include?('c')
  puts "total"
end

John Cowan said
December 9, 2009 at 4:05 PM
GNU wc has three major improvements on the naive algorithm:

1) It does its own block buffering rather than using stdio buffering, which means that characters are counted a block at a time.

2) It has multiple inner loops, deciding on the basis of the options which one to run (thus the inword logic is not executed at all if -w is not specified).

3) If -c is the only option, then it attempts to fstat() files rather than reading them, being careful to make sure the file is a regular file (not a device file) and taking into account the possibility that stdin may be a file that isn’t positioned at its beginning. This allows O(1) behavior in favorable circumstances.

It also extends classic wc semantics by being able to count bytes with -b and possibly-multibyte characters with -c, though if characters are known to be single-byte in the current encoding it will treat -c and -b the same (optimization #3 above really applies to -b). It also provides -L which returns the length of the longest line and -W to return the count of words, which are obvious and useful extensions.
Miguel Valadas said
December 9, 2009 at 4:29 PM
# Init Some Variables
# (Typographic quotes from the original paste are replaced with ASCII
# quotes so the script parses.)
count_lines = true
count_words = true
count_chars = true

line_count = 0
word_count = 0
char_count = 0

filename = ARGV[0]

# Check Arguments: a first argument starting with "-" selects the counts
# to print, and the file name is then the second argument.
if ARGV[0] =~ /^-.+$/
  count_lines = ARGV[0].include?('l')
  count_words = ARGV[0].include?('w')
  count_chars = ARGV[0].include?('c')
  filename = ARGV[1]
end

# Without a file name File.open would raise a TypeError.
abort("Usage: #{$0} [-lwc] file") unless filename

# Count with Regexp
# The file is opened read-only ('r+' also demands write permission) and in
# block form so the handle is closed when counting is done.
File.open(filename, 'r') do |file|
  while (line = file.gets)
    line_count += 1
    word_count += line.scan(/[^ \n\t]+/).size
    char_count += line.size
  end
end

puts "Line Count = #{line_count}" if count_lines
puts "Word Count = #{word_count}" if count_words
puts "Character Count = #{char_count}" if count_chars

Miguel Valadas said

December 9, 2009 at 4:35 PM

Sorry about the previous post:

# Init Some Variables
count_lines = true
count_words = true
count_chars = true

line_count = 0
word_count = 0
char_count = 0

filename = ARGV[0]


# Check Arguments: a first argument starting with "-" selects the counts
# to print, and the file name is then the second argument.
if ARGV[0] =~ /^-.+$/
  count_lines = ARGV[0].include?('l')
  count_words = ARGV[0].include?('w')
  count_chars = ARGV[0].include?('c')
  filename = ARGV[1]
end

# Without a file name File.open would raise a TypeError.
abort("Usage: #{$0} [-lwc] file") unless filename

# Count with Regexp
# The file is opened read-only ('r+' also demands write permission) and in
# block form so the handle is closed when counting is done.
File.open(filename, 'r') do |file|
  while (line = file.gets)
    line_count += 1
    word_count += line.scan(/[^ \n\t]+/).size
    char_count += line.size
  end
end

puts "Line Count = #{line_count}" if count_lines
puts "Word Count = #{word_count}" if count_words
puts "Character Count = #{char_count}" if count_chars

Frank Gleason said
December 10, 2009 at 7:11 PM
#include <stdio.h>
#include <sys/fcntl.h>
#include <stdlib.h>
#include <unistd.h>

/* Output-selection flags: non-zero means the count is printed. */
int lflag = 0;
int wflag = 0;
int cflag = 0;
/* Name printed after the counts; null until main assigns a file name.
 * (The initializer was garbled in the paste; a null pointer is used.) */
char *fname = 0;
/* Shared read buffer filled by wc(). */
char buf[BUFSIZ];
/* NOTE(review): including <errno.h> is the portable way to get errno. */
extern int errno;

int wc(int fd); /* defined later in this file */

/*
 * Parse leading option arguments, then count either standard input (no
 * file names) or each named file in turn.
 *
 * Every letter after the dash is examined, so "-lw" and "-l -w" are
 * equivalent.  With no option at all, all three counts are enabled.
 * An unknown letter, or a bare "-", prints the usage line and exits 1.
 * A file that cannot be opened is reported and the program exits 1.
 *
 * The typographic quotes and dash from the original paste are replaced
 * with ASCII so the code compiles.
 */
int main(int argc, char *argv[])
{
    /* saved first: argv is advanced while options are consumed */
    const char *progname = argv[0];
    const char *opt;
    int i, fd;

    while (argc > 1 && argv[1][0] == '-') {
        opt = argv[1] + 1;
        if (*opt == '\0') {
            printf("usage: wc [-lwc] [name...]\n");
            exit(1);
        }
        for (; *opt != '\0'; opt++) {
            switch (*opt) {
            case 'l':
                lflag = 1;
                break;
            case 'w':
                wflag = 1;
                break;
            case 'c':
                cflag = 1;
                break;
            default:
                printf("usage: wc [-lwc] [name...]\n");
                exit(1);
            }
        }
        argc--;
        argv++;
    }
    if (lflag == 0 && wflag == 0 && cflag == 0)
        lflag = wflag = cflag = 1;
    if (argc == 1) {
        wc(STDIN_FILENO);
    } else {
        for (i = 1; i < argc; i++) {
            if ((fd = open(argv[i], O_RDONLY)) == -1) {
                printf("%s: can not open %s, errno=%d\n", progname, argv[i], errno);
                exit(1);
            }
            fname = argv[i];
            wc(fd);
            close(fd);
        }
    }
    exit(0);
}

/*
 * Count newlines, words and bytes readable from descriptor fd, then print
 * the counts selected by lflag/wflag/cflag, followed by fname when it is
 * set, and a newline.  A word begins at any byte other than space, tab or
 * newline that follows one of those three (or starts the input).
 *
 * Returns 0 on success and -1 when read() fails; in the failure case a
 * message is written to stderr and no counts are printed.
 */
int wc(int fd)
{
    long lines = 0, words = 0, total = 0;
    int in_space = 1; /* 1 while the previous byte was a separator */
    ssize_t n;
    char *cp;

    /* "> 0": a result of -1 signals an error and must not be used as a length */
    while ((n = read(fd, buf, BUFSIZ)) > 0) {
        total += n;
        for (cp = buf; cp != buf + n; cp++) {
            if (*cp == '\n') {
                lines++;
                in_space = 1;
            } else if (*cp != ' ' && *cp != '\t') {
                if (in_space) {
                    in_space = 0;
                    words++;
                }
            } else {
                in_space = 1;
            }
        }
    }
    if (n < 0) {
        fprintf(stderr, "wc: read error\n");
        return -1;
    }
    if (lflag)
        printf("%ld ", lines);
    if (wflag)
        printf("%ld ", words);
    if (cflag)
        printf("%ld ", total);
    if (fname)
        printf("%s", fname);
    printf("\n");
    return 0;
}

Frank Gleason said

December 10, 2009 at 11:59 PM

I too posted before reading the instructions. Sorry about that.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/fcntl.h>
#include <unistd.h>

/* Output-selection flags: non-zero means the count is printed. */
int lflag = 0;
int wflag = 0;
int cflag = 0;
/* Name printed after the counts; NULL until main assigns a file name.
 * (Previously initialised with the character constant '\0'.) */
char *fname = NULL;
/* Shared read buffer filled by wc(). */
char buf[BUFSIZ];
/* errno comes from <errno.h>; a hand-written "extern int errno" is not
 * portable because errno may be a macro. */

int wc(int fd); /* defined later in this file */

/*
 * Parse leading option arguments, then count either standard input (no
 * file names) or each named file in turn.
 *
 * Every letter after the dash is examined, so "-lw" and "-l -w" are
 * equivalent (previously only the first letter was looked at).  With no
 * option at all, all three counts are enabled.  An unknown letter, or a
 * bare "-", prints the usage line and exits 1.  A file that cannot be
 * opened is reported and the program exits 1.
 */
int main(int argc, char *argv[])
{
    /* saved first: argv is advanced while options are consumed, so
     * argv[0] would otherwise name the last option in error messages */
    const char *progname = argv[0];
    const char *opt;
    int i, fd;

    while (argc > 1 && argv[1][0] == '-') {
        opt = argv[1] + 1;
        if (*opt == '\0') {
            printf("usage: wc [-lwc] [name...]\n");
            exit(1);
        }
        for (; *opt != '\0'; opt++) {
            switch (*opt) {
            case 'l':
                lflag = 1;
                break;
            case 'w':
                wflag = 1;
                break;
            case 'c':
                cflag = 1;
                break;
            default:
                printf("usage: wc [-lwc] [name...]\n");
                exit(1);
            }
        }
        argc--;
        argv++;
    }
    if (lflag == 0 && wflag == 0 && cflag == 0)
        lflag = wflag = cflag = 1;
    if (argc == 1) {
        wc(STDIN_FILENO);
    } else {
        for (i = 1; i < argc; i++) {
            if ((fd = open(argv[i], O_RDONLY)) == -1) {
                printf("%s: can not open %s, errno=%d\n", progname, argv[i], errno);
                exit(1);
            }
            fname = argv[i];
            wc(fd);
            close(fd);
        }
    }
    exit(0);
}

/*
 * Count newlines, words and bytes readable from descriptor fd, then print
 * the counts selected by lflag/wflag/cflag, followed by fname when it is
 * set, and a newline.  A word begins at any byte other than space, tab or
 * newline that follows one of those three (or starts the input).
 *
 * Returns 0 on success and -1 when read() fails; in the failure case a
 * message is written to stderr and no counts are printed.
 */
int wc(int fd)
{
    long lines = 0, words = 0, total = 0;
    int in_space = 1; /* 1 while the previous byte was a separator */
    ssize_t n;
    char *cp;

    /* "> 0": a result of -1 signals an error and must not be used as a length */
    while ((n = read(fd, buf, BUFSIZ)) > 0) {
        total += n;
        for (cp = buf; cp != buf + n; cp++) {
            if (*cp == '\n') {
                lines++;
                in_space = 1;
            } else if (*cp != ' ' && *cp != '\t') {
                if (in_space) {
                    in_space = 0;
                    words++;
                }
            } else {
                in_space = 1;
            }
        }
    }
    if (n < 0) {
        fprintf(stderr, "wc: read error\n");
        return -1;
    }
    if (lflag)
        printf("%ld ", lines);
    if (wflag)
        printf("%ld ", words);
    if (cflag)
        printf("%ld ", total);
    if (fname)
        printf("%s", fname);
    printf("\n");
    return 0;
}

wc: Word Count | Andrew Ferguson said
December 13, 2009 at 12:41 AM
[…] to the RSS feed or email list for updates on this topic.The goal for this Programming Praxis was to implement the Unix wc function. This one took me a couple days (I haven’t had a lot of time recently) to complete, but I […]

slabounty said

November 15, 2010 at 12:05 AM

A ruby version that a) doesn’t look all that rubyish and b) doesn’t format the output nicely but does work more or less like the unix version

require 'getoptlong'

# Command-line options: -w/--words, -c/--chars and -l/--lines select the
# counts to print; -v/--verbose turns on the diagnostic line below.
opts = GetoptLong.new(
    ["--words", "-w", GetoptLong::NO_ARGUMENT],
    ["--chars", "-c", GetoptLong::NO_ARGUMENT],
    ["--lines", "-l", GetoptLong::NO_ARGUMENT],
    ["--verbose", "-v", GetoptLong::NO_ARGUMENT]
    )

words = false
lines = false
chars = false
$verbose = false

begin
    opts.each do | opt, arg|
        case opt
        when "--words"
            words = true
        when "--chars"
            chars = true
        when "--lines"
            lines = true
        when "--verbose"
            $verbose = true
        end
    end
rescue GetoptLong::Error
    puts "Illegal command line option."
    # non-zero status so callers can detect the failure
    exit 1
end

# With no selection flag, report all three counts (as Unix wc does)
# instead of printing rows with no numbers in them.
words = lines = chars = true unless words || lines || chars

# Totals are kept and printed only when several files are named.
accumulate = ARGV.length > 1
wcl_totals = Hash.new(0)

# Diagnostic output, shown only on request.
puts "accumulate = #{accumulate}" if $verbose

ARGV.each do |file_name|
    File.open(file_name) do | file |

        # per-file counters; missing keys read as 0
        wcl = Hash.new(0)

        while line = file.gets
            wcl[:words] += line.split.length
            wcl[:chars] += line.length
            wcl[:lines] += 1
        end

        if accumulate
            wcl_totals[:words] += wcl[:words]
            wcl_totals[:chars] += wcl[:chars]
            wcl_totals[:lines] += wcl[:lines]
        end

        puts "#{wcl[:words] if words} #{wcl[:lines] if lines} #{wcl[:chars] if chars} #{file_name}"
    end
end

puts "#{wcl_totals[:words] if words} #{wcl_totals[:lines] if lines} #{wcl_totals[:chars] if chars} Total" if accumulate

swaraj said
August 27, 2012 at 4:13 PM
ruby solution that only takes a filename (http://codepad.org/CC98XPaa) returns values lines,characters,words in that order

S	M	T	W	T	F	S
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

Programming Praxis

Word Count

December 8, 2009

11 Responses to “Word Count”

Leave a comment

Categories

Archives

Archives

Programming Praxis

Word Count

December 8, 2009

Share this:

Related

11 Responses to “Word Count”

Leave a comment

Categories

Archives

Archives