Word Frequencies

March 10, 2009

Write a program that takes a filename and a parameter n and prints the n most common words in the file, and the count of their occurrences, in descending order.

Pages: 1 2

14 Responses to “Word Frequencies”

  1. mnp said

    # Usage: append the file name. Counts runs of ASCII letters (keeping their
    # original case) and prints every word with its count, most frequent first.
    perl -ne 'while (/([a-z]+)/gi) {$words{$1}++} END{ map { print "$_ $words{$_}\n"} sort {$words{$b} <=> $words{$a}} keys %words}'

  2. FalconNL said

    A straightforward (though probably not very efficient) Haskell solution:

    import System.Environment
    import Control.Arrow
    import Data.List
    import Data.Ord

    main = do [n, fileName] <- getArgs
    content String -> [(String, Int)]
    findMostCommonWords n = take n . reverse . sortBy (comparing snd) . map (head &&& length) . group . sort . words

  3. FalconNL said

    Sigh… gotta love forms that don’t escape html characters. Let’s try that again.

    import System.Environment
    import Control.Arrow
    import Data.List
    import Data.Ord

    -- | Entry point: expects exactly two command-line arguments, the count @n@
    -- followed by the file name, and prints the @n@ most common words of that
    -- file as @(word, count)@ pairs, one per line.
    --
    -- The pattern match on the argument list fails at runtime when a different
    -- number of arguments is supplied, and 'read' fails if @n@ is not an integer.
    main :: IO ()
    main = do
      [n, fileName] <- getArgs
      content <- readFile fileName
      mapM_ print $ findMostCommonWords (read n) content

    -- | Return the @n@ most frequent whitespace-separated words of a text,
    -- paired with their occurrence counts, highest count first.
    findMostCommonWords :: Int -> String -> [(String, Int)]
    findMostCommonWords n text = take n mostFrequentFirst
      where
        -- Sorting puts equal words next to each other so 'group' can bundle them.
        runs = group (sort (words text))
        tallies = [(head run, length run) | run <- runs]
        -- Ascending sort by count, then reversed to get descending order.
        mostFrequentFirst = reverse (sortBy (comparing snd) tallies)

  4. cdsboy said

    Interesting challenge, here’s my solution in python.

    http://pastebin.com/f1ac76000

  5. Bengt said

    // Prints the `top` most frequent whitespace-separated words of the file
    // `filename`, most frequent first, one "word - count" line per word.
    Action<string, int> CountWords = (filename, top) =>
    {
        foreach (var kv in
            File.ReadAllText(filename)
                // A null separator array means "split on whitespace";
                // RemoveEmptyEntries keeps runs of whitespace from producing
                // empty strings that would otherwise be counted as a word.
                .Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
                .GroupBy(w => w,
                    (w, c) => new
                    { Word = w, Count = c.Count() })
                .OrderByDescending(a => a.Count)
                .Take(top))
            Console.WriteLine(kv.Word + " - " + kv.Count);
    };

  6. […] Dictionaries are a common data type, which we have used in several exercises (Mark V. Shaney, Word Frequencies, Dodgson’s Doublets, Anagrams). Hash tables are often used as the underlying implementation […]

  7. 
    import Data.Char
    import Data.Function
    import Data.List
    import qualified Data.Map as M
    import qualified Data.Map.Strict as MS
    import System.Environment
    
    
    -- | A token counts as a word only when it has at least three characters.
    isWord :: String -> Bool
    isWord token = length token >= 3
    
    -- | Count the occurrences of every word in the text.
    --
    -- Each whitespace-separated token is cut at its first non-letter character,
    -- lower-cased, and kept only if 'isWord' accepts it.
    buildFrequencyMap :: String -> M.Map String Int
    buildFrequencyMap content = foldl' insertWord M.empty mots
      where
        -- The strict 'MS.insertWith' evaluates each updated count as it is
        -- stored, so no chain of suspended additions builds up per word.
        -- "Data.Map" no longer exports @insertWith'@.
        insertWord m word = MS.insertWith (+) word 1 m
        mots = filter isWord $ map (map toLower . takeWhile isLetter) (words content)
    
    -- | List the (word, count) pairs of the map ordered from the highest count
    -- to the lowest.
    sortByFrequency :: M.Map String Int -> [(String, Int)]
    sortByFrequency = reverse . sortBy ascendingCount . M.toList
      where
        -- Compare two entries by their counts only.
        ascendingCount (_, a) (_, b) = compare a b
    
    -- | Print the first @n@ entries, one per line, formatted as @count -> word@.
    displayWords :: Int -> [(String, Int)] -> IO ()
    displayWords n frequencies =
      mapM_ putStrLn [show count ++ " -> " ++ word | (word, count) <- take n frequencies]
    
    -- Entry point: takes the number of words to show and the file name, in that
    -- order, and prints the most frequent words of the file. Any other number of
    -- arguments aborts with an error.
    main = do
      args <- getArgs
      case args of
        [count, path] -> do
          content <- readFile path
          displayWords (read count) (sortByFrequency (buildFrequencyMap content))
        _ -> error "Deux arguments requis"
    
  8. slabounty said

    Here it is in ruby (commented so I don’t have to do it elsewhere) …

    # Write a program that takes a filename and a parameter n and prints the n most
    # common words in the file, and the count of their occurrences, in descending
    # order.
    
    # Command-line option processing comes from the getoptlong library.
    require 'getoptlong'
    
    
    # Recognised options: --number/-n needs a value, --verbose/-v is a switch.
    opts = GetoptLong.new(
        ["--number", "-n", GetoptLong::REQUIRED_ARGUMENT],
        ["--verbose", "-v", GetoptLong::NO_ARGUMENT]
        )
    
    # Values used when an option is not supplied.
    number = 10
    $verbose = false
    
    # Walk over the options that were given. An unrecognised option raises
    # while iterating; in that case print a message and stop.
    begin
        opts.each do |name, value|
            if name == "--number"
                number = value.to_i
            elsif name == "--verbose"
                $verbose = true
            end
        end
    rescue
        puts "Illegal command line option."
        exit
    end
    
    # Word => number of occurrences; words not seen yet start at 0.
    word_freq = Hash.new(0)
    
    # Every remaining argument is taken to be a file name.
    ARGV.each do |file_name|
        # File.foreach yields the file line by line and closes it afterwards.
        File.foreach(file_name) do |line|
            # Collect maximal runs of characters that are neither non-word
            # characters nor digits (letters and underscores). Scanning for the
            # words themselves, instead of splitting on the separators, never
            # yields empty strings, so nothing has to be filtered out. To count
            # numbers as well, use /\w+/ instead.
            line.scan(/[^\W\d]+/) do |word|
                word_freq[word] += 1
            end
        end
    end
    
    # Print the n most frequent words, most frequent first, as "word count".
    # Sorting on the negated count gives descending frequency in one step.
    # The limit is clamped at zero so that a negative --number prints nothing
    # instead of raising.
    word_freq.sort_by { |_word, count| -count }.first([number, 0].max).each do |word, count|
        puts "#{word} #{count}"
    end
    
  9. kawas said

    The Clojure library has a handy frequencies function.

    (defn word-frequencies
      "Returns the n most frequent words in the file at filepath as [word count]
      entries, most frequent first. The text is lower-cased first, and a word is
      a maximal run of the letters a-z."
      [filepath n]
      ;; re-seq yields only the matched words, so text that starts with a
      ;; non-letter cannot contribute an empty string as a word, which
      ;; splitting on the separators would do.
      (let [wrds (re-seq #"[a-z]+" (.toLowerCase (slurp filepath)))]
        (take n (reverse (sort-by second (frequencies wrds))))))
    
  10. Hello guys,

    Check out my solution in Python. I went a bit further and stripped punctuation characters from the source text,
    which gives more relevant output.

    from operator import itemgetter
    from string import punctuation
    
    def countWords(filename, n):
      """Print the n most frequent words of a file as a numbered list.

      Each whitespace-separated token has leading and trailing punctuation
      stripped and is lower-cased. Output lines look like "1. word - count",
      most frequent first.

      filename -- path of the text file to read
      n        -- how many words to print (anything int() accepts)
      """
      words = {}
      # The with-block closes the file even if counting raises.
      with open(filename) as source:
        for line in source:
          for token in line.split():
            word = token.strip(punctuation).lower()
            # Tokens made only of punctuation (e.g. "--") strip down to "";
            # those are not words, so skip them.
            if word:
              words[word] = words.get(word, 0) + 1

      # sort by count, highest first, and keep the first n entries
      sort_words = sorted(words.items(), key=itemgetter(1), reverse=True)[:int(n)]
      for index, (word, count) in enumerate(sort_words):
        # A single parenthesised argument prints the same text whether print
        # is a statement or a function.
        print("%d. %s - %d" % (index + 1, word, count))
    
    if __name__ == "__main__":
      from optparse import OptionParser

      # Command line: -f/--file names the input file, -n/--num the number of
      # words to display.
      parser = OptionParser()
      parser.add_option("-f", "--file", dest="filename",
                        help="Select a FILE", metavar="FILE")
      parser.add_option("-n", "--num", help="Number of words to display")
      (options, args) = parser.parse_args()
      # Both options are needed. Without this check a missing one reaches
      # countWords as None and fails with a traceback instead of a usage message.
      if options.filename is None or options.num is None:
        parser.error("both --file and --num are required")
      countWords(options.filename, options.num)
    
  11. Lucas A. Brown said
    #! /usr/bin/env python
    
    def word_frequencies(filename):
        """Count how often each word occurs in the file `filename`.

        Every character that is not an ASCII letter is treated as a separator
        and the letters are lower-cased, so words are runs of a-z.

        Returns a dict mapping each word to its number of occurrences.
        """
        # `with` closes the file even if reading raises.
        with open(filename, "r") as g:
            f = g.read()
        letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
        f = ''.join((c.lower() if c in letters else " ") for c in f)
        words = {}
        # str.split() with no argument never yields empty strings, so no
        # empty-word check is needed.
        for w in f.split():
            words[w] = words.get(w, 0) + 1
        return words
    
    if __name__ == "__main__":
        from sys import argv
        # argv[1] is the file to analyse, argv[2] the number of words to show.
        counts = word_frequencies(argv[1])
        ranked = [(word, counts[word]) for word in counts]
        # Ascending sort by count followed by a reversal puts the most
        # frequent words first.
        ranked.sort(key=lambda pair: pair[1])
        ranked.reverse()
        # Print at most argv[2] entries as "word count". range() and a
        # single-argument print(...) work under both Python 2 and Python 3.
        for i in range(min(len(ranked), int(argv[2]))):
            print("%s %s" % (ranked[i][0], ranked[i][1]))
    
  12. ? said

    # In Ruby

    # ARGV[0] is the file to read, ARGV[1] the number of words to print.
    text = File.read(ARGV[0])
    n = ARGV[1].to_i

    # Count each run of word characters and print the n most frequent as
    # "word: count" lines, most frequent first.
    puts text
      .scan(/\w+/)
      .group_by(&:itself)
      .map { |k, v| [k, v.count] }
      .sort_by { |_, v| -v }
      .take(n)
      .map { |k, v| "#{k}: #{v}" }

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: