N-Gram Frequency

February 20, 2018

Given a string of characters and an n-gram size n, prepare a frequency table of all combinations of size n blocks of characters. For instance, with the string “Programming Praxis” and an n-gram size of 2, the n-grams are “Pr”, “ro”, “og”, “gr”, “ra”, “am”, “mm”, “mi”, “in”, “ng”, “g “, ” P”, “Pr”, “ra”, “ax”, “xi”, and “is”; the two n-grams “Pr” and “ra” appear twice, and all the others appear once.

Your task is to write a program that computes a frequency table of n-grams. When you are finished, you are welcome to read or run a suggested solution, or to post your own solution or discuss the exercise in the comments below.

Advertisement

Pages: 1 2

8 Responses to “N-Gram Frequency”

  1. Using a deque we can iterate through the string, adding each character to the deque, keeping count of the deques’ size, once it reaches the n-gram size, add to result, and deque the first item.

    from collections import deque
    
    def ngrams(word, size):
      res, grams, w = [], deque(), ''
    
      for c in word:
        w += c
        grams.append(c)
    
        if len(grams) == size:
          res.append(w)
          w = w[1:]
          grams.popleft()
    
      return res
    
    print(ngrams("Programming Praxis", 2)) # => ['Pr', 'ro', 'og', 'gr', 'ra', 'am', 'mm', 'mi', 'in', 'ng', 'g ', ' P', 'Pr', 'ra', 'ax', 'xi', 'is']
    
    
  2. from collections import deque
    
    def ngrams(word, size):
      freq, grams, w = {}, deque(), ''
    
      for c in word:
        w += c
        grams.append(c)
    
        if len(grams) == size:
          freq[w] = freq[w]+1 if w in freq else 0
          w = w[1:]
          grams.popleft()
    
      return freq
    
    print(ngrams("Programming Praxis", 2)) # => {'Pr': 2, 'ro': 1, 'og': 1, 'gr': 1, 'ra': 2, 'am': 1, 'mm': 1, 'mi': 1, 'in': 1, 'ng': 1, 'g ': 1, ' P': 1, 'ax': 1, 'xi': 1, 'is': 1}
    
    
  3. Daniel said

    Here’s a solution in Python 2.

    from collections import Counter
    
    def count_ngrams(s, n):
      ngrams = (s[x:x+n] for x in xrange(len(s)-n+1))
      return dict(Counter(ngrams))
    
    s = "Programming Praxis"
    ngram_counts = count_ngrams(s, 2)
    for ngram, count in ngram_counts.iteritems():
      print "'{}' {}".format(ngram, count)
    

    Output:

    'Pr' 2
    'xi' 1
    'gr' 1
    'og' 1
    'am' 1
    'is' 1
    'in' 1
    'mi' 1
    'ng' 1
    'mm' 1
    ' P' 1
    'ra' 2
    'g ' 1
    'ax' 1
    'ro' 1
    
  4. Globules said

    Here’s a Haskell version.

    import Data.Int (Int64)
    import Data.List (sortBy)
    import Data.Ord (Down(..), comparing)
    import System.Environment (getArgs)
    import Text.Printf (printf)
    import Text.Read (readMaybe)
    import qualified Data.Map.Strict as M
    import qualified Data.Text.Lazy as T
    import qualified Data.Text.Lazy.IO as IO
    
    -- Given a string return a map from substrings of length n to the number of
    -- times they occur.
    ngramMap :: Int64 -> T.Text -> M.Map T.Text Int
    ngramMap n = M.fromListWith incr . filter (hasLen n) . map (chars n) . T.tails
      where incr i _ = i + 1
            hasLen m = (== m) . T.length . fst
            chars m t = (T.take m t, 1)
    
    -- Sort a list in decreasing order on the second element.
    sortByCount :: Ord b => [(a, b)] -> [(a, b)]
    sortByCount = sortBy (comparing (Down . snd))
    
    -- Read some text, count the occurrences of each ngram, then print the results.
    testNgrams :: Int64 -> IO ()
    testNgrams n = do
      text <- IO.getContents
      let ngrams = sortByCount $ M.toList $ ngramMap n text
      mapM_ (uncurry (printf "%s %d\n")) ngrams
    
    main :: IO ()
    main = do
      args <- getArgs
      case map readMaybe args of
        [Just n] -> testNgrams n
        _        -> putStrLn "Usage: ngrams ngram-length"
    
    $ printf 'Programming Praxis' | ./ngrams 2
    Pr 2
    ra 2
     P 1
    am 1
    ax 1
    g  1
    gr 1
    in 1
    is 1
    mi 1
    mm 1
    ng 1
    og 1
    ro 1
    xi 1
    $ printf 'Mississippi' | ./ngrams 4
    issi 2
    Miss 1
    ippi 1
    sipp 1
    siss 1
    ssip 1
    ssis 1
    
  5. Milbrae said

    Generic Python, using dictionaries

    try:
        range = xrange
    except:
        pass
    
    if __name__ == "__main__":
    
        T = "Programming Praxis"
        n = 2
    
        grams = {}
        for k in range(len(T) - n + 1):
            try:
                grams[T[k:k+n]] += 1
            except:
                grams[T[k:k+n]] = 1
            
        print (grams)
    

    {‘Pr’: 2, ‘ro’: 1, ‘og’: 1, ‘gr’: 1, ‘ra’: 2, ‘am’: 1, ‘mm’: 1, ‘mi’: 1, ‘in’: 1, ‘ng’: 1, ‘g ‘: 1, ‘ P’: 1, ‘ax’: 1, ‘xi’: 1, ‘is’: 1}

  6. Johann Hibschman said

    Here’s my smart-alec J version:

    ngrams=: [:(~.,:<@#/.~)2<\[
    

    That gives:

    ngrams 'Programming Praxis'
    +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
    |Pr|ro|og|gr|ra|am|mm|mi|in|ng|g | P|ax|xi|is|
    +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
    |2 |1 |1 |1 |2 |1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |
    +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
    

    The slightly-kinder-formatted version (not being obtuse) is:

    ngrams=: [: (~. ,: <@#/.~) 2 <\ [
    

    Ah, J. I’ll never use you for real work, but you’re so fun.

  7. Robert Cooper said

    Java…

     public static Map ngrams(int size, String read){
            HashMap histogram = new HashMap();
            int head = 0;
            int tail = size;
            while(tail  i + 1)
                        .orElse(1)
                );
                head++;
                tail++;
            }
            return histogram;
        }
    

  8. err… that didn’t work…

    
     public static Map<String, Integer> ngrams(int size, String read){
            HashMap histogram = new HashMap();
            int head = 0;
            int tail = size;
            while(tail < read.length()){
                String nGram = read.substring(head, tail);
                histogram.put(nGram,
                        Optional.ofNullable(histogram.get(nGram))
                        .map(i-> i + 1)
                        .orElse(1)
                );
                head++;
                tail++;
            }
            return histogram;
        }
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: