N-Gram Frequency

February 20, 2018

We use a two-step process. First, we collect the list of n-grams. Second, we call sort and uniq-c to prepare the frequency table:

(define (freq str . size)
  (let ((size (if (pair? size) (car size) 1)))
    (do ((i 0 (+ i 1)) (blocks (list) (cons (substring str i (+ i size)) blocks)))
      ((< (string-length str) (+ i size)) (uniq-c string=? (sort string<? blocks))))))

And here’s the example task; we sort by n-gram text rather than highest frequency to lowest, but you can sort the other way if you prefer:

> (freq "Programming Praxis" 2)
((" P" . 1) ("Pr" . 2) ("am" . 1) ("ax" . 1) ("g " . 1) ("gr" . 1)
("in" . 1) ("is" . 1) ("mi" . 1) ("mm" . 1) ("ng" . 1)
("og" . 1) ("ra" . 2) ("ro" . 1) ("xi" . 1))

You can run the program at https://ideone.com/KCjbtr.

P.S. I discovered after I wrote and scheduled this exercise that I had already done it a few years ago. Fortunately, I used the same algorithm: collect blocks, then sort and count.

Posted by programmingpraxis

Filed in Exercises

8 Comments »

8 Responses to “N-Gram Frequency”

Sochima Biereagu, KodeJuice said

February 20, 2018 at 4:14 PM

Using a deque we can iterate through the string, adding each character to the deque, keeping count of the deques’ size, once it reaches the n-gram size, add to result, and deque the first item.

from collections import deque

def ngrams(word, size):
  res, grams, w = [], deque(), ''

  for c in word:
    w += c
    grams.append(c)

    if len(grams) == size:
      res.append(w)
      w = w[1:]
      grams.popleft()

  return res

print(ngrams("Programming Praxis", 2)) # => ['Pr', 'ro', 'og', 'gr', 'ra', 'am', 'mm', 'mi', 'in', 'ng', 'g ', ' P', 'Pr', 'ra', 'ax', 'xi', 'is']

Sochima Biereagu, KodeJuice said

February 20, 2018 at 5:27 PM

from collections import deque

def ngrams(word, size):
  freq, grams, w = {}, deque(), ''

  for c in word:
    w += c
    grams.append(c)

    if len(grams) == size:
      freq[w] = freq[w]+1 if w in freq else 0
      w = w[1:]
      grams.popleft()

  return freq

print(ngrams("Programming Praxis", 2)) # => {'Pr': 2, 'ro': 1, 'og': 1, 'gr': 1, 'ra': 2, 'am': 1, 'mm': 1, 'mi': 1, 'in': 1, 'ng': 1, 'g ': 1, ' P': 1, 'ax': 1, 'xi': 1, 'is': 1}

Daniel said

February 21, 2018 at 3:05 AM

Here’s a solution in Python 2.

from collections import Counter

def count_ngrams(s, n):
  ngrams = (s[x:x+n] for x in xrange(len(s)-n+1))
  return dict(Counter(ngrams))

s = "Programming Praxis"
ngram_counts = count_ngrams(s, 2)
for ngram, count in ngram_counts.iteritems():
  print "'{}' {}".format(ngram, count)

Output:

'Pr' 2
'xi' 1
'gr' 1
'og' 1
'am' 1
'is' 1
'in' 1
'mi' 1
'ng' 1
'mm' 1
' P' 1
'ra' 2
'g ' 1
'ax' 1
'ro' 1

Globules said

February 21, 2018 at 5:14 AM

Here’s a Haskell version.

import Data.Int (Int64)
import Data.List (sortBy)
import Data.Ord (Down(..), comparing)
import System.Environment (getArgs)
import Text.Printf (printf)
import Text.Read (readMaybe)
import qualified Data.Map.Strict as M
import qualified Data.Text.Lazy as T
import qualified Data.Text.Lazy.IO as IO

-- Given a string return a map from substrings of length n to the number of
-- times they occur.
ngramMap :: Int64 -> T.Text -> M.Map T.Text Int
ngramMap n = M.fromListWith incr . filter (hasLen n) . map (chars n) . T.tails
  where incr i _ = i + 1
        hasLen m = (== m) . T.length . fst
        chars m t = (T.take m t, 1)

-- Sort a list in decreasing order on the second element.
sortByCount :: Ord b => [(a, b)] -> [(a, b)]
sortByCount = sortBy (comparing (Down . snd))

-- Read some text, count the occurrences of each ngram, then print the results.
testNgrams :: Int64 -> IO ()
testNgrams n = do
  text <- IO.getContents
  let ngrams = sortByCount $ M.toList $ ngramMap n text
  mapM_ (uncurry (printf "%s %d\n")) ngrams

main :: IO ()
main = do
  args <- getArgs
  case map readMaybe args of
    [Just n] -> testNgrams n
    _        -> putStrLn "Usage: ngrams ngram-length"

$ printf 'Programming Praxis' | ./ngrams 2
Pr 2
ra 2
 P 1
am 1
ax 1
g  1
gr 1
in 1
is 1
mi 1
mm 1
ng 1
og 1
ro 1
xi 1
$ printf 'Mississippi' | ./ngrams 4
issi 2
Miss 1
ippi 1
sipp 1
siss 1
ssip 1
ssis 1

Milbrae said

February 21, 2018 at 1:11 PM

Generic Python, using dictionaries

try:
    range = xrange
except:
    pass

if __name__ == "__main__":

    T = "Programming Praxis"
    n = 2

    grams = {}
    for k in range(len(T) - n + 1):
        try:
            grams[T[k:k+n]] += 1
        except:
            grams[T[k:k+n]] = 1
        
    print (grams)

{‘Pr’: 2, ‘ro’: 1, ‘og’: 1, ‘gr’: 1, ‘ra’: 2, ‘am’: 1, ‘mm’: 1, ‘mi’: 1, ‘in’: 1, ‘ng’: 1, ‘g ‘: 1, ‘ P’: 1, ‘ax’: 1, ‘xi’: 1, ‘is’: 1}

Johann Hibschman said

February 22, 2018 at 2:17 PM

Here’s my smart-alec J version:

ngrams=: [:(~.,:<@#/.~)2<\[

That gives:

ngrams 'Programming Praxis'
+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|Pr|ro|og|gr|ra|am|mm|mi|in|ng|g | P|ax|xi|is|
+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|2 |1 |1 |1 |2 |1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |
+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+

The slightly-kinder-formatted version (not being obtuse) is:

ngrams=: [: (~. ,: <@#/.~) 2 <\ [

Ah, J. I’ll never use you for real work, but you’re so fun.

Robert Cooper said

March 5, 2018 at 8:57 PM

Java…


 public static Map ngrams(int size, String read){
        HashMap histogram = new HashMap();
        int head = 0;
        int tail = size;
        while(tail  i + 1)
                    .orElse(1)
            );
            head++;
            tail++;
        }
        return histogram;
    }

Robert "kebernet" Cooper said

March 5, 2018 at 8:58 PM

err… that didn’t work…



 public static Map<String, Integer> ngrams(int size, String read){
        HashMap histogram = new HashMap();
        int head = 0;
        int tail = size;
        while(tail < read.length()){
            String nGram = read.substring(head, tail);
            histogram.put(nGram,
                    Optional.ofNullable(histogram.get(nGram))
                    .map(i-> i + 1)
                    .orElse(1)
            );
            head++;
            tail++;
        }
        return histogram;
    }

S	M	T	W	T	F	S
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28

Programming Praxis

N-Gram Frequency

February 20, 2018

8 Responses to “N-Gram Frequency”

Leave a comment

Categories

Archives

Archives

Programming Praxis

N-Gram Frequency

February 20, 2018

Share this:

Related

8 Responses to “N-Gram Frequency”

Leave a comment

Categories

Archives

Archives