N-Gram Frequency
February 20, 2018
We use a two-step process. First, we collect the list of n-grams. Second, we call sort and uniq-c to prepare the frequency table:
(define (freq str . size)
(let ((size (if (pair? size) (car size) 1)))
(do ((i 0 (+ i 1)) (blocks (list) (cons (substring str i (+ i size)) blocks)))
((< (string-length str) (+ i size)) (uniq-c string=? (sort string<? blocks))))))
And here’s the example task; we sort by n-gram text rather than highest frequency to lowest, but you can sort the other way if you prefer:
> (freq "Programming Praxis" 2)
((" P" . 1) ("Pr" . 2) ("am" . 1) ("ax" . 1) ("g " . 1) ("gr" . 1)
("in" . 1) ("is" . 1) ("mi" . 1) ("mm" . 1) ("ng" . 1)
("og" . 1) ("ra" . 2) ("ro" . 1) ("xi" . 1))
You can run the program at https://ideone.com/KCjbtr.
P.S. I discovered after I wrote and scheduled this exercise that I had already done it a few years ago. Fortunately, I used the same algorithm: collect blocks, then sort and count.
Using a deque we can iterate through the string, adding each character to the deque, keeping count of the deques’ size, once it reaches the n-gram size, add to result, and
dequethe first item.from collections import deque def ngrams(word, size): res, grams, w = [], deque(), '' for c in word: w += c grams.append(c) if len(grams) == size: res.append(w) w = w[1:] grams.popleft() return res print(ngrams("Programming Praxis", 2)) # => ['Pr', 'ro', 'og', 'gr', 'ra', 'am', 'mm', 'mi', 'in', 'ng', 'g ', ' P', 'Pr', 'ra', 'ax', 'xi', 'is']from collections import deque def ngrams(word, size): freq, grams, w = {}, deque(), '' for c in word: w += c grams.append(c) if len(grams) == size: freq[w] = freq[w]+1 if w in freq else 0 w = w[1:] grams.popleft() return freq print(ngrams("Programming Praxis", 2)) # => {'Pr': 2, 'ro': 1, 'og': 1, 'gr': 1, 'ra': 2, 'am': 1, 'mm': 1, 'mi': 1, 'in': 1, 'ng': 1, 'g ': 1, ' P': 1, 'ax': 1, 'xi': 1, 'is': 1}Here’s a solution in Python 2.
from collections import Counter def count_ngrams(s, n): ngrams = (s[x:x+n] for x in xrange(len(s)-n+1)) return dict(Counter(ngrams)) s = "Programming Praxis" ngram_counts = count_ngrams(s, 2) for ngram, count in ngram_counts.iteritems(): print "'{}' {}".format(ngram, count)Output:
Here’s a Haskell version.
import Data.Int (Int64) import Data.List (sortBy) import Data.Ord (Down(..), comparing) import System.Environment (getArgs) import Text.Printf (printf) import Text.Read (readMaybe) import qualified Data.Map.Strict as M import qualified Data.Text.Lazy as T import qualified Data.Text.Lazy.IO as IO -- Given a string return a map from substrings of length n to the number of -- times they occur. ngramMap :: Int64 -> T.Text -> M.Map T.Text Int ngramMap n = M.fromListWith incr . filter (hasLen n) . map (chars n) . T.tails where incr i _ = i + 1 hasLen m = (== m) . T.length . fst chars m t = (T.take m t, 1) -- Sort a list in decreasing order on the second element. sortByCount :: Ord b => [(a, b)] -> [(a, b)] sortByCount = sortBy (comparing (Down . snd)) -- Read some text, count the occurrences of each ngram, then print the results. testNgrams :: Int64 -> IO () testNgrams n = do text <- IO.getContents let ngrams = sortByCount $ M.toList $ ngramMap n text mapM_ (uncurry (printf "%s %d\n")) ngrams main :: IO () main = do args <- getArgs case map readMaybe args of [Just n] -> testNgrams n _ -> putStrLn "Usage: ngrams ngram-length"Generic Python, using dictionaries
try: range = xrange except: pass if __name__ == "__main__": T = "Programming Praxis" n = 2 grams = {} for k in range(len(T) - n + 1): try: grams[T[k:k+n]] += 1 except: grams[T[k:k+n]] = 1 print (grams){‘Pr’: 2, ‘ro’: 1, ‘og’: 1, ‘gr’: 1, ‘ra’: 2, ‘am’: 1, ‘mm’: 1, ‘mi’: 1, ‘in’: 1, ‘ng’: 1, ‘g ‘: 1, ‘ P’: 1, ‘ax’: 1, ‘xi’: 1, ‘is’: 1}
Here’s my smart-alec J version:
That gives:
The slightly-kinder-formatted version (not being obtuse) is:
Ah, J. I’ll never use you for real work, but you’re so fun.
Java…
public static Map ngrams(int size, String read){ HashMap histogram = new HashMap(); int head = 0; int tail = size; while(tail i + 1) .orElse(1) ); head++; tail++; } return histogram; }err… that didn’t work…
public static Map<String, Integer> ngrams(int size, String read){ HashMap histogram = new HashMap(); int head = 0; int tail = size; while(tail < read.length()){ String nGram = read.substring(head, tail); histogram.put(nGram, Optional.ofNullable(histogram.get(nGram)) .map(i-> i + 1) .orElse(1) ); head++; tail++; } return histogram; }