Porter Stemming

September 8, 2009

Our solution follows the reference implementation in the three places where it differs from the original algorithm. We represent a word as a list of characters in reverse order; thus the word praxis is represented by the list (#\s #\i #\x #\a #\r #\p). Storing the word as a list of characters eliminates all the arithmetic present in the original program; storing the word in reverse order makes it easy to access the end of the word, where all the action is. Our first functions determine if a letter is a vowel or consonant:

(define (consonant? cs)
  (case (car cs)
    ((#\a #\e #\i #\o #\u) #f)
    ((#\y) (if (null? (cdr cs)) #t (vowel? (cdr cs))))
    (else #t)))

(define (vowel? cs) (not (consonant? cs)))

We implement measure differently than Porter. Since the measure increases by one any time a consonant follows a vowel, we scan through the string, keeping track of whether the previous letter was a vowel or consonant; we first reverse the string into the zs list, determining if the letter is a vowel or consonant as we go:

(define (measure cs)
  (let loop ((m 0) (cs cs) (zs '()) (prev? #t))
    (cond ((pair? cs)
            (loop m (cdr cs) (cons (if (consonant? cs) #\c #\v) zs) prev?))
          ((null? zs) m)
          ((char=? (car zs) #\v) (loop m cs (cdr zs) #f))
          ((not prev?) (loop (+ m 1) cs (cdr zs) #t))
          (else (loop m cs (cdr zs) #t)))))

The vowel-in-stem, double-consonant and consonant-vowel-consonant tests are simple:

(define (v-stem? cs)
  (let loop ((cs cs))
    (cond ((null? cs) #f)
          ((vowel? cs) #t)
          (else (loop (cdr cs))))))

(define (dbl-cons? cs)
  (cond ((or (null? cs) (null? (cdr cs))) #f)
        ((not (char=? (car cs) (cadr cs))) #f)
        (else (consonant? cs))))

(define (cvc? cs)
  (cond ((< (length cs) 3) #f)
        ((vowel? cs) #f)
        ((consonant? (cdr cs)) #f)
        ((vowel? (cddr cs)) #f)
        ((char=? (car cs) #\w) #f)
        ((char=? (car cs) #\x) #f)
        ((char=? (car cs) #\y) #f)
        (else #t)))

The (ends? cs str) function returns #f if str is not a suffix of cs or the remaining stem if it is:

(define (ends? cs str)
  (let loop ((cs cs) (ss (reverse (string->list str))))
    (cond ((null? cs) #f)
          ((null? ss) cs)
          ((char=? (car cs) (car ss))
            (loop (cdr cs) (cdr ss)))
          (else #f))))

We are ready now to look at the actual stemming algorithm. Step 1a converts plural words to singular:

(define (step1a cs)
  (cond ((null? cs) cs) ((null? (cdr cs)) cs)
        ((ends? cs "sses") => (s "ss"))
        ((ends? cs "ies" ) => (s "i" ))
        ((ends? cs "ss"  ) => (s "ss"))
        ((ends? cs "s"   ) => (lambda (cs) cs))
        (else cs)))

Here we see why ends? was written as it was. The => operator of a cond clause takes the result of a predicate, if it is non-#f, and passes it to a single-argument function in its consequent; thus, s is a higher-order function, similar to the setto function of the reference implementation, that takes a string str and returns a function that takes a list cs representing a word stem (the word stem returned by the ends? function) and returns a new cs list with the characters of str prepended to the front of cs in reverse order:

(define (s str)
  (lambda (cs)
    (append (reverse (string->list str)) cs)))

Note that the final clause in the step1a function is the identity function, which returns the word stem unchanged. Step 1b handles past participles:

(define (step1b cs)
  (cond ((null? cs) cs) ((null? (cdr cs)) cs)
        ((ends? cs "eed") => (r "eed" "ee"))
        ((ends? cs "ed" ) => (q "ed"))
        ((ends? cs "ing") => (q "ing"))
        (else cs)))

R and q are higher-order functions, similar to s; r is similar to the r function of the reference implementation, and q implements the “continuation” logic for -ed and -ing endings:

(define (r str1 str2)
  (lambda (cs)
    (if (< 0 (measure cs))
        (append (reverse (string->list str2)) cs)
        (append (reverse (string->list str1)) cs))))

(define (q str)
  (lambda (cs)
    (if (not (v-stem? cs))
        (append (reverse (string->list str)) cs)
        (cond ((ends? cs "at") => (s "ate"))
              ((ends? cs "bl") => (s "ble"))
              ((ends? cs "iz") => (s "ize"))
              ((and (dbl-cons? cs)
                    (not (member (car cs) '(#\l #\s #\z))))
                (cdr cs))
              ((and (= (measure cs) 1) (cvc? cs))
                (cons #\e cs))
              (else cs)))))

Step 1c does some cleaning up:

(define (step1c cs)
  (cond ((null? cs) cs) ((null? (cdr cs)) cs)
        ((ends? cs "y") =>
          (lambda (cs)
            (if (v-stem? cs)
                (cons #\i cs)
                (cons #\y cs))))
        (else cs)))

Step 2 handles words with double suffices; the case syntax provides the “second-last letter” optimization that Porter describes in his algorithm:

(define (step2 cs)
  (if (or (null? cs) (null? (cdr cs))) cs
    (case (cadr cs)
      ((#\a) (cond ((ends? cs "ational") => (r "ational" "ate" ))
                   ((ends? cs "tional" ) => (r "tional"  "tion"))
                   (else cs)))
      ((#\c) (cond ((ends? cs "enci"   ) => (r "enci"    "ence"))
                   ((ends? cs "anci"   ) => (r "anci"    "ance"))
                   (else cs)))
      ((#\e) (cond ((ends? cs "izer"   ) => (r "izer"    "ize" ))
                   (else cs)))
      ((#\g) (cond ((ends? cs "logi"   ) => (r "logi"    "log" ))
                   (else cs)))
      ((#\l) (cond ((ends? cs "bli"    ) => (r "bli"     "ble" ))
                   ((ends? cs "alli"   ) => (r "alli"    "al"  ))
                   ((ends? cs "entli"  ) => (r "entli"   "ent" ))
                   ((ends? cs "eli"    ) => (r "eli"     "e"   ))
                   ((ends? cs "ousli"  ) => (r "ousli"   "ous" ))
                   (else cs)))
      ((#\o) (cond ((ends? cs "ization") => (r "ization" "ize" ))
                   ((ends? cs "ation"  ) => (r "ation"   "ate" ))
                   ((ends? cs "ator"   ) => (r "ator"    "ate" ))
                   (else cs)))
      ((#\s) (cond ((ends? cs "alism"  ) => (r "alism"   "al"  ))
                   ((ends? cs "iveness") => (r "iveness" "ive" ))
                   ((ends? cs "fulness") => (r "fulness" "ful" ))
                   ((ends? cs "ousness") => (r "ousness" "ous" ))
                   (else cs)))
      ((#\t) (cond ((ends? cs "aliti"  ) => (r "aliti"   "al"  ))
                   ((ends? cs "iviti"  ) => (r "iviti"   "ive" ))
                   ((ends? cs "biliti" ) => (r "biliti"  "ble" ))
                   (else cs)))
      (else cs))))

Step 3, which handles words ending in -ic, -ful, -ness, and other similar suffices, is similar to Step2:

(define (step3 cs)
  (if (or (null? cs) (null? (cdr cs))) cs
    (case (car cs)
      ((#\e) (cond ((ends? cs "icate"  ) => (r "icate"   "ic"  ))
                   ((ends? cs "ative"  ) => (r "ative"   ""    ))
                   ((ends? cs "alize"  ) => (r "alize"   "al"  ))
                   (else cs)))
      ((#\i) (cond ((ends? cs "iciti"  ) => (r "iciti"   "ic"  ))
                   (else cs)))
      ((#\l) (cond ((ends? cs "ical"   ) => (r "ical"    "ic"  ))
                   ((ends? cs "ful"    ) => (r "ful"     ""    ))
                   (else cs)))
      ((#\s) (cond ((ends? cs "ness"   ) => (r "ness"    ""    ))
                   (else cs)))
      (else cs))))

Step 4 handles words with suffices like -ant and -ence that also have measure greater than one; it uses a t higher-order function that is similar to r and s:

(define (t str)
  (lambda (cs)
    (if (< 1 (measure cs)) cs
      (append (reverse (string->list str)) cs))))

(define (step4 cs)
  (if (or (null? cs) (null? (cdr cs))) cs
    (case (cadr cs)
      ((#\a) (cond ((ends? cs "al"     ) => (t "al"   )) (else cs)))
      ((#\c) (cond ((ends? cs "ance"   ) => (t "ance" ))
                   ((ends? cs "ence"   ) => (t "ence" )) (else cs)))
      ((#\e) (cond ((ends? cs "er"     ) => (t "er"   )) (else cs)))
      ((#\i) (cond ((ends? cs "ic"     ) => (t "ic"   )) (else cs)))
      ((#\l) (cond ((ends? cs "able"   ) => (t "able" ))
                   ((ends? cs "ible"   ) => (t "ible" )) (else cs)))
      ((#\n) (cond ((ends? cs "ant"    ) => (t "ant"  ))
                   ((ends? cs "ement"  ) => (t "ement"))
                   ((ends? cs "ment"   ) => (t "ment" ))
                   ((ends? cs "ent"    ) => (t "ent"  )) (else cs)))
      ((#\o) (cond ((ends? cs "ion"    ) => (lambda (cs)
                                              (if (and (or (char=? (car cs) #\s)
                                                           (char=? (car cs) #\t))
                                                       (< 1 (measure cs)))
                                                  cs
                                                  (append (reverse (string->list "ion")) cs))))
                   ((ends? cs "ou"     ) => (t "ou"   )) (else cs)))
      ((#\s) (cond ((ends? cs "ism"    ) => (t "ism"  )) (else cs)))
      ((#\t) (cond ((ends? cs "ate"    ) => (t "ate"  ))
                   ((ends? cs "iti"    ) => (t "iti"  )) (else cs)))
      ((#\u) (cond ((ends? cs "ous"    ) => (t "ous"  )) (else cs)))
      ((#\v) (cond ((ends? cs "ive"    ) => (t "ive"  )) (else cs)))
      ((#\z) (cond ((ends? cs "ize"    ) => (t "ize"  )) (else cs)))
      (else cs))))

Step 5a handles words ending in e, and Step 5b handles words ending in double-l:

(define (step5a cs)
  (if (not (char=? (car cs) #\e)) cs
    (let ((m (measure (cdr cs))))
      (if (< 1 m) (cdr cs)
        (if (and (= m 1) (not (cvc? (cdr cs)))) (cdr cs) cs)))))

(define (step5b cs)
  (if (and (< 1 (measure cs)) (char=? (car cs) #\l) (char=? (cadr cs) #\l))
      (cdr cs) cs))

The stem function takes a word and returns its stem, converting to lower-case in the process. Since the output of each step is in the proper form, each can be fed to the input of the next step:

(define (stem word)
  (if (< (string-length word) 3) (string-downcase word)
    (list->string (reverse
      (step5b (step5a (step4 (step3 (step2 (step1c (step1b (step1a
        (reverse (string->list (string-downcase word)))))))))))))))

A simple test checks each word given as an example in the description of the algorithm:

(define (test-stem)
  (define (cs str) (reverse (string->list str)))
  (assert (step1a (cs "caresses"       )) (cs "caress"     ))
  (assert (step1a (cs "ponies"         )) (cs "poni"       ))
  (assert (step1a (cs "ties"           )) (cs "ti"         ))
  (assert (step1a (cs "caress"         )) (cs "caress"     ))
  (assert (step1a (cs "cats"           )) (cs "cat"        ))
  (assert (step1b (cs "feed"           )) (cs "feed"       ))
  (assert (step1b (cs "agreed"         )) (cs "agree"      ))
  (assert (step1b (cs "plastered"      )) (cs "plaster"    ))
  (assert (step1b (cs "bled"           )) (cs "bled"       ))
  (assert (step1b (cs "motoring"       )) (cs "motor"      ))
  (assert (step1b (cs "sing"           )) (cs "sing"       ))
  (assert (step1b (cs "conflated"      )) (cs "conflate"   ))
  (assert (step1b (cs "troubled"       )) (cs "trouble"    ))
  (assert (step1b (cs "sized"          )) (cs "size"       ))
  (assert (step1b (cs "hopping"        )) (cs "hop"        ))
  (assert (step1b (cs "tanned"         )) (cs "tan"        ))
  (assert (step1b (cs "falling"        )) (cs "fall"       ))
  (assert (step1b (cs "hissing"        )) (cs "hiss"       ))
  (assert (step1b (cs "fizzed"         )) (cs "fizz"       ))
  (assert (step1b (cs "failing"        )) (cs "fail"       ))
  (assert (step1b (cs "filing"         )) (cs "file"       ))
  (assert (step1c (cs "happy"          )) (cs "happi"      ))
  (assert (step1c (cs "sky"            )) (cs "sky"        ))
  (assert (step2  (cs "relational"     )) (cs "relate"     ))
  (assert (step2  (cs "conditional"    )) (cs "condition"  ))
  (assert (step2  (cs "rational"       )) (cs "rational"   ))
  (assert (step2  (cs "valenci"        )) (cs "valence"    ))
  (assert (step2  (cs "hesitanci"      )) (cs "hesitance"  ))
  (assert (step2  (cs "digitizer"      )) (cs "digitize"   ))
  (assert (step2  (cs "conformabli"    )) (cs "conformable"))
  (assert (step2  (cs "radicalli"      )) (cs "radical"    ))
  (assert (step2  (cs "differentli"    )) (cs "different"  ))
  (assert (step2  (cs "vileli"         )) (cs "vile"       ))
  (assert (step2  (cs "analogousli"    )) (cs "analogous"  ))
  (assert (step2  (cs "vietnamization" )) (cs "vietnamize" ))
  (assert (step2  (cs "predication"    )) (cs "predicate"  ))
  (assert (step2  (cs "operator"       )) (cs "operate"    ))
  (assert (step2  (cs "feudalism"      )) (cs "feudal"     ))
  (assert (step2  (cs "decisiveness"   )) (cs "decisive"   ))
  (assert (step2  (cs "hopefulness"    )) (cs "hopeful"    ))
  (assert (step2  (cs "callousness"    )) (cs "callous"    ))
  (assert (step2  (cs "formaliti"      )) (cs "formal"     ))
  (assert (step2  (cs "sensitiviti"    )) (cs "sensitive"  ))
  (assert (step2  (cs "sensibiliti"    )) (cs "sensible"   ))
  (assert (step3  (cs "triplicate"     )) (cs "triplic"    ))
  (assert (step3  (cs "formative"      )) (cs "form"       ))
  (assert (step3  (cs "formalize"      )) (cs "formal"     ))
  (assert (step3  (cs "electriciti"    )) (cs "electric"   ))
  (assert (step3  (cs "electrical"     )) (cs "electric"   ))
  (assert (step3  (cs "hopeful"        )) (cs "hope"       ))
  (assert (step3  (cs "goodness"       )) (cs "good"       ))
  (assert (step4  (cs "revival"        )) (cs "reviv"      ))
  (assert (step4  (cs "allowance"      )) (cs "allow"      ))
  (assert (step4  (cs "inference"      )) (cs "infer"      ))
  (assert (step4  (cs "airliner"       )) (cs "airlin"     ))
  (assert (step4  (cs "gyroscopic"     )) (cs "gyroscop"   ))
  (assert (step4  (cs "adjustable"     )) (cs "adjust"     ))
  (assert (step4  (cs "defensible"     )) (cs "defens"     ))
  (assert (step4  (cs "irritant"       )) (cs "irrit"      ))
  (assert (step4  (cs "replacement"    )) (cs "replac"     ))
  (assert (step4  (cs "adjustment"     )) (cs "adjust"     ))
  (assert (step4  (cs "dependent"      )) (cs "depend"     ))
  (assert (step4  (cs "adoption"       )) (cs "adopt"      ))
  (assert (step4  (cs "homologou"      )) (cs "homolog"    ))
  (assert (step4  (cs "communism"      )) (cs "commun"     ))
  (assert (step4  (cs "activate"       )) (cs "activ"      ))
  (assert (step4  (cs "angulariti"     )) (cs "angular"    ))
  (assert (step4  (cs "homologous"     )) (cs "homolog"    ))
  (assert (step4  (cs "effective"      )) (cs "effect"     ))
  (assert (step4  (cs "bowdlerize"     )) (cs "bowdler"    ))
  (assert (step5a (cs "probate"        )) (cs "probat"     ))
  (assert (step5a (cs "rate"           )) (cs "rate"       ))
  (assert (step5a (cs "cease"          )) (cs "ceas"       ))
  (assert (step5b (cs "controll"       )) (cs "control"    ))
  (assert (step5b (cs "roll"           )) (cs "roll"       ))
  (assert (stem       "relate"          )     "relat"       )
  (assert (stem       "probate"         )     "probat"      )
  (assert (stem       "conflate"        )     "conflat"     )
  (assert (stem       "pirate"          )     "pirat"       )
  (assert (stem       "prelate"         )     "prelat"      )
  (assert (stem       "derivate"        )     "deriv"       )
  (assert (stem       "activate"        )     "activ"       )
  (assert (stem       "demonstrate"     )     "demonstr"    )
  (assert (stem       "necessitate"     )     "necessit"    )
  (assert (stem       "renovate"        )     "renov"       )
  (assert (stem       "archprelate"     )     "archprel"    )
  (assert (stem       "generalizations" )     "gener"       )
  (assert (stem       "oscillators"     )     "oscil"       )
)

We use string-downcase and the assert macro from the Standard Prelude. The function is reasonably fast; Porter claimed in 1960 that stemming 10,000 words took 8.1 seconds on an IBM 370 mainframe, but on a recent-vintage personal computer our function takes about a sixth of a second to process the 23,531 words of Porter’s test vocabulary, with no errors.

You can run the code at http://programmingpraxis.codepad.org/Wyj6sW6N.

About these ads

Pages: 1 2 3

6 Responses to “Porter Stemming”

  1. (module porter-stemmer scheme

    (define or-map ormap)
    (define and-map andmap)

    (define (consonant? str i)
    (let ((char (list-ref str i)))
    (cond ((member char ‘(#\a #\e #\i #\o #\u)) #f)
    ((equal? char #\y)
    (if (= i 0) #t (not (consonant? str (- i 1)))))
    (else #t))))

    ;; m() measures the number of consonant sequences between k0 and j. if c is
    ;; a consonant sequence and v a vowel sequence, and indicates arbitrary
    ;; presence,

    ;; gives 0
    ;; vc gives 1
    ;; vcvc gives 2
    ;; vcvcvc gives 3
    ;; ….

    (define (num-consonant-sequences str)
    (letrec ((helper (λ(str lim i prefix-ends-in-consonant?)
    (if (0 stem) (1 stem) (cry in step1b
    (has-vowel? (rest str)))))

    (define (ends-in-double-consonant? str)
    (let ((r (reverse str)))
    (and (< 1 (length str))
    (equal? (first r) (second r))
    (consonant? r 0))))

    ;; cvc(i) is TRUE i-2,i-1,i has the form consonant – vowel – consonant
    ;; and also if the second c is not w,x or y. this is used when trying to
    ;; restore an e at the end of a short word. e.g.

    ;; cav(e), lov(e), hop(e), crim(e), but
    ;; snow, box, tray.

    (define (cvc? str)
    (let ((last (- (length str) 1)))
    (and (< 1 last)
    (consonant? str last)
    (not (consonant? str (- last 1)))
    (consonant? str (- last 2))
    (not (member (list-ref str last) '(#\w #\x #\y))))))

    (define (cvcv? str)
    (let ((nonvowel (memf (λ(x) (not (member x '(#\a #\e #\i #\o #\u)))) (reverse str))))
    (and nonvowel (cvc? (reverse nonvowel)))))

    ;; ends(s) is TRUE k0,…k ends with the string s.

    (define (starts-with str beginning)
    “true iff beginning is a substring of str starting at 0″
    (or (if (empty? beginning) str false)
    (and (not (empty? str))
    (equal? (first str) (first beginning))
    (starts-with (rest str) (rest beginning)))))

    (define (ends-with str ending)
    (let ((stem (starts-with (reverse str) (reverse ending))))
    (if stem (reverse stem) #f)))
    (define (ends str ending)
    (ends-with str (string->list ending)))

    (define (replace-ending-if str condition ending replacement)
    (let ((stem (ends-with str (string->list ending))))
    (if stem
    (if (condition stem)
    (append stem (string->list replacement))
    str)
    #f)))

    (define (->true any) true)
    (define (rules str proc replacements)
    (or (or-map proc replacements) str))

    ;; step1ab() gets rid of plurals and -ed or -ing. e.g.

    ;; caresses -> caress
    ;; ponies -> poni
    ;; ties -> ti
    ;; caress -> caress
    ;; cats -> cat

    ;; feed -> feed
    ;; agreed -> agree
    ;; disabled -> disable

    ;; matting -> mat
    ;; mating -> mate
    ;; meeting -> meet
    ;; milling -> mill
    ;; messing -> mess

    ;; meetings -> meet

    (define (step1a str)
    (rules
    str
    (λ(p) (replace-ending-if str ->true (car p) (cdr p)))
    ‘((“sses” . “ss”)
    (“ies” . “i”)
    (“ss” . “ss”)
    (“s” . “”))))

    (define (step1b str)
    (let ((ed-ing-stem (or (ends str “ed”) (ends str “ing”))))
    (if ed-ing-stem
    (or (replace-ending-if str m>0 “eed” “ee”)
    (if (has-vowel? ed-ing-stem)
    (or (replace-ending-if str ->true “ated” “ate”)
    (replace-ending-if str ->true “ating” “ate”)
    (replace-ending-if str ->true “bled” “ble”)
    (replace-ending-if str ->true “bling” “ble”)
    (replace-ending-if str ->true “ized” “ize”)
    (replace-ending-if str ->true “izing” “ize”)
    (if (ends-in-double-consonant? ed-ing-stem)
    (if (member (list-ref ed-ing-stem (- (length ed-ing-stem) 1)) ‘(#\l #\s #\z))
    ed-ing-stem
    (reverse (rest (reverse ed-ing-stem))))
    #f)
    (if (and (m=1 ed-ing-stem) (cvc? ed-ing-stem))
    (append ed-ing-stem ‘(#\e))
    ed-ing-stem))
    str))
    str)))

    ;; step1c() turns terminal y to i when there is another vowel in the stem.

    (define (step1c str)
    (or (replace-ending-if str has-vowel? “y” “i”) str))

    ;; step2() maps double suffices to single ones. so -ization ( = -ize plus
    ;; -ation) maps to -ize etc. Call this function only if the string before the
    ;; suffix has at least one non-initial consonant-sequence.

    (define (step2 str)
    (or
    (or-map
    (lambda (end-pattern)
    (replace-ending-if str m>0 (car end-pattern) (cdr end-pattern)))
    ‘((“ational” . “ate”)
    (“tional” . “tion”)
    (“enci” . “ence”)
    (“anci” . “ance”)
    (“izer” . “ize”)
    (“bli” . “ble”)
    ;; -DEPARTURE-
    ;; To match the published algorithm, replace prev line with
    ;; (“abli” . “able”)
    (“alli” . “al”)
    (“entli” . “ent”)
    (“eli” . “e”)
    (“ousli” . “ous”)
    (“ization” . “ize”)
    (“ation” . “ate”)
    (“ator” . “ate”)
    (“alism” . “al”)
    (“iveness” . “ive”)
    (“fulness” . “ful”)
    (“ousness” . “ous”)
    (“aliti” . “al”)
    (“iviti” . “ive”)
    (“biliti” . “ble”)
    (“logi” . “log”) ;; DEPARTURE: the published algorithm does not have this line
    ))
    str))

    ;; step3() deals with -ic-, -full, -ness etc. similar strategy to step2.

    (define (step3 str)
    (or
    (or-map
    (lambda (end-pattern)
    (replace-ending-if str m>0 (car end-pattern) (cdr end-pattern)))
    ‘((“icate” . “ic”)
    (“ative” . “”) ; huh?
    (“alize” . “al”)
    (“iciti” . “ic”)
    (“ical” . “ic”)
    (“ful” . “”) ; huh?
    (“ness” . “”) ; huh?
    ))
    str))

    ;; step4() takes off -ant, -ence etc., in context vcvc.

    (define (step4 str)
    (or
    (or-map
    (lambda (ending)
    (replace-ending-if str m>1 ending “”))
    ‘(“al” “ance” “ence” “er” “ic” “able” “ible” “ant” “ement” “ment” “ent”
    “ou” ;; for -ous
    “ism” “ate” “iti” “ous” “ive” “ize”))
    (replace-ending-if str (λ(stem) (m>1 (append stem ‘(#\s)))) “sion” “s”)
    (replace-ending-if str (λ(stem) (m>1 (append stem ‘(#\t)))) “tion” “t”)
    str))

    ;; step5 removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
    (define (step5a str)
    (let ((stem-e (ends str “e”)))
    (if (and stem-e
    (let ((m (num-consonant-sequences stem-e)))
    (or (1 str)) “ll” “l”) str))

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    (define (puts . stuff)
    (map display stuff)
    (newline)
    (car (reverse stuff)))

    (define (steps str)
    (let* ((zero (puts “step0: ” (string->list str)))
    (one-a (puts “step1a: ” (step1a zero)))
    (one-b (puts “step1b: ” (step1b one-a)))
    (one-c (puts “step1c: ” (step1c one-b)))
    (two (puts “step2: ” (step2 one-c)))
    (three (puts “step3: ” (step3 two)))
    (four (puts “step4: ” (step4 three)))
    (five-a (puts “step5a: ” (step5a four)))
    (done (puts “step5b: ” (step5b five-a)))
    )
    done))

    ;; In stem(p,i,j), p is a char pointer, and the string to be stemmed is from p[i] to p[j]
    ;; inclusive. Typically i is zero and j is the offset to the last character of a string,
    ;; (p[j+1] == ”). The stemmer adjusts the characters p[i] … p[j] and returns the new
    ;; end-point of the string, k. Stemming never increases word length, so i <= k <= j. To
    ;; turn the stemmer into a module, declare 'stem' as extern, and delete the remainder of
    ;; this file.

    (define (stem str)
    (let ((len (string-length str)))
    (if (string
    (step5b
    (step5a
    (step4
    (step3
    (step2
    (step1c
    (step1b
    (step1a
    (string->list str)))))))))))))
    (provide stem)

    )

  2. #lang scheme
    ;;; The above should have come with the documentation:

    ;; The Porter Stemming Algorithm, somewhat mechanically hand translated to Common Lisp by
    ;; Steven M. Haflich smh@franz.com Feb 2002. Subsequently translated to Scheme and made
    ;; wholly inefficient, if more readable, by Gregory Marton in 2009.

    ;; This port has only been tested with PLT Scheme, though there is little PLT-specific code.
    ;; This code is offered in the hope it will be useful, but with no warranty of correctness,
    ;; suitability, usability, or anything else.

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ;; This is the Porter stemming algorithm, as presented ins

    ;; Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
    ;; no. 3, pp 130-137,

    ;; only differing from it at the points maked –DEPARTURE– below.

    ;; See also http://www.tartarus.org/~martin/PorterStemmer

    ;; The algorithm as described in the paper could be exactly replicated
    ;; by adjusting the points of DEPARTURE, but this is barely necessary,
    ;; because (a) the points of DEPARTURE are definitely improvements, and
    ;; (b) no encoding of the Porter stemmer I have seen is anything like
    ;; as exact as this version, even with the points of DEPARTURE!

    ;; Release 1

    ;; Note that only lower case sequences are stemmed. Forcing to lower case
    ;; should be done before stem(…) is called.

    ;; This implementation is not a particularly fast one.
    ;; In particular, there is a common optimization to make the many ends-with
    ;; tests faster by switching on the last or penultimate letter, which I chose
    ;; not to use here for the sake of readability.
    ;; I suspect without proof that the many reversals could be ameliorated by just
    ;; reversing the string in the first place and pre-reversing the patterns, or
    ;; by maintaining a last position in the string vector instead of using a list of
    ;; characters.

    ;;; and the tests, which refer to files that can be found at:
    ;;; http://tartarus.org/~martin/PorterStemmer/

    (require (planet schematics/schemeunit:3:4))
    (require (planet schematics/schemeunit:3/text-ui))
    (require “files.ss”)
    (require “porter-stemmer.ss”)

    (define *the-tests*
    (test-suite
    “stemming”
    (let ((count 0))
    (test-case
    “stemming”
    (for-each-line-in-file
    “porter-test.txt”
    (lambda (line)
    (set! count (+ 1 count)) (when (integer? (/ count 1000)) (display “.”))
    ;;(display line) (newline)
    (let* ((untab (regexp-split #px”\t” (chomp line)))
    (original (first untab))
    (stemmed (second untab))
    (result (stem original)))
    (check-equal? result stemmed original))))))))

    (define num-errors (run-tests *the-tests*))
    (define (test) (exit num-errors))
    (provide test)

  3. A very nice Programming Praxis example, as here presented. I have created a link to it from the Porter Stemmer page.

  4. [...] Praxis porter stemming part1 [...]

  5. kishor parkhe said

    nice example

  6. nsmeef said

    Reblogged this on {blog;}.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 616 other followers

%d bloggers like this: