; pregexp.ss
; http://ds26gte.github.io/pregexp/index.html
; https://github.com/ds26gte/pregexp
; https://github.com/ds26gte/pregexp/blob/master/pregexp.scm
;pregexp.scm
;Portable regular expressions for Scheme
;Dorai Sitaram
;http://www.ccs.neu.edu/~dorai
;dorai AT ccs DOT neu DOT edu
;Oct 2, 1999
(define *pregexp-version* 20050502) ;last change
(define *pregexp-comment-char* #\;)
(define *pregexp-nul-char-int*
;can't assume #\nul maps to 0 because of Scsh
(- (char->integer #\a) 97))
(define *pregexp-return-char*
;can't use #\return because it isn't R5RS
(integer->char
(+ 13 *pregexp-nul-char-int*)))
(define *pregexp-tab-char*
;can't use #\tab because it isn't R5RS
(integer->char
(+ 9 *pregexp-nul-char-int*)))
(define *pregexp-space-sensitive?* #t)
(define pregexp-reverse!
;the useful reverse! isn't R5RS
(lambda (s)
(let loop ((s s) (r '()))
(if (null? s) r
(let ((d (cdr s)))
(set-cdr! s r)
(loop d s))))))
(define pregexp-error
;R5RS won't give me a portable error procedure.
;modify this as needed
(lambda whatever
(display "Error:")
(for-each (lambda (x) (display #\space) (write x))
whatever)
(newline)
(error 'pregexp "pregexp-error")))
(define pregexp-read-pattern
(lambda (s i n)
(if (>= i n)
(list
(list ':or (list ':seq)) i)
(let loop ((branches '()) (i i))
(if (or (>= i n)
(char=? (string-ref s i) #\)))
(list (cons ':or (pregexp-reverse! branches)) i)
(let ((vv (pregexp-read-branch
s
(if (char=? (string-ref s i) #\|) (+ i 1) i) n)))
(loop (cons (car vv) branches) (cadr vv))))))))
(define pregexp-read-branch
(lambda (s i n)
(let loop ((pieces '()) (i i))
(cond ((>= i n)
(list (cons ':seq (pregexp-reverse! pieces)) i))
((let ((c (string-ref s i)))
(or (char=? c #\|)
(char=? c #\))))
(list (cons ':seq (pregexp-reverse! pieces)) i))
(else (let ((vv (pregexp-read-piece s i n)))
(loop (cons (car vv) pieces) (cadr vv))))))))
(define pregexp-read-piece
(lambda (s i n)
(let ((c (string-ref s i)))
(case c
((#\^) (list ':bos (+ i 1)))
((#\$) (list ':eos (+ i 1)))
((#\.) (pregexp-wrap-quantifier-if-any
(list ':any (+ i 1)) s n))
((#\[) (let ((i+1 (+ i 1)))
(pregexp-wrap-quantifier-if-any
(case (and (
(lambda (num-i)
(list (list ':backref (car num-i)) (cadr num-i))))
((pregexp-read-escaped-char s i n) =>
(lambda (char-i)
(list (car char-i) (cadr char-i))))
(else (pregexp-error 'pregexp-read-piece 'backslash)))
s n))
(else
(if (or *pregexp-space-sensitive?*
(and (not (char-whitespace? c))
(not (char=? c *pregexp-comment-char*))))
(pregexp-wrap-quantifier-if-any
(list c (+ i 1)) s n)
(let loop ((i i) (in-comment? #f))
(if (>= i n) (list ':empty i)
(let ((c (string-ref s i)))
(cond (in-comment?
(loop (+ i 1)
(not (char=? c #\newline))))
((char-whitespace? c)
(loop (+ i 1) #f))
((char=? c *pregexp-comment-char*)
(loop (+ i 1) #t))
(else (list ':empty i))))))))))))
(define pregexp-read-escaped-number
(lambda (s i n)
; s[i] = \
(and (= i n)
(list (string->number
(list->string (pregexp-reverse! r))) i)
(let ((c (string-ref s i)))
(if (char-numeric? c)
(loop (+ i 1) (cons c r))
(list (string->number
(list->string (pregexp-reverse! r)))
i))))))))))
(define pregexp-read-escaped-char
(lambda (s i n)
; s[i] = \
(and (= i n)
(pregexp-error 'pregexp-read-posix-char-class)
(let ((c (string-ref s i)))
(cond ((char=? c #\^)
(set! neg? #t)
(loop (+ i 1) r))
((char-alphabetic? c)
(loop (+ i 1) (cons c r)))
((char=? c #\:)
(if (or (>= (+ i 1) n)
(not (char=? (string-ref s (+ i 1)) #\])))
(pregexp-error 'pregexp-read-posix-char-class)
(let ((posix-class
(string->symbol
(list->string (pregexp-reverse! r)))))
(list (if neg? (list ':neg-char posix-class)
posix-class)
(+ i 2)))))
(else
(pregexp-error
'pregexp-read-posix-char-class)))))))))
(define pregexp-read-cluster-type
(lambda (s i n)
; s[i-1] = left-paren
(let ((c (string-ref s i)))
(case c
((#\?)
(let ((i (+ i 1)))
(case (string-ref s i)
((#\:) (list '() (+ i 1)))
((#\=) (list '(:lookahead) (+ i 1)))
((#\!) (list '(:neg-lookahead) (+ i 1)))
((#\>) (list '(:no-backtrack) (+ i 1)))
((#\= i n)
(begin (set-car! (cdr new-re) #f)
(set-car! (cdr new-vv) i))
(let ((c (string-ref s i)))
(cond ((and (char-whitespace? c)
(not *pregexp-space-sensitive?*))
(loop (+ i 1)))
((char=? c #\?)
(set-car! (cdr new-re) #t)
(set-car! (cdr new-vv) (+ i 1)))
(else (set-car! (cdr new-re) #f)
(set-car! (cdr new-vv) i))))))
new-vv))
(else vv)))))))))
;
(define pregexp-read-nums
(lambda (s i n)
; s[i-1] = {
; returns (p q k) where s[k] = }
(let loop ((p '()) (q '()) (k i) (reading 1))
(if (>= k n) (pregexp-error 'pregexp-read-nums))
(let ((c (string-ref s k)))
(cond ((char-numeric? c)
(if (= reading 1)
(loop (cons c p) q (+ k 1) 1)
(loop p (cons c q) (+ k 1) 2)))
((and (char-whitespace? c) (not *pregexp-space-sensitive?*))
(loop p q (+ k 1) reading))
((and (char=? c #\,) (= reading 1))
(loop p q (+ k 1) 2))
((char=? c #\})
(let ((p (string->number
(list->string (pregexp-reverse! p))))
(q (string->number
(list->string (pregexp-reverse! q)))))
(cond ((and (not p) (= reading 1)) (list 0 #f k))
((= reading 1) (list p p k))
(else (list p q k)))))
(else #f))))))
(define pregexp-invert-char-list
(lambda (vv)
(set-car! (car vv) ':none-of-chars)
vv))
;
(define pregexp-read-char-list
(lambda (s i n)
(let loop ((r '()) (i i))
(if (>= i n)
(pregexp-error 'pregexp-read-char-list
'character-class-ended-too-soon)
(let ((c (string-ref s i)))
(case c
((#\]) (if (null? r)
(loop (cons c r) (+ i 1))
(list (cons ':one-of-chars (pregexp-reverse! r))
(+ i 1))))
((#\\ )
(let ((char-i (pregexp-read-escaped-char s i n)))
(if char-i (loop (cons (car char-i) r) (cadr char-i))
(pregexp-error 'pregexp-read-char-list 'backslash))))
((#\-) (if (or (null? r)
(let ((i+1 (+ i 1)))
(and ( n1 n) (fk)
(let loop ((j 0) (k i))
(cond ((>= j n1) (sk k))
((>= k n) (fk))
((char=? (string-ref s1 j) (string-ref s k))
(loop (+ j 1) (+ k 1)))
(else (fk))))))))
(define pregexp-char-word?
(lambda (c)
;too restrictive for Scheme but this
;is what \w is in most regexp notations
(or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\_))))
(define pregexp-at-word-boundary?
(lambda (s i n)
(or (= i 0) (>= i n)
(let ((c/i (string-ref s i))
(c/i-1 (string-ref s (- i 1))))
(let ((c/i/w? (pregexp-check-if-in-char-class?
c/i ':word))
(c/i-1/w? (pregexp-check-if-in-char-class?
c/i-1 ':word)))
(or (and c/i/w? (not c/i-1/w?))
(and (not c/i/w?) c/i-1/w?)))))))
(define pregexp-check-if-in-char-class?
(lambda (c char-class)
(case char-class
((:any) (not (char=? c #\newline)))
;
((:alnum) (or (char-alphabetic? c) (char-numeric? c)))
((:alpha) (char-alphabetic? c))
((:ascii) (integer c) 128))
((:blank) (or (char=? c #\space) (char=? c *pregexp-tab-char*)))
((:cntrl) (integer c) 32))
((:digit) (char-numeric? c))
((:graph) (and (>= (char->integer c) 32)
(not (char-whitespace? c))))
((:lower) (char-lower-case? c))
((:print) (>= (char->integer c) 32))
((:punct) (and (>= (char->integer c) 32)
(not (char-whitespace? c))
(not (char-alphabetic? c))
(not (char-numeric? c))))
((:space) (char-whitespace? c))
((:upper) (char-upper-case? c))
((:word) (or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\_)))
((:xdigit) (or (char-numeric? c)
(char-ci=? c #\a) (char-ci=? c #\b)
(char-ci=? c #\c) (char-ci=? c #\d)
(char-ci=? c #\e) (char-ci=? c #\f)))
(else (pregexp-error 'pregexp-check-if-in-char-class?)))))
(define pregexp-list-ref
(lambda (s i)
;like list-ref but returns #f if index is
;out of bounds
(let loop ((s s) (k 0))
(cond ((null? s) #f)
((= k i) (car s))
(else (loop (cdr s) (+ k 1)))))))
;re is a compiled regexp. It's a list that can't be
;nil. pregexp-match-positions-aux returns a 2-elt list whose
;car is the string-index following the matched
;portion and whose cadr contains the submatches.
;The proc returns false if there's no match.
;Am spelling loop- as loup- because these shouldn't
;be translated into CL loops by scm2cl (although
;they are tail-recursive in Scheme)
(define pregexp-make-backref-list
(lambda (re)
(let sub ((re re))
(if (pair? re)
(let ((car-re (car re))
(sub-cdr-re (sub (cdr re))))
(if (eqv? car-re ':sub)
(cons (cons re #f) sub-cdr-re)
(append (sub car-re) sub-cdr-re)))
'()))))
(define pregexp-match-positions-aux
(lambda (re s sn start n i)
(let ((identity (lambda (x) x))
(backrefs (pregexp-make-backref-list re))
(case-sensitive? #t))
(let sub ((re re) (i i) (sk identity) (fk (lambda () #f)))
;(printf "sub ~s ~s\n" i re)
(cond ((eqv? re ':bos)
;(if (= i 0) (sk i) (fk))
(if (= i start) (sk i) (fk))
)
((eqv? re ':eos)
;(if (>= i sn) (sk i) (fk))
(if (>= i n) (sk i) (fk))
)
((eqv? re ':empty)
(sk i))
((eqv? re ':wbdry)
(if (pregexp-at-word-boundary? s i n)
(sk i)
(fk)))
((eqv? re ':not-wbdry)
(if (pregexp-at-word-boundary? s i n)
(fk)
(sk i)))
((and (char? re) (< i n))
;(printf "bingo\n")
(if ((if case-sensitive? char=? char-ci=?)
(string-ref s i) re)
(sk (+ i 1)) (fk)))
((and (not (pair? re)) (< i n))
(if (pregexp-check-if-in-char-class?
(string-ref s i) re)
(sk (+ i 1)) (fk)))
((and (pair? re) (eqv? (car re) ':char-range) (< i n))
(let ((c (string-ref s i)))
(if (let ((c< (if case-sensitive? char<=? char-ci<=?)))
(and (c= i n) (fk)
(let loup-one-of-chars ((chars (cdr re)))
(if (null? chars) (fk)
(sub (car chars) i sk
(lambda ()
(loup-one-of-chars (cdr chars))))))))
((:neg-char)
(if (>= i n) (fk)
(sub (cadr re) i
(lambda (i1) (fk))
(lambda () (sk (+ i 1))))))
((:seq)
(let loup-seq ((res (cdr re)) (i i))
(if (null? res) (sk i )
(sub (car res) i
(lambda (i1 )
(loup-seq (cdr res) i1 ))
fk))))
((:or)
(let loup-or ((res (cdr re)))
(if (null? res) (fk)
(sub (car res) i
(lambda (i1 )
(or (sk i1 )
(loup-or (cdr res))))
(lambda () (loup-or (cdr res)))))))
((:backref)
(let* ((c (pregexp-list-ref backrefs (cadr re)))
(backref
(cond (c => cdr)
(else
(pregexp-error
'pregexp-match-positions-aux
'non-existent-backref re)
#f))))
(if backref
(pregexp-string-match
(substring s (car backref) (cdr backref))
s i n (lambda (i) (sk i)) fk)
(sk i))))
((:sub)
(sub (cadr re) i
(lambda (i1)
(set-cdr! (assv re backrefs) (cons i i1))
(sk i1)) fk))
((:lookahead)
(let ((found-it?
(sub (cadr re) i
identity (lambda () #f))))
(if found-it? (sk i) (fk))))
((:neg-lookahead)
(let ((found-it?
(sub (cadr re) i
identity (lambda () #f))))
(if found-it? (fk) (sk i))))
((:lookbehind)
(let ((n-actual n) (sn-actual sn))
(set! n i) (set! sn i)
(let ((found-it?
(sub (list ':seq '(:between #f 0 #f :any)
(cadr re) ':eos) 0
identity (lambda () #f))))
(set! n n-actual) (set! sn sn-actual)
(if found-it? (sk i) (fk)))))
((:neg-lookbehind)
(let ((n-actual n) (sn-actual sn))
(set! n i) (set! sn i)
(let ((found-it?
(sub (list ':seq '(:between #f 0 #f :any)
(cadr re) ':eos) 0
identity (lambda () #f))))
(set! n n-actual) (set! sn sn-actual)
(if found-it? (fk) (sk i)))))
((:no-backtrack)
(let ((found-it? (sub (cadr re) i
identity (lambda () #f))))
(if found-it?
(sk found-it?)
(fk))))
((:case-sensitive :case-insensitive)
(let ((old case-sensitive?))
(set! case-sensitive?
(eqv? (car re) ':case-sensitive))
(sub (cadr re) i
(lambda (i1)
(set! case-sensitive? old)
(sk i1))
(lambda ()
(set! case-sensitive? old)
(fk)))))
((:between)
(let* ((maximal? (not (cadr re)))
(p (caddr re))
(q (cadddr re))
(could-loop-infinitely? (and maximal? (not q)))
(re (car (cddddr re))))
(let loup-p ((k 0) (i i) )
(if (= k q)) (fk)
(if maximal?
(sub re i
(lambda (i1)
(if (and could-loop-infinitely?
(= i1 i))
(pregexp-error
'pregexp-match-positions-aux
'greedy-quantifier-operand-could-be-empty))
(or (loup-q (+ k 1) i1)
(fk)))
fk)
(or (fk)
(sub re i
(lambda (i1)
(loup-q (+ k 1) i1))
fk)))))))))))
(else (pregexp-error 'pregexp-match-positions-aux))))
((>= i n) (fk))
(else (pregexp-error 'pregexp-match-positions-aux))))
;(printf "done\n")
(let ((backrefs (map cdr backrefs)))
(and (car backrefs) backrefs)))))
(define pregexp-replace-aux
(lambda (str ins n backrefs)
(let loop ((i 0) (r ""))
(if (>= i n) r
(let ((c (string-ref ins i)))
(if (char=? c #\\ )
(let* ((br-i (pregexp-read-escaped-number ins i n))
(br (if br-i (car br-i)
(if (char=? (string-ref ins (+ i 1)) #\&) 0
#f)))
(i (if br-i (cadr br-i)
(if br (+ i 2)
(+ i 1)))))
(if (not br)
(let ((c2 (string-ref ins i)))
(loop (+ i 1)
(if (char=? c2 #\$) r
(string-append r (string c2)))))
(loop i
(let ((backref (pregexp-list-ref backrefs br)))
(if backref
(string-append r
(substring str (car backref)
(cdr backref)))
r)))))
(loop (+ i 1) (string-append r (string c)))))))))
(define pregexp
(lambda (s)
(set! *pregexp-space-sensitive?* #t) ;in case it got corrupted
(list ':sub (car (pregexp-read-pattern s 0 (string-length s))))))
(define pregexp-match-positions
(lambda (pat str . opt-args)
(cond ((string? pat) (set! pat (pregexp pat)))
((pair? pat) #t)
(else (pregexp-error 'pregexp-match-positions
'pattern-must-be-compiled-or-string-regexp
pat)))
(let* ((str-len (string-length str))
(start (if (null? opt-args) 0
(let ((start (car opt-args)))
(set! opt-args (cdr opt-args))
start)))
(end (if (null? opt-args) str-len
(car opt-args))))
(let loop ((i start))
(and (= i n) (pregexp-reverse! r))
((pregexp-match-positions pat str i n)
=>
(lambda (y)
(let ((jk (car y)))
(let ((j (car jk)) (k (cdr jk)))
;(printf "j = ~a; k = ~a; i = ~a~n" j k i)
(cond ((= j k)
;(printf "producing ~s~n"
; (substring str i (+ j 1)))
(loop (+ k 1)
(cons (substring str i (+ j 1)) r) #t))
((and (= j i) picked-up-one-undelimited-char?)
(loop k r #f))
(else
;(printf "producing ~s~n" (substring str i j))
(loop k (cons (substring str i j) r) #f)))))))
(else (loop n (cons (substring str i n) r) #f)))))))
(define pregexp-replace
(lambda (pat str ins)
(let* ((n (string-length str))
(pp (pregexp-match-positions pat str 0 n)))
(if (not pp) str
(let ((ins-len (string-length ins))
(m-i (caar pp))
(m-n (cdar pp)))
(string-append
(substring str 0 m-i)
(pregexp-replace-aux str ins ins-len pp)
(substring str m-n n)))))))
(define pregexp-replace*
(lambda (pat str ins)
;return str with every occurrence of pat
;replaced by ins
(let ((pat (if (string? pat) (pregexp pat) pat))
(n (string-length str))
(ins-len (string-length ins)))
(let loop ((i 0) (r ""))
;i = index in str to start replacing from
;r = already calculated prefix of answer
(if (>= i n) r
(let ((pp (pregexp-match-positions pat str i n)))
(if (not pp)
(if (= i 0)
;this implies pat didn't match str at
;all, so let's return original str
str
;else: all matches already found and
;replaced in r, so let's just
;append the rest of str
(string-append
r (substring str i n)))
(loop (cdar pp)
(string-append
r
(substring str i (caar pp))
(pregexp-replace-aux str ins ins-len pp))))))))))
(define pregexp-quote
(lambda (s)
(let loop ((i (- (string-length s) 1)) (r '()))
(if (string r)
(loop (- i 1)
(let ((c (string-ref s i)))
(if (memv c '(#\\ #\. #\? #\* #\+ #\| #\^ #\$
#\[ #\] #\{ #\} #\( #\)))
(cons #\\ (cons c r))
(cons c r))))))))
;(trace pregexp-read-pattern pregexp-read-char-list pregexp-read-piece)
;eof
Like this:
Like Loading...
Related
Pages: 1 2 3 4 5 6 7 8 9 10
My apologies; the formatting got really messed up. I’ll work on fixing through the day today.
I think the pairing heap implementation will not scale to beyond stack limits since it uses non-tail recursion. An easy fix would be to replace the recursive calls in pq-merge-pairs with a fold of pq-merge on pre-paired arguments.
[…] There are several pattern-matching libraries available for Scheme, but they are rather heavy (the one I use, by Friedman, Hilsdale and Dybvig, is over six hundred lines of code). Our Standard Prelude has a […]