Regular Expressions, Part 1
September 15, 2009
We represent a regular expression in its intermediate form as a list of rx-elements:
'(bol)— beginning of line'(eol)— end of line'(any)— any (unspecified) character'(litc)— the literal character c'(cclc . . .)— the character class containing the literal characters c . . .'(nclc . . .)— the negated character class not containing any of the literal characters c . . .'(clox)— a closure, consisting of any number (zero or more) of the enclosed rx-element, which may be a literal character, an unspecified character, or a character class
The examples from the prior page are encoded as shown below:
[0-9][0-9]* — ((ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (clo ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))^..*$ — ((bol) (any) (clo any) (eol))hello — ((lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o))^ *hello *$ — ((bol) (clo lit #\space) (lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o) (clo lit #\space) (eol))^[^x].*[0-9] *x$ — ((bol) (ncl #\x) (clo any) (ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (clo lit #\space) (lit x) (eol))
The parser loops through the regular expression, using the list-match macro from the Standard Prelude to identify rx-elements:
(define (make-rx regexp)
(let loop ((first? #t) (rs (string->list regexp)) (zs '()))
(list-match rs
(() (reverse zs))
((#$) (reverse (cons '(eol) zs)))
((#\^ . rest) first? (loop #f rest (cons '(bol) zs)))
((#\\ c \#* . rest)
(loop #f rest (cons (list 'clo 'lit
(case c ((#\n) #\newline) ((#\t) #\tab) (else c))) zs)))
((#\\ c . rest)
(loop #f rest (cons (list 'lit
(case c ((#\n) #\newline) ((#\t) #\tab) (else c))) zs)))
((#\[ #\^ . rest)
(list-match (get-class rest)
((class) (reverse (cons (cons 'ncl class) zs)))
((class rest)
(if (and (pair? rest) (char=? (car rest) #\*))
(loop #f (cdr rest) (cons (cons 'clo (cons 'ncl class)) zs))
(loop #f rest (cons (cons 'ncl class) zs))))))
((#\[ . rest)
(list-match (get-class rest)
((class) (reverse (cons (cons 'ccl class) zs)))
((class rest)
(if (and (pair? rest) (char=? (car rest) #\*))
(loop #f (cdr rest) (cons (cons 'clo (cons 'ccl class)) zs))
(loop #f rest (cons (cons 'ccl class) zs))))))
((#\. #\* . rest) (loop #f rest (cons (list 'clo 'any) zs)))
((#\. . rest) (loop #f rest (cons '(any) zs)))
((c #\* . rest) (loop #f rest (cons (list 'clo 'lit c) zs)))
((c . rest) (loop #f rest (cons (list 'lit c) zs)))
(else (error 'make-rx "unrecognized regular expression")))))
First? is #t initially then #f forever; it is used to distinguish the caret that indicates a beginning-of-line rx-element from any other caret. Most of the other pieces are unremarkable, except the code for the two kinds of character classes; both call get-class to extract the characters in the class and point to the next character after the class. Closures are checked explicitly after the literal character, unspecified character, and the two character classes.
Get-class returns two values in a list, the list of characters that form the class, and the remaining regular-expression text after the class; it uses range from the Standard Prelude:
(define (get-class class)
(define (char-range a b)
(map integer->char
(range (char->integer a) (+ (char->integer b) 1))))
(let loop ((cs class) (zs '()))
(list-match cs
((#\] . rest) (pair? zs) (list zs rest))
((#\] . rest) (loop rest (cons #\] zs)))
((#\ c . rest) (loop rest (cons c zs)))
((a #\- b . rest)
(or (and (char-numeric? a) (char-numeric? b) (char<? a b))
(and (char-upper-case? a) (char-upper-case? b) (char<? a b))
(and (char-lower-case? a) (char-lower-case? b) (char<? a b)))
(loop rest (append (char-range a b) zs)))
((c . rest) (loop rest (cons c zs)))
(else (error 'get-class "unrecognized class element")))))
Here is the parser in action on the sample regular expressions shown above:
> (make-rx "[0-9][0-9]*")
((ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
(clo ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))
> (make-rx "^..*$")
((bol) (any) (clo any) (eol))
> (make-rx "hello")
((lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o))
> (make-rx "^ *hello *$")
((bol) (clo lit #\space) (lit #\h) (lit #\e) (lit #\l)
(lit #\l) (lit #\o) (clo lit #\space) (eol))
> (make-rx "^[^x].*[0-9] *x$")
((bol) (ncl #\x) (clo any) (ccl #\0 #\1 #\2 #\3 #\4 #\5
#\6 #\7 #\8 #\9) (clo lit #\space) (lit x) (eol))
You can run the parser at http://programmingpraxis.codepad.org/A4XmTLTu.
[…] Praxis – Regular Expressions, Part 1 By Remco Niemeijer In today’s Programming Praxis problem our task is to write a parser for simple regular expressions. Since […]
My Haskell solution (see http://bonsaicode.wordpress.com/2009/09/15/programming-praxis-regular-expressions-part-1/ for a version with comments):
import Control.Applicative ((<$>), (*>), (<*), (<*>)) import Text.Parsec import Text.Parsec.String data Elem = Lit Char | Any | Set Bool [Char] deriving Show data Chunk = Elem Elem | BoL | EoL | Star Elem deriving Show regex :: Parser [Chunk] regex = (++) <$> bol <*> many chunk where bol = option [] (const [BoL] <$> char '^') chunk = choice [Star <$> try (element <* char '*'), const EoL <$> try (char '$' <* eof), Elem <$> element] element = choice [const Any <$> char '.', Set False . expandSet <$> set "[^", Set True . expandSet <$> set "[", Lit <$> anyChar] set s = try (string s *> many1 (noneOf "]") <* char ']') expandSet (a:'-':b:xs) = [a..b] ++ expandSet xs expandSet (x:xs) = x : expandSet xs expandSet _ = [] parseRegex :: String -> Either ParseError [Chunk] parseRegex = parse regex ""Updated code because I overlooked the bit of text between the two bulleted lists:
import Control.Applicative ((<$>), (*>), (<*), (<*>)) import Data.Char import Text.Parsec import Text.Parsec.String data Elem = Lit Char | Esc Char | Any | Set Bool [Elem] deriving Show data Chunk = Elem Elem | BoL | EoL | Star Elem deriving Show regex :: Parser [Chunk] regex = (++) <$> bol <*> many chunk where bol = option [] (const [BoL] <$> char '^') chunk = choice [Star <$> try (element <* char '*'), const EoL <$> try (char '$' <* eof), Elem <$> element] element = choice [esc <$> try (char '\\' *> anyChar), const Any <$> char '.', Set False . expandSet <$> set "[^", Set True . expandSet <$> set "[", Lit <$> noneOf "]"] esc c = if elem c "nt" then Esc c else Lit c set s = try (string s *> many1 element <* char ']') expandSet (Lit a:Lit '-':Lit b:xs) | validRange a b = map Lit [a..b] ++ expandSet xs expandSet (x:xs) = x : expandSet xs expandSet _ = [] validRange a b = b > a && ((isLower a && isLower b) || (isUpper a && isUpper b) || (isDigit a && isDigit b)) parseRegex :: String -> Either ParseError [Chunk] parseRegex = parse regex ""In your listing here, the “#\\”s (corresponding to lines 55 and 58 of the codepad version) have been mangled into “#\”.
Jamie: Thanks for pointing that out. It’s fixed now.
I showed my compilers class how to build lexers using the derivative of regular expressions:
http://matt.might.net/articles/implementation-of-regular-expression-matching-in-scheme-with-derivatives/
The implementation for regular languages is remarkably compact.
Brzozowski’s regular-expression matcher based on derivatives was on my to-do list. Now I’ll have to take it off.
A scratchy and long implementation in C
#include <stdio.h> #include <ctype.h> static int copy_code(int code, char *s, int i); static int copy_character(int c, char *s, int i); static int copy_special_characters(int code, char *s, int i); static char codes[][4] = {"bol", "eol", "any", "lit", "ccl", "ncl", "clo"}; enum{ BOL = 0, EOL, ANY, LIT, CCL, NCL, CLO}; static char special_characters[][8] = {"space", "newline", "tab"}; enum{SPACE, NEWLINE, TAB}; enum{VALID_REGEXP = 0, INVALID_REGEXP}; int encode_regular_expression(char *regexp, char *encode) { /* i is the index of current character in regexp ang j is the current character being output */ int i, j, escaped; char c; i = j = escaped = 0; encode[j++] = '('; /* parse through the regular expression */ for(;;) { /* read the character in regexp in c */ c = regexp[i++]; if(c == '\0') break; if(c == '\\') /* escape character */ { escaped = 1; continue; } encode[j++] = '('; if(!escaped && c == '^' && i == 1) /* begining of line character '^' is not escaped and located at the start of regexp */ j = copy_code(BOL, encode, j); else if(!escaped && c == '$' && regexp[i] == '\0') /* end of line character '$' is not escaped and located at the end of regexp */ j = copy_code(EOL, encode, j); else if(!escaped && c == '.') /* any character */ { if(regexp[i] == '*') { j = copy_code(CLO, encode, j); encode[j++] = ' '; i++; } j = copy_code(ANY, encode, j); } else if(!escaped && c == '[') /* begining of character class */ { int k; /* check for closure and validity of character class */ for(k = i; regexp[k] != '\0' && regexp[k] != ']'; k++); /* if no ending bracket, return invalid regexp */ if(regexp[k] == '\0') return INVALID_REGEXP; /* check for closure */ if(regexp[k+1] == '*') { j = copy_code(CLO, encode, j); encode[j++] = ' '; } /* check for negated character */ if(regexp[i] == '^') { j = copy_code(NCL, encode, j); i++; } else j = copy_code(CCL, encode, j); while(regexp[i] != '\0' && regexp[i] != ']') { char start, end; if(regexp[i+1] != '-') { j = copy_character(regexp[i], encode, j); i++; continue; } if(!isalnum(regexp[i]) && !isalnum(regexp[i+2])) return INVALID_REGEXP; if(isalpha(regexp[i]) && !isalpha(regexp[i+2])) return INVALID_REGEXP; if(isdigit(regexp[i]) && !isdigit(regexp[i+2])) return INVALID_REGEXP; if(regexp[i] > regexp[i+2]) return INVALID_REGEXP; /*now expand the given range */ for(start = regexp[i], end = regexp[i+2]; start <= end; start++) j = copy_character(start, encode, j); i = i + 3; } i++; if(regexp[i] == '*') i++; } else /* literal character */ { if(regexp[i] == '*') { j = copy_code(CLO, encode, j); encode[j++] = ' '; i++; } j = copy_code(LIT, encode, j); /* handle special characters */ if(escaped && c == 'n') j = copy_special_characters(NEWLINE, encode, j); else if(escaped && c == 't') j = copy_special_characters(TAB, encode, j); else if(c == ' ') j = copy_special_characters(SPACE, encode, j); else j = copy_character(c, encode, j); } encode[j++] = ')'; encode[j++] = ' '; } encode[j-1] = ')'; encode[j] = '\0'; return VALID_REGEXP; } static int copy_code(int code, char *s, int i) { int j; for(j = 0; codes[code][j] != '\0'; j++) s[i++] = codes[code][j]; return i; } static int copy_special_characters(int code, char *s, int i) { int j; s[i++] = ' '; s[i++] = '#'; s[i++] = '\\'; for(j = 0; special_characters[code][j] != '\0'; j++) s[i++] = special_characters[code][j]; return i; } static int copy_character(int c, char *s, int i) { s[i++] = ' '; s[i++] = '#'; s[i++] = '\\'; s[i++] = c; return i; }