Regular Expressions, Part 1

September 15, 2009

We represent a regular expression in its intermediate form as a list of rx-elements:

  • '(bol) — beginning of line
  • '(eol) — end of line
  • '(any) — any (unspecified) character
  • '(lit c) — the literal character c
  • '(ccl c . . .) — the character class containing the literal characters c . . .
  • '(ncl c . . .) — the negated character class not containing any of the literal characters c . . .
  • '(clo x) — a closure, consisting of any number (zero or more) of the enclosed rx-element, which may be a literal character, an unspecified character, or a character class

The examples from the prior page are encoded as shown below:

  • [0-9][0-9]* — ((ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (clo ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))
  • ^..*$ — ((bol) (any) (clo any) (eol))
  • hello — ((lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o))
  • ^ *hello *$ — ((bol) (clo lit #\space) (lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o) (clo lit #\space) (eol))
  • ^[^x].*[0-9] *x$ — ((bol) (ncl #\x) (clo any) (ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (clo lit #\space) (lit x) (eol))

The parser loops through the regular expression, using the list-match macro from the Standard Prelude to identify rx-elements:

(define (make-rx regexp)
  (let loop ((first? #t) (rs (string->list regexp)) (zs '()))
    (list-match rs
      (() (reverse zs))
      ((#$) (reverse (cons '(eol) zs)))
      ((#\^ . rest) first? (loop #f rest (cons '(bol) zs)))
      ((#\\ c \#* . rest)
        (loop #f rest (cons (list 'clo 'lit
          (case c ((#\n) #\newline) ((#\t) #\tab) (else c))) zs)))
      ((#\\ c . rest)
        (loop #f rest (cons (list 'lit
          (case c ((#\n) #\newline) ((#\t) #\tab) (else c))) zs)))
      ((#\[ #\^ . rest)
        (list-match (get-class rest)
          ((class) (reverse (cons (cons 'ncl class) zs)))
          ((class rest)
            (if (and (pair? rest) (char=? (car rest) #\*))
                (loop #f (cdr rest) (cons (cons 'clo (cons 'ncl class)) zs))
                (loop #f rest (cons (cons 'ncl class) zs))))))
      ((#\[ . rest)
        (list-match (get-class rest)
          ((class) (reverse (cons (cons 'ccl class) zs)))
          ((class rest)
            (if (and (pair? rest) (char=? (car rest) #\*))
                (loop #f (cdr rest) (cons (cons 'clo (cons 'ccl class)) zs))
                (loop #f rest (cons (cons 'ccl class) zs))))))
      ((#\. #\* . rest) (loop #f rest (cons (list 'clo 'any) zs)))
      ((#\. . rest) (loop #f rest (cons '(any) zs)))
      ((c #\* . rest) (loop #f rest (cons (list 'clo 'lit c) zs)))
      ((c . rest) (loop #f rest (cons (list 'lit c) zs)))
      (else (error 'make-rx "unrecognized regular expression")))))

First? is #t initially then #f forever; it is used to distinguish the caret that indicates a beginning-of-line rx-element from any other caret. Most of the other pieces are unremarkable, except the code for the two kinds of character classes; both call get-class to extract the characters in the class and point to the next character after the class. Closures are checked explicitly after the literal character, unspecified character, and the two character classes.

Get-class returns two values in a list, the list of characters that form the class, and the remaining regular-expression text after the class; it uses range from the Standard Prelude:

(define (get-class class)
  (define (char-range a b)
    (map integer->char
      (range (char->integer a) (+ (char->integer b) 1))))
  (let loop ((cs class) (zs '()))
    (list-match cs
      ((#\] . rest) (pair? zs) (list zs rest))
      ((#\] . rest) (loop rest (cons #\] zs)))
      ((#\ c . rest) (loop rest (cons c zs)))
      ((a #\- b . rest)
        (or (and (char-numeric? a) (char-numeric? b) (char<? a b))
            (and (char-upper-case? a) (char-upper-case? b) (char<? a b))
            (and (char-lower-case? a) (char-lower-case? b) (char<? a b)))
        (loop rest (append (char-range a b) zs)))
      ((c . rest) (loop rest (cons c zs)))
      (else (error 'get-class "unrecognized class element")))))

Here is the parser in action on the sample regular expressions shown above:

> (make-rx "[0-9][0-9]*")
((ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
  (clo ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))
> (make-rx "^..*$")
((bol) (any) (clo any) (eol))
> (make-rx "hello")
((lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o))
> (make-rx "^ *hello *$")
((bol) (clo lit #\space) (lit #\h) (lit #\e) (lit #\l)
  (lit #\l) (lit #\o) (clo lit #\space) (eol))
> (make-rx "^[^x].*[0-9] *x$")
((bol) (ncl #\x) (clo any) (ccl #\0 #\1 #\2 #\3 #\4 #\5
  #\6 #\7 #\8 #\9) (clo lit #\space) (lit x) (eol))

You can run the parser at http://programmingpraxis.codepad.org/A4XmTLTu.

Advertisement

Pages: 1 2

8 Responses to “Regular Expressions, Part 1”

  1. […] Praxis – Regular Expressions, Part 1 By Remco Niemeijer In today’s Programming Praxis problem our task is to write a parser for simple regular expressions. Since […]

  2. Remco Niemeijer said

    My Haskell solution (see http://bonsaicode.wordpress.com/2009/09/15/programming-praxis-regular-expressions-part-1/ for a version with comments):

    import Control.Applicative ((<$>), (*>), (<*), (<*>))
    import Text.Parsec
    import Text.Parsec.String
    
    data Elem = Lit Char | Any | Set Bool [Char] deriving Show
    data Chunk = Elem Elem | BoL | EoL | Star Elem deriving Show
    
    regex :: Parser [Chunk]
    regex = (++) <$> bol <*> many chunk where
        bol = option [] (const [BoL] <$> char '^')
        chunk = choice [Star <$> try (element <* char '*'),
                        const EoL <$> try (char '$' <* eof),
                        Elem <$> element]
        element = choice [const Any <$> char '.',
                          Set False . expandSet <$> set "[^",
                          Set True . expandSet <$> set "[",
                          Lit <$> anyChar]
        set s = try (string s *> many1 (noneOf "]") <* char '&#93;')
        expandSet (a:'-':b:xs) = &#91;a..b&#93; ++ expandSet xs
        expandSet (x:xs) = x : expandSet xs
        expandSet _ = &#91;&#93;
    
    parseRegex :: String -> Either ParseError [Chunk]
    parseRegex = parse regex ""
    
  3. Remco Niemeijer said

    Updated code because I overlooked the bit of text between the two bulleted lists:

    import Control.Applicative ((<$>), (*>), (<*), (<*>))
    import Data.Char
    import Text.Parsec
    import Text.Parsec.String
    
    data Elem = Lit Char | Esc Char | Any | Set Bool [Elem] deriving Show
    data Chunk = Elem Elem | BoL | EoL | Star Elem deriving Show
    
    regex :: Parser [Chunk]
    regex = (++) <$> bol <*> many chunk where
        bol = option [] (const [BoL] <$> char '^')
        chunk = choice [Star <$> try (element <* char '*'),
                        const EoL <$> try (char '$' <* eof),
                        Elem <$> element]
        element = choice [esc <$> try (char '\\' *> anyChar),
                          const Any <$> char '.',
                          Set False . expandSet <$> set "[^",
                          Set True . expandSet <$> set "[",
                          Lit <$> noneOf "]"]
        esc c = if elem c "nt" then Esc c else Lit c
        set s = try (string s *> many1 element <* char '&#93;')
        expandSet (Lit a:Lit '-':Lit b:xs)
            | validRange a b = map Lit &#91;a..b&#93; ++ expandSet xs
        expandSet (x:xs) = x : expandSet xs
        expandSet _ = &#91;&#93;
        validRange a b = b > a && ((isLower a && isLower b) ||
                                   (isUpper a && isUpper b) ||
                                   (isDigit a && isDigit b))
        
    parseRegex :: String -> Either ParseError [Chunk]
    parseRegex = parse regex ""
    
  4. Jamie Hope said

    In your listing here, the “#\\”s (corresponding to lines 55 and 58 of the codepad version) have been mangled into “#\”.

  5. programmingpraxis said

    Jamie: Thanks for pointing that out. It’s fixed now.

  6. Matt Might said

    I showed my compilers class how to build lexers using the derivative of regular expressions:

    http://matt.might.net/articles/implementation-of-regular-expression-matching-in-scheme-with-derivatives/

    The implementation for regular languages is remarkably compact.

  7. programmingpraxis said

    Brzozowski’s regular-expression matcher based on derivatives was on my to-do list. Now I’ll have to take it off.

  8. Vikas Tandi said

    A scratchy and long implementation in C

    #include <stdio.h>
    #include <ctype.h>
    
    static int copy_code(int code, char *s, int i);
    static int copy_character(int c, char *s, int i);
    static int copy_special_characters(int code, char *s, int i);
    
    static char codes[][4] = {"bol", "eol", "any", "lit", "ccl", "ncl",	"clo"};
    enum{ BOL = 0, EOL, ANY, LIT, CCL, NCL, CLO};
    
    static char special_characters[][8] = {"space", "newline", "tab"};
    enum{SPACE, NEWLINE, TAB};
    
    enum{VALID_REGEXP = 0, INVALID_REGEXP};
    
    int encode_regular_expression(char *regexp, char *encode)
    {
    	/* i is the index of current character in regexp ang j is the current character being output */
    	int i, j, escaped;
    	char c;
    
    	i = j = escaped = 0;
    	encode[j++] = '(';
    	/* parse through the regular expression */
    	for(;;)
    	{
    		/* read the character in regexp in c */
    		c = regexp[i++];
    
    		if(c == '\0')
    			break;
    
    		if(c == '\\')								/* escape character */
    		{
    			escaped = 1;
    			continue;
    		}
    
    		encode[j++] = '(';
    
    		if(!escaped && c == '^' && i == 1)					/* begining of line character '^' is not escaped and located at the start of regexp */
    			j = copy_code(BOL, encode, j);
    		else if(!escaped && c == '$' && regexp[i] == '\0')		/* end of line character '$' is not escaped and located at the end of regexp */
    			j = copy_code(EOL, encode, j);
    		else if(!escaped && c == '.')							/* any character */
    		{
    			if(regexp[i] == '*')
    			{
    				j = copy_code(CLO, encode, j);
    				encode[j++] = ' ';
    				i++;
    			}
    
    			j = copy_code(ANY, encode, j);
    		}
    		else if(!escaped && c == '[')							/* begining of character class */
    		{
    			int k;
    
    			/* check for closure and validity of character class */
    			for(k = i; regexp[k] != '\0' && regexp[k] != ']'; k++);
    
    			/* if no ending bracket, return invalid regexp */
    			if(regexp[k] == '\0')
    				return INVALID_REGEXP;
    
    			/* check for closure */
    			if(regexp[k+1] == '*')
    			{
    				j = copy_code(CLO, encode, j);
    				encode[j++] = ' ';
    			}
    
    			/* check for negated character */
    			if(regexp[i] == '^')
    			{
    				j = copy_code(NCL, encode, j);
    				i++;
    			}
    			else
    				j = copy_code(CCL, encode, j);
    
    			while(regexp[i] != '\0' && regexp[i] != ']')
    			{
    				char start, end;
    
    				if(regexp[i+1] != '-')
    				{
    					j = copy_character(regexp[i], encode, j);
    					i++;
    					continue;
    				}
    
    				if(!isalnum(regexp[i]) && !isalnum(regexp[i+2]))
    					return INVALID_REGEXP;
    
    				if(isalpha(regexp[i])  && !isalpha(regexp[i+2]))
    					return INVALID_REGEXP;
    
    				if(isdigit(regexp[i])  && !isdigit(regexp[i+2]))
    					return INVALID_REGEXP;
    
    				if(regexp[i] > regexp[i+2])
    					return INVALID_REGEXP;
    
    				/*now expand the given range */
    				for(start = regexp[i], end = regexp[i+2]; start <= end; start++)
    					j = copy_character(start, encode, j);
    
    				i = i + 3;
    			}
    
    			i++;
    			if(regexp[i] == '*')
    				i++;
    		}
    		else							/* literal character */
    		{
    			if(regexp[i] == '*')
    			{
    				j = copy_code(CLO, encode, j);
    				encode[j++] = ' ';
    				i++;
    			}
    
    			j = copy_code(LIT, encode, j);
    
    			/* handle special characters */
    			if(escaped && c == 'n')
    				j = copy_special_characters(NEWLINE, encode, j);
    			else if(escaped && c == 't')
    				j = copy_special_characters(TAB, encode, j);
    			else if(c == ' ')
    				j = copy_special_characters(SPACE, encode, j);
    			else
    				j = copy_character(c, encode, j);
    		}
    		encode[j++] = ')';
    		encode[j++] = ' ';
    	}
    	encode[j-1] = ')';
    	encode[j] = '\0';
    	return VALID_REGEXP;
    }
    
    static int copy_code(int code, char *s, int i)
    {
    	int j;
    
    	for(j = 0; codes[code][j] != '\0'; j++)
    		s[i++] = codes[code][j];
    
    	return i;
    }
    
    static int copy_special_characters(int code, char *s, int i)
    {
    	int j;
    
    	s[i++] = ' ';
    	s[i++] = '#';
    	s[i++] = '\\';
    
    	for(j = 0; special_characters[code][j] != '\0'; j++)
    		s[i++] = special_characters[code][j];
    
    	return i;
    }
    
    static int copy_character(int c, char *s, int i)
    {
    	s[i++] = ' ';
    	s[i++] = '#';
    	s[i++] = '\\';
    	s[i++] = c;
    
    	return i;
    }
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s