Regular Expressions, Part 1

September 15, 2009

We represent a regular expression in its intermediate form as a list of rx-elements:

'(bol) — beginning of line
'(eol) — end of line
'(any) — any (unspecified) character
'(lit c) — the literal character c
'(ccl c . . .) — the character class containing the literal characters c . . .
'(ncl c . . .) — the negated character class not containing any of the literal characters c . . .
'(clo x) — a closure, consisting of any number (zero or more) of the enclosed rx-element, which may be a literal character, an unspecified character, or a character class

The examples from the prior page are encoded as shown below:

[0-9][0-9]* — ((ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (clo ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))
^..*$ — ((bol) (any) (clo any) (eol))
hello — ((lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o))
^ *hello *$ — ((bol) (clo lit #\space) (lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o) (clo lit #\space) (eol))
^[^x].*[0-9] *x$ — ((bol) (ncl #\x) (clo any) (ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (clo lit #\space) (lit x) (eol))

The parser loops through the regular expression, using the list-match macro from the Standard Prelude to identify rx-elements:


(define (make-rx regexp)

  (let loop ((first? #t) (rs (string->list regexp)) (zs '()))

    (list-match rs

      (() (reverse zs))

      ((#$) (reverse (cons '(eol) zs)))

      ((#\^ . rest) first? (loop #f rest (cons '(bol) zs)))

      ((#\\ c \#* . rest)

        (loop #f rest (cons (list 'clo 'lit

          (case c ((#\n) #\newline) ((#\t) #\tab) (else c))) zs)))

      ((#\\ c . rest)

        (loop #f rest (cons (list 'lit

          (case c ((#\n) #\newline) ((#\t) #\tab) (else c))) zs)))

      ((#\[ #\^ . rest)

        (list-match (get-class rest)

          ((class) (reverse (cons (cons 'ncl class) zs)))

          ((class rest)

            (if (and (pair? rest) (char=? (car rest) #\*))

                (loop #f (cdr rest) (cons (cons 'clo (cons 'ncl class)) zs))

                (loop #f rest (cons (cons 'ncl class) zs))))))

      ((#\[ . rest)

        (list-match (get-class rest)

          ((class) (reverse (cons (cons 'ccl class) zs)))

          ((class rest)

            (if (and (pair? rest) (char=? (car rest) #\*))

                (loop #f (cdr rest) (cons (cons 'clo (cons 'ccl class)) zs))

                (loop #f rest (cons (cons 'ccl class) zs))))))

      ((#\. #\* . rest) (loop #f rest (cons (list 'clo 'any) zs)))

      ((#\. . rest) (loop #f rest (cons '(any) zs)))

      ((c #\* . rest) (loop #f rest (cons (list 'clo 'lit c) zs)))

      ((c . rest) (loop #f rest (cons (list 'lit c) zs)))

      (else (error 'make-rx "unrecognized regular expression")))))
First? is #t initially then #f forever; it is used to distinguish the caret that indicates a beginning-of-line rx-element from any other caret.  Most of the other pieces are unremarkable, except the code for the two kinds of character classes; both call get-class to extract the characters in the class and point to the next character after the class.  Closures are checked explicitly after the literal character, unspecified character, and the two character classes.
Get-class returns two values in a list, the list of characters that form the class, and the remaining regular-expression text after the class; it uses range from the Standard Prelude:

(define (get-class class)

  (define (char-range a b)

    (map integer->char

      (range (char->integer a) (+ (char->integer b) 1))))

  (let loop ((cs class) (zs '()))

    (list-match cs

      ((#\] . rest) (pair? zs) (list zs rest))

      ((#\] . rest) (loop rest (cons #\] zs)))

      ((#\ c . rest) (loop rest (cons c zs)))

      ((a #\- b . rest)

        (or (and (char-numeric? a) (char-numeric? b) (char<? a b))

            (and (char-upper-case? a) (char-upper-case? b) (char<? a b))

            (and (char-lower-case? a) (char-lower-case? b) (char<? a b)))

        (loop rest (append (char-range a b) zs)))

      ((c . rest) (loop rest (cons c zs)))

      (else (error 'get-class "unrecognized class element")))))
Here is the parser in action on the sample regular expressions shown above:
> (make-rx "[0-9][0-9]*")

((ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)

  (clo ccl #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))

> (make-rx "^..*$")

((bol) (any) (clo any) (eol))

> (make-rx "hello")

((lit #\h) (lit #\e) (lit #\l) (lit #\l) (lit #\o))

> (make-rx "^ *hello *$")

((bol) (clo lit #\space) (lit #\h) (lit #\e) (lit #\l)

  (lit #\l) (lit #\o) (clo lit #\space) (eol))

> (make-rx "^[^x].*[0-9] *x$")

((bol) (ncl #\x) (clo any) (ccl #\0 #\1 #\2 #\3 #\4 #\5

  #\6 #\7 #\8 #\9) (clo lit #\space) (lit x) (eol))
You can run the parser at http://programmingpraxis.codepad.org/A4XmTLTu.

		
	
Share this:

				Share on Facebook (Opens in new window)
				Facebook
			

				Share on X (Opens in new window)
				X
			
Like Loading...


	Related
			Pages: 1 2




		
		
			Posted by programmingpraxis
			Filed in Exercises  
		

		8 Comments »


	
			8 Responses to “Regular Expressions, Part 1”

	
		
	
	
		Programming Praxis – Regular Expressions, Part 1 « Bonsai Code said
	

	
	September 15, 2009  at 3:13 PM 

	[…] Praxis – Regular Expressions, Part 1 By Remco Niemeijer  In today’s Programming Praxis problem our task is to write a parser for simple regular expressions. Since […]

	
		
	

	
	
	
		Remco Niemeijer said
	

	
	September 15, 2009  at 3:13 PM 

	My Haskell solution (see http://bonsaicode.wordpress.com/2009/09/15/programming-praxis-regular-expressions-part-1/ for a version with comments):
import Control.Applicative ((<$>), (*>), (<*), (<*>))
import Text.Parsec
import Text.Parsec.String

data Elem = Lit Char | Any | Set Bool [Char] deriving Show
data Chunk = Elem Elem | BoL | EoL | Star Elem deriving Show

regex :: Parser [Chunk]
regex = (++) <$> bol <*> many chunk where
    bol = option [] (const [BoL] <$> char '^')
    chunk = choice [Star <$> try (element <* char '*'),
                    const EoL <$> try (char '$' <* eof),
                    Elem <$> element]
    element = choice [const Any <$> char '.',
                      Set False . expandSet <$> set "[^",
                      Set True . expandSet <$> set "[",
                      Lit <$> anyChar]
    set s = try (string s *> many1 (noneOf "]") <* char '&#93;')
    expandSet (a:'-':b:xs) = &#91;a..b&#93; ++ expandSet xs
    expandSet (x:xs) = x : expandSet xs
    expandSet _ = &#91;&#93;

parseRegex :: String -> Either ParseError [Chunk]
parseRegex = parse regex ""


	
		
	

	
	
	
		Remco Niemeijer said
	

	
	September 16, 2009  at 12:08 AM 

	Updated code because I overlooked the bit of text between the two bulleted lists:
import Control.Applicative ((<$>), (*>), (<*), (<*>))
import Data.Char
import Text.Parsec
import Text.Parsec.String

data Elem = Lit Char | Esc Char | Any | Set Bool [Elem] deriving Show
data Chunk = Elem Elem | BoL | EoL | Star Elem deriving Show

regex :: Parser [Chunk]
regex = (++) <$> bol <*> many chunk where
    bol = option [] (const [BoL] <$> char '^')
    chunk = choice [Star <$> try (element <* char '*'),
                    const EoL <$> try (char '$' <* eof),
                    Elem <$> element]
    element = choice [esc <$> try (char '\\' *> anyChar),
                      const Any <$> char '.',
                      Set False . expandSet <$> set "[^",
                      Set True . expandSet <$> set "[",
                      Lit <$> noneOf "]"]
    esc c = if elem c "nt" then Esc c else Lit c
    set s = try (string s *> many1 element <* char '&#93;')
    expandSet (Lit a:Lit '-':Lit b:xs)
        | validRange a b = map Lit &#91;a..b&#93; ++ expandSet xs
    expandSet (x:xs) = x : expandSet xs
    expandSet _ = &#91;&#93;
    validRange a b = b > a && ((isLower a && isLower b) ||
                               (isUpper a && isUpper b) ||
                               (isDigit a && isDigit b))
    
parseRegex :: String -> Either ParseError [Chunk]
parseRegex = parse regex ""


	
		
	

	
	
	
		Jamie Hope said
	

	
	September 16, 2009  at 12:48 PM 

	In your listing here, the “#\\”s (corresponding to lines 55 and 58 of the codepad version) have been mangled into “#\”.

	
		
	

	
	
	
		programmingpraxis said
	

	
	September 16, 2009  at 12:54 PM 

	Jamie:  Thanks for pointing that out.  It’s fixed now.

	
		
	

	
	
	
		Matt Might said
	

	
	October 4, 2009  at 12:15 AM 

	I showed my compilers class how to build lexers using the derivative of regular expressions:
 http://matt.might.net/articles/implementation-of-regular-expression-matching-in-scheme-with-derivatives/
The implementation for regular languages is remarkably compact.

	
		
	

	
	
	
		programmingpraxis said
	

	
	October 4, 2009  at 12:21 AM 

	Brzozowski’s regular-expression matcher based on derivatives was on my to-do list.  Now I’ll have to take it off.

	
		
	

	
	
	
		Vikas Tandi said
	

	
	April 21, 2011  at 12:51 PM 

	A scratchy and long implementation in C
#include <stdio.h>
#include <ctype.h>

static int copy_code(int code, char *s, int i);
static int copy_character(int c, char *s, int i);
static int copy_special_characters(int code, char *s, int i);

static char codes[][4] = {"bol", "eol", "any", "lit", "ccl", "ncl",	"clo"};
enum{ BOL = 0, EOL, ANY, LIT, CCL, NCL, CLO};

static char special_characters[][8] = {"space", "newline", "tab"};
enum{SPACE, NEWLINE, TAB};

enum{VALID_REGEXP = 0, INVALID_REGEXP};

int encode_regular_expression(char *regexp, char *encode)
{
	/* i is the index of current character in regexp ang j is the current character being output */
	int i, j, escaped;
	char c;

	i = j = escaped = 0;
	encode[j++] = '(';
	/* parse through the regular expression */
	for(;;)
	{
		/* read the character in regexp in c */
		c = regexp[i++];

		if(c == '\0')
			break;

		if(c == '\\')								/* escape character */
		{
			escaped = 1;
			continue;
		}

		encode[j++] = '(';

		if(!escaped && c == '^' && i == 1)					/* begining of line character '^' is not escaped and located at the start of regexp */
			j = copy_code(BOL, encode, j);
		else if(!escaped && c == '$' && regexp[i] == '\0')		/* end of line character '$' is not escaped and located at the end of regexp */
			j = copy_code(EOL, encode, j);
		else if(!escaped && c == '.')							/* any character */
		{
			if(regexp[i] == '*')
			{
				j = copy_code(CLO, encode, j);
				encode[j++] = ' ';
				i++;
			}

			j = copy_code(ANY, encode, j);
		}
		else if(!escaped && c == '[')							/* begining of character class */
		{
			int k;

			/* check for closure and validity of character class */
			for(k = i; regexp[k] != '\0' && regexp[k] != ']'; k++);

			/* if no ending bracket, return invalid regexp */
			if(regexp[k] == '\0')
				return INVALID_REGEXP;

			/* check for closure */
			if(regexp[k+1] == '*')
			{
				j = copy_code(CLO, encode, j);
				encode[j++] = ' ';
			}

			/* check for negated character */
			if(regexp[i] == '^')
			{
				j = copy_code(NCL, encode, j);
				i++;
			}
			else
				j = copy_code(CCL, encode, j);

			while(regexp[i] != '\0' && regexp[i] != ']')
			{
				char start, end;

				if(regexp[i+1] != '-')
				{
					j = copy_character(regexp[i], encode, j);
					i++;
					continue;
				}

				if(!isalnum(regexp[i]) && !isalnum(regexp[i+2]))
					return INVALID_REGEXP;

				if(isalpha(regexp[i])  && !isalpha(regexp[i+2]))
					return INVALID_REGEXP;

				if(isdigit(regexp[i])  && !isdigit(regexp[i+2]))
					return INVALID_REGEXP;

				if(regexp[i] > regexp[i+2])
					return INVALID_REGEXP;

				/*now expand the given range */
				for(start = regexp[i], end = regexp[i+2]; start <= end; start++)
					j = copy_character(start, encode, j);

				i = i + 3;
			}

			i++;
			if(regexp[i] == '*')
				i++;
		}
		else							/* literal character */
		{
			if(regexp[i] == '*')
			{
				j = copy_code(CLO, encode, j);
				encode[j++] = ' ';
				i++;
			}

			j = copy_code(LIT, encode, j);

			/* handle special characters */
			if(escaped && c == 'n')
				j = copy_special_characters(NEWLINE, encode, j);
			else if(escaped && c == 't')
				j = copy_special_characters(TAB, encode, j);
			else if(c == ' ')
				j = copy_special_characters(SPACE, encode, j);
			else
				j = copy_character(c, encode, j);
		}
		encode[j++] = ')';
		encode[j++] = ' ';
	}
	encode[j-1] = ')';
	encode[j] = '\0';
	return VALID_REGEXP;
}

static int copy_code(int code, char *s, int i)
{
	int j;

	for(j = 0; codes[code][j] != '\0'; j++)
		s[i++] = codes[code][j];

	return i;
}

static int copy_special_characters(int code, char *s, int i)
{
	int j;

	s[i++] = ' ';
	s[i++] = '#';
	s[i++] = '\\';

	for(j = 0; special_characters[code][j] != '\0'; j++)
		s[i++] = special_characters[code][j];

	return i;
}

static int copy_character(int c, char *s, int i)
{
	s[i++] = ' ';
	s[i++] = '#';
	s[i++] = '\\';
	s[i++] = c;

	return i;
}


	
		
	

	

	
	
	
	

		
		Leave a comment






			
			
Δ

S	M	T	W	T	F	S
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30

Programming Praxis

Regular Expressions, Part 1

September 15, 2009

8 Responses to “Regular Expressions, Part 1”

Leave a comment

Categories

Archives

Archives