Text File Databases: Part 1

October 19, 2010

We start with the reader for fixed-length records, which is similar to the reader of a previous exercise:

(define (read-fixed-record size defs . port)
  (let ((p (if (null? port) (current-input-port) (car port))))
    (let ((fix-rec (read-chars size p)))
      (if (eof-object? fix-rec)
          fix-rec
          (let loop ((defs defs) (result '()))
            (if (null? defs)
                (reverse result)
                (loop (cdr defs)
                      (cons (substring fix-rec (caar defs) (cadar defs)) result))))))))

Here size is the number of characters in the record and defs is a list of start/end pairs with zero-based positions suitable for substring. Read-chars may be used to strip a fixed-length header from a file:

(define (read-chars n . port)
  (let ((p (if (null? port) (current-input-port) (car port))))
    (if (eof-object? (peek-char p))
        (peek-char p)
        (let loop ((n n) (c (peek-char p)) (s '()))
          (cond ((and (eof-object? c) (pair? s)) (list->string (reverse s)))
                ((eof-object? c) c)
                ((zero? n) (list->string (reverse s)))
                (else (let ((c (read-char p)))
                        (loop (sub1 n) (peek-char p) (cons c s)))))))))

The reader for character-delimited records was also previewed in the exercise on cut:

(define (read-delim-record . args)
  (define (eat p c)
    (if (and (not (eof-object? (peek-char p)))
             (char=? (peek-char p) c))
        (read-char p)))
  (define (read-delim delim port)
    (cond ((eof-object? (peek-char port)) (peek-char port))
          ((and delim (or (char=? delim #\return) (char=? delim #\newline)))
            (let loop ((f (read-line port)) (fields '()))
              (if (or (eof-object? f) (string=? f ""))
                  (reverse fields)
                  (loop (read-line port) (cons f fields)))))
          (else
            (let loop ((c (read-char port)) (field '()) (fields '()))
              (cond ((eof-object? c) (reverse (cons (list->string (reverse field)) fields)))
                    ((char=? #\return c) (eat port #\newline)
                      (reverse (cons (list->string (reverse field)) fields)))
                    ((char=? #\newline c) (eat port #\return)
                      (reverse (cons (list->string (reverse field)) fields)))
                    ((and delim (char=? delim c))
                      (loop (read-char port) '() (cons (list->string (reverse field)) fields)))
                    ((char-whitespace? c)
                      (if (char-whitespace? (peek-char port))
                          (loop (read-char port) field fields)
                          (loop (read-char port) '()
                                (cons (list->string (reverse field)) fields))))
                    (else (loop (read-char port) (cons c field) fields)))))))
  (cond ((null? args) (read-delim #f (current-input-port)))
        ((and (null? (cdr args)) (char? (car args)))
          (read-delim (car args) (current-input-port)))
        ((and (null? (cdr args)) (port? (car args)))
          (read-delim #f (car args)))
        ((and (pair? (cdr args)) (null? (cddr args)) (char? (car args)) (port? (cadr args)))
          (read-delim (car args) (cadr args)))
        (else (read-delim #f (current-input-port)))))

This function is messy for two reasons: First, it has to handle all types of end-of-line markers. Second, it makes the delim argument optional; if it is not given, then each maximal sequence of whitespace characters (that is, one or more characters from the set that includes #\space, #\tab, and possibly other characters) forms a delimiter.

The reader for comma-separated values tables was also previewed in a previous exercise:

(define (read-csv-record . args)
  (define (read-csv delim port)
    (define (add-field field fields)
      (cons (list->string (reverse field)) fields))
    (define (start field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (reverse fields))
              ((char=? #\return c) (carriage-return field fields))
              ((char=? #\newline c) (line-feed field fields))
              ((char=? #\" c) (quoted-field field fields))
              ((char=? delim c) (not-field '() (add-field field fields)))
              (else (unquoted-field (cons c field) fields)))))
    (define (not-field field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (cons "" fields))
              ((char=? #\return c) (carriage-return '() (add-field field fields)))
              ((char=? #\newline c) (line-feed '() (add-field field fields)))
              ((char=? #\" c) (quoted-field field fields))
              ((char=? delim c) (not-field '() (add-field field fields)))
              (else (unquoted-field (cons c field) fields)))))
    (define (quoted-field field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (add-field field fields))
              ((char=? #\" c) (may-be-doubled-quotes field fields))
              (else (quoted-field (cons c field) fields)))))
    (define (may-be-doubled-quotes field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (add-field field fields))
              ((char=? #\return c) (carriage-return '() (add-field field fields)))
              ((char=? #\newline c) (line-feed '() (add-field field fields)))
              ((char=? #\" c) (quoted-field (cons #\" field) fields))
              ((char=? delim c) (not-field '() (add-field field fields)))
              (else (unquoted-field (cons c field) fields)))))
    (define (unquoted-field field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (add-field field fields))
              ((char=? #\return c) (carriage-return '() (add-field field fields)))
              ((char=? #\newline c) (line-feed '() (add-field field fields)))
              ((char=? delim c) (not-field '() (add-field field fields)))
              (else (unquoted-field (cons c field) fields)))))
    (define (carriage-return field fields)
      (let ((c (peek-char port)))
        (cond ((eof-object? c) fields)
              ((char=? #\newline c) (read-char port) fields)
              (else fields))))
    (define (line-feed field fields)
      (let ((c (peek-char port)))
        (cond ((eof-object? c) fields)
              ((char=? #\return c) (read-char port) fields)
              (else fields))))
    (if (eof-object? (peek-char port)) (peek-char port) (reverse (start '() '()))))
  (cond ((null? args) (read-csv #\, (current-input-port)))
        ((and (null? (cdr args)) (char? (car args)))
          (read-csv (car args) (current-input-port)))
        ((and (null? (cdr args)) (port? (car args)))
          (read-csv #\, (car args)))
        ((and (pair? (cdr args)) (null? (cddr args)) (char? (car args)) (port? (cadr args)))
          (read-csv (car args) (cadr args)))
        (else (read-csv #\, (current-input-port)))))

This is just a big state machine, with gotos implemented by functions in tail position. The delimiter may be changed from the default comma by giving a different character as the first argument.

Finally, here is the reader for name-value records:

(define (read-name-value-record . args)
  (define (eat p c)
    (if (and (not (eof-object? (peek-char p)))
             (char=? (peek-char p) c))
        (read-char p)))
  (define (read-name-value delim port)
    (if (eof-object? (peek-char port))
        (peek-char port)
        (let loop ((c (read-char port)) (key '()) (value '()) (fields '()))
          (if (string? key)
              (cond ((eof-object? c)
                      (reverse (cons (cons key (list->string (reverse value))) fields)))
                    ((char=? #\return c) (eat port #\newline)
                      (loop (read-char port) '() '()
                            (cons (cons key (list->string (reverse value))) fields)))
                    ((char=? #\newline c) (eat port #\return)
                      (loop (read-char port) '() '()
                            (cons (cons key (list->string (reverse value))) fields)))
                    (else (loop (read-char port) key (cons c value) fields)))
              (cond ((eof-object? c)
                      (reverse (cons (cons (list->string (reverse key)) "") fields)))
                    ((char=? #\return c) (eat port #\newline)
                      (reverse (cons (cons (list->string (reverse key)) "") fields)))
                    ((char=? #\newline c) (eat port #\return)
                      (reverse (cons (cons (list->string (reverse key)) "") fields)))
                    ((and delim (char=? delim c))
                      (loop (read-char port) (list->string (reverse key)) value fields))
                    ((and (not delim) (char-whitespace? c))
                      (if (char-whitespace? (peek-char port))
                          (loop (read-char port) key value fields)
                          (loop (read-char port) (list->string (reverse key)) value fields)))
                    (else (loop (read-char port) (cons c key) value fields)))))))
  (cond ((null? args) (read-name-value #f (current-input-port)))
        ((and (null? (cdr args)) (char? (car args)))
          (read-name-value (car args) (current-input-port)))
        ((and (null? (cdr args)) (port? (car args)))
          (read-name-value #f (car args)))
        ((and (pair? (cdr args)) (null? (cddr args)) (char? (car args)) (port? (cadr args)))
          (read-name-value (car args) (cadr args)))
        (else (read-name-value #f (current-input-port)))))

Like the other functions, it takes an optional first argument that defines the delimiter between the name and value portions of each line; if none is given, it defaults to the first maximal run of whitespace characters on the line. Note that only the first delimiter is special; any subsequent appearances of the delimiter are just part of the data. The reader takes multiple lines until it finds a blank line (two successive end-of-line markers).

The code is collected at http://programmingpraxis.codepad.org/h3ENphv5.

Advertisement

Pages: 1 2

2 Responses to “Text File Databases: Part 1”

  1. […] today’s Programming Praxis exercise our goal is to read data from four different types of text file […]

  2. Remco Niemeijer said

    My Haskell solution (see http://bonsaicode.wordpress.com/2010/10/19/programming-praxis-text-file-databases-part-1/ for a version with comments):

    import Control.Applicative ((<*), (<*>), (*>), (<$>))
    import Text.Parsec
    import Text.Parsec.String
    
    eol :: Parser ()
    eol = (char '\n' *> optional (char '\r')) <|>
          (char '\r' *> optional (char '\n')) <|> eof
    
    fixedLength :: [Int] -> Parser [String]
    fixedLength fields = foldr (\n p -> (:) <$> count n anyChar <*> p)
                               (return []) fields <* eol
    
    charDelim :: Parser a -> Parser [String]
    charDelim sep = manyTill field eol where
        field = manyTill anyChar ((sep *> return ()) <|> lookAhead eol)
    
    csv :: Parser [String]
    csv = sepBy field (char ',') <* eol where
        field = quoted <|> many (noneOf ",\n\r")
        quoted = between (char '"') (char '"') $
                 many (try (char '"' <* char '"') <|> noneOf "\"")
    
    nameValue :: Parser a -> Parser [(String, String)]
    nameValue sep = manyTill field eol where
        field = (,) <$> manyTill anyChar sep <*> manyTill anyChar eol
    
    readDB :: Parser a -> FilePath -> IO (Either ParseError [a])
    readDB record = fmap (parse (manyTill record eof) "") . readFile
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: