Text File Databases: Part 1
October 19, 2010
We start with the reader for fixed-length records, which is similar to the reader of a previous exercise:
(define (read-fixed-record size defs . port)
(let ((p (if (null? port) (current-input-port) (car port))))
(let ((fix-rec (read-chars size p)))
(if (eof-object? fix-rec)
fix-rec
(let loop ((defs defs) (result '()))
(if (null? defs)
(reverse result)
(loop (cdr defs)
(cons (substring fix-rec (caar defs) (cadar defs)) result))))))))
Here size
is the number of characters in the record and defs
is a list of start/end pairs with zero-based positions suitable for substring
. Read-chars
may be used to strip a fixed-length header from a file:
(define (read-chars n . port)
(let ((p (if (null? port) (current-input-port) (car port))))
(if (eof-object? (peek-char p))
(peek-char p)
(let loop ((n n) (c (peek-char p)) (s '()))
(cond ((and (eof-object? c) (pair? s)) (list->string (reverse s)))
((eof-object? c) c)
((zero? n) (list->string (reverse s)))
(else (let ((c (read-char p)))
(loop (sub1 n) (peek-char p) (cons c s)))))))))
The reader for character-delimited records was also previewed in the exercise on cut
:
(define (read-delim-record . args)
(define (eat p c)
(if (and (not (eof-object? (peek-char p)))
(char=? (peek-char p) c))
(read-char p)))
(define (read-delim delim port)
(cond ((eof-object? (peek-char port)) (peek-char port))
((and delim (or (char=? delim #\return) (char=? delim #\newline)))
(let loop ((f (read-line port)) (fields '()))
(if (or (eof-object? f) (string=? f ""))
(reverse fields)
(loop (read-line port) (cons f fields)))))
(else
(let loop ((c (read-char port)) (field '()) (fields '()))
(cond ((eof-object? c) (reverse (cons (list->string (reverse field)) fields)))
((char=? #\return c) (eat port #\newline)
(reverse (cons (list->string (reverse field)) fields)))
((char=? #\newline c) (eat port #\return)
(reverse (cons (list->string (reverse field)) fields)))
((and delim (char=? delim c))
(loop (read-char port) '() (cons (list->string (reverse field)) fields)))
((char-whitespace? c)
(if (char-whitespace? (peek-char port))
(loop (read-char port) field fields)
(loop (read-char port) '()
(cons (list->string (reverse field)) fields))))
(else (loop (read-char port) (cons c field) fields)))))))
(cond ((null? args) (read-delim #f (current-input-port)))
((and (null? (cdr args)) (char? (car args)))
(read-delim (car args) (current-input-port)))
((and (null? (cdr args)) (port? (car args)))
(read-delim #f (car args)))
((and (pair? (cdr args)) (null? (cddr args)) (char? (car args)) (port? (cadr args)))
(read-delim (car args) (cadr args)))
(else (read-delim #f (current-input-port)))))
This function is messy for two reasons: First, it has to handle all types of end-of-line markers. Second, it makes the delim
argument optional; if it is not given, then each maximal sequence of whitespace characters (that is, one or more characters from the set that includes #\space, #\tab, and possibly other characters) forms a delimiter.
The reader for comma-separated values tables was also previewed in a previous exercise:
(define (read-csv-record . args)
(define (read-csv delim port)
(define (add-field field fields)
(cons (list->string (reverse field)) fields))
(define (start field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (reverse fields))
((char=? #\return c) (carriage-return field fields))
((char=? #\newline c) (line-feed field fields))
((char=? #\" c) (quoted-field field fields))
((char=? delim c) (not-field '() (add-field field fields)))
(else (unquoted-field (cons c field) fields)))))
(define (not-field field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (cons "" fields))
((char=? #\return c) (carriage-return '() (add-field field fields)))
((char=? #\newline c) (line-feed '() (add-field field fields)))
((char=? #\" c) (quoted-field field fields))
((char=? delim c) (not-field '() (add-field field fields)))
(else (unquoted-field (cons c field) fields)))))
(define (quoted-field field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (add-field field fields))
((char=? #\" c) (may-be-doubled-quotes field fields))
(else (quoted-field (cons c field) fields)))))
(define (may-be-doubled-quotes field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (add-field field fields))
((char=? #\return c) (carriage-return '() (add-field field fields)))
((char=? #\newline c) (line-feed '() (add-field field fields)))
((char=? #\" c) (quoted-field (cons #\" field) fields))
((char=? delim c) (not-field '() (add-field field fields)))
(else (unquoted-field (cons c field) fields)))))
(define (unquoted-field field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (add-field field fields))
((char=? #\return c) (carriage-return '() (add-field field fields)))
((char=? #\newline c) (line-feed '() (add-field field fields)))
((char=? delim c) (not-field '() (add-field field fields)))
(else (unquoted-field (cons c field) fields)))))
(define (carriage-return field fields)
(let ((c (peek-char port)))
(cond ((eof-object? c) fields)
((char=? #\newline c) (read-char port) fields)
(else fields))))
(define (line-feed field fields)
(let ((c (peek-char port)))
(cond ((eof-object? c) fields)
((char=? #\return c) (read-char port) fields)
(else fields))))
(if (eof-object? (peek-char port)) (peek-char port) (reverse (start '() '()))))
(cond ((null? args) (read-csv #\, (current-input-port)))
((and (null? (cdr args)) (char? (car args)))
(read-csv (car args) (current-input-port)))
((and (null? (cdr args)) (port? (car args)))
(read-csv #\, (car args)))
((and (pair? (cdr args)) (null? (cddr args)) (char? (car args)) (port? (cadr args)))
(read-csv (car args) (cadr args)))
(else (read-csv #\, (current-input-port)))))
This is just a big state machine, with goto
s implemented by functions in tail position. The delimiter may be changed from the default comma by giving a different character as the first argument.
Finally, here is the reader for name-value records:
(define (read-name-value-record . args)
(define (eat p c)
(if (and (not (eof-object? (peek-char p)))
(char=? (peek-char p) c))
(read-char p)))
(define (read-name-value delim port)
(if (eof-object? (peek-char port))
(peek-char port)
(let loop ((c (read-char port)) (key '()) (value '()) (fields '()))
(if (string? key)
(cond ((eof-object? c)
(reverse (cons (cons key (list->string (reverse value))) fields)))
((char=? #\return c) (eat port #\newline)
(loop (read-char port) '() '()
(cons (cons key (list->string (reverse value))) fields)))
((char=? #\newline c) (eat port #\return)
(loop (read-char port) '() '()
(cons (cons key (list->string (reverse value))) fields)))
(else (loop (read-char port) key (cons c value) fields)))
(cond ((eof-object? c)
(reverse (cons (cons (list->string (reverse key)) "") fields)))
((char=? #\return c) (eat port #\newline)
(reverse (cons (cons (list->string (reverse key)) "") fields)))
((char=? #\newline c) (eat port #\return)
(reverse (cons (cons (list->string (reverse key)) "") fields)))
((and delim (char=? delim c))
(loop (read-char port) (list->string (reverse key)) value fields))
((and (not delim) (char-whitespace? c))
(if (char-whitespace? (peek-char port))
(loop (read-char port) key value fields)
(loop (read-char port) (list->string (reverse key)) value fields)))
(else (loop (read-char port) (cons c key) value fields)))))))
(cond ((null? args) (read-name-value #f (current-input-port)))
((and (null? (cdr args)) (char? (car args)))
(read-name-value (car args) (current-input-port)))
((and (null? (cdr args)) (port? (car args)))
(read-name-value #f (car args)))
((and (pair? (cdr args)) (null? (cddr args)) (char? (car args)) (port? (cadr args)))
(read-name-value (car args) (cadr args)))
(else (read-name-value #f (current-input-port)))))
Like the other functions, it takes an optional first argument that defines the delimiter between the name and value portions of each line; if none is given, it defaults to the first maximal run of whitespace characters on the line. Note that only the first delimiter is special; any subsequent appearances of the delimiter are just part of the data. The reader takes multiple lines until it finds a blank line (two successive end-of-line markers).
The code is collected at http://programmingpraxis.codepad.org/h3ENphv5.
[…] today’s Programming Praxis exercise our goal is to read data from four different types of text file […]
My Haskell solution (see http://bonsaicode.wordpress.com/2010/10/19/programming-praxis-text-file-databases-part-1/ for a version with comments):