Comma-Separated Values
March 17, 2009
The obvious approach to a problem like this is to build a state machine with transitions based on the current input character. In Scheme, a state machine is programmed as a series of mutually-recursive functions; since all the functions are called in tail position, the calculation is iterative, not recursive — no stack space is consumed.
; READ-CSV-RECORD [DELIM] [PORT]
(define (read-csv-record . args)
(define (read-csv delim port)
(define (add-field field fields)
(cons (list->string (reverse field)) fields))
(define (start field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (reverse fields))
((char=? #\return c) (carriage-return field fields))
((char=? #\newline c) (line-feed field fields))
((char=? #\" c) (quoted-field field fields))
((char=? delim c) (not-field '() (add-field field fields)))
(else (unquoted-field (cons c field) fields)))))
(define (not-field field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (cons "" fields))
((char=? #\return c) (carriage-return '() (add-field field fields)))
((char=? #\newline c) (line-feed '() (add-field field fields)))
((char=? #\" c) (quoted-field field fields))
((char=? delim c) (not-field '() (add-field field fields)))
(else (unquoted-field (cons c field) fields)))))
(define (quoted-field field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (add-field field fields))
((char=? #\" c) (may-be-doubled-quotes field fields))
(else (quoted-field (cons c field) fields)))))
(define (may-be-doubled-quotes field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (add-field field fields))
((char=? #\return c) (carriage-return '() (add-field field fields)))
((char=? #\newline c) (line-feed '() (add-field field fields)))
((char=? #\" c) (quoted-field (cons #\" field) fields))
((char=? delim c) (not-field '() (add-field field fields)))
(else (unquoted-field (cons c field) fields)))))
(define (unquoted-field field fields)
(let ((c (read-char port)))
(cond ((eof-object? c) (add-field field fields))
((char=? #\return c) (carriage-return '() (add-field field fields)))
((char=? #\newline c) (line-feed '() (add-field field fields)))
((char=? delim c) (not-field '() (add-field field fields)))
(else (unquoted-field (cons c field) fields)))))
(define (carriage-return field fields)
(let ((c (peek-char port)))
(cond ((eof-object? c) fields)
((char=? #\newline c) (read-char port) fields)
(else fields))))
(define (line-feed field fields)
(let ((c (peek-char port)))
(cond ((eof-object? c) fields)
((char=? #\return c) (read-char port) fields)
(else fields))))
(if (eof-object? (peek-char port)) (peek-char port) (reverse (start '() '()))))
(cond ((null? args) (read-csv #\, (current-input-port)))
((and (null? (cdr args)) (char? (car args)))
(read-csv (car args) (current-input-port)))
((and (null? (cdr args)) (port? (car args)))
(read-csv #\, (car args)))
((and (pair? (cdr args)) (null? (cddr args)) (char? (car args)) (port? (cadr args)))
(read-csv (car args) (cadr args)))
(else (read-csv #\, (current-input-port)))))
We build the test input file in a string:
(define csv-test (string-append
"1,abc,def ghi,jkl,unquoted character strings\n"
"2,\"abc\",\"def ghi\",\"jkl\",quoted character strings\n"
"3,123,456,789,numbers\n"
"4, abc,def , ghi ,strings with whitespace\n"
"5, \"abc\",\"def\" , \"ghi\" ,quoted strings with whitespace\n"
"6, 123,456 , 789 ,numbers with whitespace\n"
"7,TAB123,456TAB,TAB789TAB,numbers with tabs for whitespace\n"
"8, -123, +456, 1E3,more numbers with whitespace\n"
"9,123 456,123\"456, 123 456 ,strange numbers\n"
"10,abc\",de\"f,g\"hi,embedded quotes\n"
"11,\"abc\"\"\",\"de\"\"f\",\"g\"\"hi\",quoted embedded quotes\n"
"12,\"\",\"\" \"\",\"\"x\"\",doubled quotes\n"
"13,\"abc\"def,abc\"def\",\"abc\" \"def\",strange quotes\n"
"14,,\"\", ,empty fields\n"
"15,abc,\"def\n ghi\",jkl,embedded newline\n"
"16,abc,\"def\",789,multiple types of fields\n"))
Then the test is performed by taking input from a string port:
> (with-input-from-string csv-test (lambda ()
(do ((csv-record (read-csv-record) (read-csv-record)))
((eof-object? csv-record))
(display (list-ref csv-record 0)) (display "|")
(display (list-ref csv-record 1)) (display "|")
(display (list-ref csv-record 2)) (display "|")
(display (list-ref csv-record 3)) (display "|")
(display (list-ref csv-record 4)) (newline))))
1|abc|def ghi|jkl|unquoted character strings
2|abc|def ghi|jkl|quoted character strings
3|123|456|789|numbers
4| abc|def | ghi |strings with whitespace
5| "abc"|def | "ghi" |quoted strings with whitespace
6| 123|456 | 789 |numbers with whitespace
7|TAB123|456TAB|TAB789TAB|numbers with tabs for whitespace
8| -123| +456| 1E3|more numbers with whitespace
9|123 456|123"456| 123 456 |strange numbers
10|abc"|de"f|g"hi|embedded quotes
11|abc"|de"f|g"hi|quoted embedded quotes
12|| ""|x""|doubled quotes
13|abcdef|abc"def"|abc "def"|strange quotes
14||| |empty fields
15|abc|def
ghi|jkl|embedded newline
16|abc|def|789|multiple types of fields
The sample solution is available at http://programmingpraxis.codepad.org/8q3sQGzA, though it’s not possible to perform the test because the Scheme system at codepad.org doesn’t provide with-input-from-string
.
A quick and dirty Ruby hack:
def separated_vals(filename, separator = ‘,’)
lines = Array.new
sep = Regexp.compile(“\\s*\\”+separator+”\\s*”)
File.open(filename) do |file_handle|
lines << file_handle.gets.split(sep)
end
return lines
end
I don’t think that properly handles all the cases.
You’re quite right; it doesn’t quite work.
Here’s a better version:
def separated_vals(filename, separator = ‘,’)
lines = Array.new
File.open(filename) do |file_handle|
while not file_handle.eof? do
lines << file_handle.gets.strip.split(separator)
end
end
return lines
end
This works for all test cases with one omission – it keeps quotes around quoted strings. Now, instead of hacking further, I think I’ll sit down and write a proper state machine :)
Haskell (parses every test case correctly):
import Control.Applicative ((<*), (*>), (<*>), (<$>))
import Text.Parsec
main = readFile “csv.txt” >>= print . parseCSV ‘,’
parseCSV sep = parse (sepBy (line sep) (many1 $ oneOf “\r\n”)) “”
line sep = sepBy (field sep) (char sep)
field sep = option “” dropQuotes <+> (ws <+> (keepQuotes <|> plain sep)) <+> ws
plain sep = many (noneOf $ sep:”\r\n”)
dropQuotes = try (between’ “\”\”” quoted) <|> between’ “\”” quoted
keepQuotes = (string “\”” <+> quoted <+> string “\””)
quoted = many (noneOf “\”” <|> (head <$> try (string “\”\””)))
ws = many (oneOf ” \t”)
a <+> b = (++) <$> a <*> b
between’ s p = string s *> p <* string s [/sourcecode]
https://github.com/ftt/programming-praxis/blob/master/20090317-comma-separated-values/csv.py
[…] Reddit or Stack Overflow or someplace about handling CSV files with awk. We’ve done that in a previous exercise, but today I decided to handle CSV files in a different way. Specifically, I wrote an awk function […]