Comma-Separated Values

March 17, 2009

The obvious approach to a problem like this is to build a state machine with transitions based on the current input character. In Scheme, a state machine is programmed as a series of mutually-recursive functions; since all the functions are called in tail position, the calculation is iterative, not recursive — no stack space is consumed.

; READ-CSV-RECORD [DELIM] [PORT]
(define (read-csv-record . args)
  (define (read-csv delim port)
    (define (add-field field fields)
      (cons (list->string (reverse field)) fields))
    (define (start field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (reverse fields))
              ((char=? #\return c) (carriage-return field fields))
              ((char=? #\newline c) (line-feed field fields))
              ((char=? #\" c) (quoted-field field fields))
              ((char=? delim c) (not-field '() (add-field field fields)))
              (else (unquoted-field (cons c field) fields)))))
    (define (not-field field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (cons "" fields))
              ((char=? #\return c) (carriage-return '() (add-field field fields)))
              ((char=? #\newline c) (line-feed '() (add-field field fields)))
              ((char=? #\" c) (quoted-field field fields))
              ((char=? delim c) (not-field '() (add-field field fields)))
              (else (unquoted-field (cons c field) fields)))))
    (define (quoted-field field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (add-field field fields))
              ((char=? #\" c) (may-be-doubled-quotes field fields))
              (else (quoted-field (cons c field) fields)))))
    (define (may-be-doubled-quotes field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (add-field field fields))
              ((char=? #\return c) (carriage-return '() (add-field field fields)))
              ((char=? #\newline c) (line-feed '() (add-field field fields)))
              ((char=? #\" c) (quoted-field (cons #\" field) fields))
              ((char=? delim c) (not-field '() (add-field field fields)))
              (else (unquoted-field (cons c field) fields)))))
    (define (unquoted-field field fields)
      (let ((c (read-char port)))
        (cond ((eof-object? c) (add-field field fields))
              ((char=? #\return c) (carriage-return '() (add-field field fields)))
              ((char=? #\newline c) (line-feed '() (add-field field fields)))
              ((char=? delim c) (not-field '() (add-field field fields)))
              (else (unquoted-field (cons c field) fields)))))
    (define (carriage-return field fields)
      (let ((c (peek-char port)))
        (cond ((eof-object? c) fields)
              ((char=? #\newline c) (read-char port) fields)
              (else fields))))
    (define (line-feed field fields)
      (let ((c (peek-char port)))
        (cond ((eof-object? c) fields)
              ((char=? #\return c) (read-char port) fields)
              (else fields))))
    (if (eof-object? (peek-char port)) (peek-char port) (reverse (start '() '()))))
  (cond ((null? args) (read-csv #\, (current-input-port)))
        ((and (null? (cdr args)) (char? (car args)))
          (read-csv (car args) (current-input-port)))
        ((and (null? (cdr args)) (port? (car args)))
          (read-csv #\, (car args)))
        ((and (pair? (cdr args)) (null? (cddr args)) (char? (car args)) (port? (cadr args)))
          (read-csv (car args) (cadr args)))
        (else (read-csv #\, (current-input-port)))))

We build the test input file in a string:

(define csv-test (string-append
  "1,abc,def ghi,jkl,unquoted character strings\n"
  "2,\"abc\",\"def ghi\",\"jkl\",quoted character strings\n"
  "3,123,456,789,numbers\n"
  "4, abc,def , ghi ,strings with whitespace\n"
  "5, \"abc\",\"def\" , \"ghi\" ,quoted strings with whitespace\n"
  "6, 123,456 , 789 ,numbers with whitespace\n"
  "7,TAB123,456TAB,TAB789TAB,numbers with tabs for whitespace\n"
  "8, -123, +456, 1E3,more numbers with whitespace\n"
  "9,123 456,123\"456, 123 456 ,strange numbers\n"
  "10,abc\",de\"f,g\"hi,embedded quotes\n"
  "11,\"abc\"\"\",\"de\"\"f\",\"g\"\"hi\",quoted embedded quotes\n"
  "12,\"\",\"\" \"\",\"\"x\"\",doubled quotes\n"
  "13,\"abc\"def,abc\"def\",\"abc\" \"def\",strange quotes\n"
  "14,,\"\", ,empty fields\n"
  "15,abc,\"def\n ghi\",jkl,embedded newline\n"
  "16,abc,\"def\",789,multiple types of fields\n"))

Then the test is performed by taking input from a string port:

> (with-input-from-string csv-test (lambda ()
    (do ((csv-record (read-csv-record) (read-csv-record)))
        ((eof-object? csv-record))
      (display (list-ref csv-record 0)) (display "|")
      (display (list-ref csv-record 1)) (display "|")
      (display (list-ref csv-record 2)) (display "|")
      (display (list-ref csv-record 3)) (display "|")
      (display (list-ref csv-record 4)) (newline))))
1|abc|def ghi|jkl|unquoted character strings
2|abc|def ghi|jkl|quoted character strings
3|123|456|789|numbers
4| abc|def | ghi |strings with whitespace
5| "abc"|def | "ghi" |quoted strings with whitespace
6| 123|456 | 789 |numbers with whitespace
7|TAB123|456TAB|TAB789TAB|numbers with tabs for whitespace
8| -123| +456| 1E3|more numbers with whitespace
9|123 456|123"456| 123 456 |strange numbers
10|abc"|de"f|g"hi|embedded quotes
11|abc"|de"f|g"hi|quoted embedded quotes
12|| ""|x""|doubled quotes
13|abcdef|abc"def"|abc "def"|strange quotes
14||| |empty fields
15|abc|def
  ghi|jkl|embedded newline
16|abc|def|789|multiple types of fields

The sample solution is available at http://programmingpraxis.codepad.org/8q3sQGzA, though it’s not possible to perform the test because the Scheme system at codepad.org doesn’t provide with-input-from-string.

About these ads

Pages: 1 2

5 Responses to “Comma-Separated Values”

  1. Connochaetes said

    A quick and dirty Ruby hack:

    def separated_vals(filename, separator = ‘,’)
    lines = Array.new
    sep = Regexp.compile(“\\s*\\”+separator+”\\s*”)

    File.open(filename) do |file_handle|
    lines << file_handle.gets.split(sep)
    end
    return lines
    end

  2. programmingpraxis said

    I don’t think that properly handles all the cases.

  3. Connochaetes said

    You’re quite right; it doesn’t quite work.

    Here’s a better version:

    def separated_vals(filename, separator = ‘,’)
    lines = Array.new

    File.open(filename) do |file_handle|
    while not file_handle.eof? do
    lines << file_handle.gets.strip.split(separator)
    end
    end
    return lines
    end

    This works for all test cases with one omission – it keeps quotes around quoted strings. Now, instead of hacking further, I think I’ll sit down and write a proper state machine :)

  4. FalconNL said

    Haskell (parses every test case correctly):

    import Control.Applicative ((<*), (*>), (<*>), (<$>))
    import Text.Parsec
    
    main = readFile "csv.txt" >>= print . parseCSV ','
    
    parseCSV sep = parse (sepBy (line sep) (many1 $ oneOf "\r\n")) ""
    line sep = sepBy (field sep) (char sep)
    field sep = option "" dropQuotes <+> (ws <+> (keepQuotes <|> plain sep)) <+> ws
    plain sep = many (noneOf $ sep:"\r\n")
    dropQuotes = try (between' "\"\"" quoted) <|> between' "\"" quoted
    keepQuotes = (string "\"" <+> quoted <+> string "\"")
    quoted = many (noneOf "\"" <|> (head <$> try (string "\"\"")))
    ws = many (oneOf " \t")
    a <+> b = (++) <$> a <*> b
    between' s p = string s *> p <* string s
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 611 other followers

%d bloggers like this: