Cutting Fields From A CSV File

December 7, 2021

# csvcut.awk

function csvsplit(str, arr,     i,j,n,s,fs,qt) {
    # split comma-separated fields into arr; return number of fields in arr
    # fields surrounded by double-quotes may contain commas;
    #     doubled double-quotes represent a single embedded quote
    delete arr; s = "START"; n = 0; fs = ","; qt = "\""
    for (i = 1; i <= length(str); i++) {
        if (s == "START") {
            if (substr(str,i,1) == fs) { arr[++n] = "" }
            else if (substr(str,i,1) == qt) { j = i+1; s = "INQUOTES" }
            else { j = i; s = "INFIELD" } }
        else if (s == "INFIELD") {
            if (substr(str,i,1) == fs) {
                arr[++n] = substr(str,j,i-j); j = 0; s = "START" } }
        else if (s == "INQUOTES") {
            if (substr(str,i,1) == qt) { s = "MAYBEDOUBLE" } }
        else if (s == "MAYBEDOUBLE") {
            if (substr(str,i,1) == fs) {
                arr[++n] = substr(str,j,i-j-1)
                gsub(qt qt, qt, arr[n]); j = 0; s = "START" } } }
    if (s == "INFIELD" || s == "INQUOTES") { arr[++n] = substr(str,j) }
    else if (s == "MAYBEDOUBLE") {
        arr[++n] = substr(str,j,length(str)-j); gsub(qt qt, qt, arr[n]) }
    else if (s == "START") { arr[++n] = "" }
    return n }

BEGIN { # read and store output field names
    for (i=1; i<ARGC; i++) { fields[++nfields] = ARGV[i]; ARGV[i] = "" } }

NR == 1 { # read and store input field names, write output header
    for (i=1; i<=csvsplit($0,arr); i++) { names[arr[i]] = i }
    for (i=1; i<=nfields; i++) { printf "%s%s", sep, fields[i]; sep = "," }
    printf "\n" }

NR > 1 { # read input record, split fields, write output record
    delete csv; sep = ""; n = csvsplit($0, csv)
    for (i=1; i<=nfields; i++) {
        printf "%s%s", sep, csv[names[fields[i]]]; sep = "," }
    printf "\n" }
Advertisement

Pages: 1 2

2 Responses to “Cutting Fields From A CSV File”

  1. Using Pascal:

    program CSVSplitter;

    {$APPTYPE CONSOLE}
    {$R *.res}

    uses
    System.SysUtils, Classes;

    procedure Split(inputFileName, fieldName, outputFileName: String);
    var
    inputFile, outputFile: TextFile;
    header, line: String;
    columnList: TStringList;
    index: Integer;
    begin
    AssignFile(inputFile, inputFileName);
    columnList := TStringList.Create;
    try
    Reset(inputFile);
    if Not Eof(inputFile) then
    Readln(inputFile, header);

    columnList.CommaText := UpperCase(header);
    index := columnList.IndexOf(UpperCase(fieldName));
    if index >= 0 then
    begin
      AssignFile(outputFile, outputFileName);
      try
        ReWrite(outputFile);
        writeln(outputFile, fieldName);
    
        while Not Eof(inputFile) do
        begin
          Readln(inputFile, line);
          columnList.CommaText := line;
          writeln(outputFile, columnList[index]);
        end;
      except
        on E: Exception do
        begin
          writeln('Cannot write to file ' + outputFileName + ' -> ' + E.Message);
        end;
      end;
    end
    else
      writeln('Field ' + fieldName + ' cannot be found in file ' + inputFileName);
    

    except
    on E: Exception do
    begin
    writeln('Exception occurs during reading file ' + inputFileName + ' -> ' + E.Message);
    end;
    end;
    columnList.Free;
    CloseFile(inputFile);
    CloseFile(outputFile);
    end;

    begin
    if ParamCount = 3 then
    begin
    if FileExists(ParamStr(1)) then
    begin
    Split(ParamStr(1), ParamStr(2), ParamStr(3));
    end
    else
    writeln('File ' + ParamStr(1) + ' cannot be found.');
    end
    else
    writeln('Usage: ' + ExtractFileName(ParamStr(0)) + ' ');
    end.

  2. Richard A. O'Keefe said

    I found the use of the word “cut” and “field” instead of, say, “select” and “column”, confusing.
    Using Smalltalk, with my own CSV library, I’ve lightly tested this:

      (CSVDecoder on: StdIn) bindOwn: [:input |
        |header inverse map row|
        header := input next.
        inverse := Dictionary new: header size.
        header keysAndValuesDo: [:index :name |
          inverse at: name put: index].
        map := arguments collect: [:name | inverse at: name].
        row := Array new: map size.
        (CSVEncoder on: StdOut) bindOwn: [:output |
          output nextPut: arguments.
          input do: [:each |
            map keysAndValuesDo: [:index :origin |
              row at: index put: (each at: origin)].
            output nextPut: row]]].
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: