#1 2020-04-12 16:30:07

Mr.Ed
Member
Registered: 2020-04-09
Posts: 2

Missing routine for CSV parsing

Hi ab, I am missing a routine for parsing a string containing CSV data.

I may get some pushback on this, since the SynCommons unit already provides a GetNextItem routine that returns the next CSV value from a string. This works well on a single line of CSV, which could also be handled with CSVToRawUTF8DynArray or the DelimitedText property of TStrings.

But you will struggle with parsing a file that has more than one record.

JSON content is not a problem, because line feeds and carriage returns inside a JSON string are escaped. In this case, you will be fine with the GetNextItemTrimedCRLF routine in SynCommons, which returns the next CSV record. With other content, however, this approach can fail, because a quoted value in CSV may itself contain a CR or LF (see RFC 4180, text/csv).

I guess it wouldn't be a good idea to first search for the end of a record (CRLF) in a CSV file and then parse that record – this would scan each record twice. But if a GetNextItemCRLF routine returned both the next value and whether that value ends a record, you could parse a file like this:

  // sample input: two tab-separated records; the first value is quoted and
  // embeds a line feed
  s := '"first'#10'row"'#9'2nd col'#13#10'2nd row'#9'end'#13#10;
  p := PUTF8Char(s);
  RecNo := 1;
  repeat
    GetNextItemCRLF(p, #9, '"', Value, @eol);
    {do something with Value}
    if eol then
      Inc(RecNo); // the value just read was the last one of its record
  // stop on exhausted input, on the #0 terminator or on a #26 (Ctrl-Z) char
  until (p = nil) or (p^ in [#0, #26]);

Besides, you could safely parse a cell range obtained via Clipboard.AsText after it was copied to the clipboard from LibreOffice Calc or MS Excel. It would be a small piece of code for parsing CSV, if you imagine how many CSV parsers out there take a sledgehammer to crack a nut.

A GetNextItemCRLF routine derived from GetNextItem and GetNextItemStringCRLF could look as follows:

/// extract the next unquoted value from P, stopping at Sep, LF or end of text
// - result receives the text between P and the delimiter, without a trailing
// CR (so that a CRLF line ending is trimmed as a whole)
// - eol^ (when eol is supplied) is set to false if the value was terminated
// by a Sep which is not a LF, and to true in every other case
// - P is moved just after the delimiter, or set to nil once the #0
// terminator has been reached; a nil P on input returns ''
procedure GetNextItemCRLF(var P: PUTF8Char; Sep: AnsiChar; var result: RawUTF8;
  eol: PBoolean = nil); overload;
var
  Stop, Last: PUTF8Char;
begin
  if Assigned(eol) then
    eol^ := true; // assume end of record until a separator proves otherwise
  if P = nil then
  begin
    result := '';
    exit;
  end;
  // locate the delimiter which terminates this value
  Stop := P;
  repeat
    if (Stop^ = #0) or (Stop^ = Sep) or (Stop^ = #10) then
      break;
    inc(Stop);
  until false;
  // do not include the CR of a CRLF pair in the returned value
  Last := Stop;
  if (Last > P) and ((Last - 1)^ = #13) then
    dec(Last);
  FastSetString(result, P, Last - P);
  if Stop^ = #0 then
  begin
    P := nil; // nothing left to parse
    exit;
  end;
  if Assigned(eol) and (Stop^ <> #10) then
    eol^ := false; // stopped on Sep: more values follow in this record
  P := Stop + 1;
end;

/// extract the next value from P, with support for values enclosed in Quote
// - if P^ is Quote, the value is decoded with UnQuoteSQLStringVar();
// otherwise the call is forwarded to the unquoted GetNextItemCRLF() overload
// - eol^ (when eol is supplied) is set to false if the value was followed by
// Sep, and to true in every other case
// - P is moved just after the delimiter which follows the value; it is set to
// nil when the end of the text was reached (same convention as the unquoted
// overload) or when UnQuoteSQLStringVar() returned nil, in which case result
// is ''
procedure GetNextItemCRLF(var P: PUTF8Char; Sep, Quote: AnsiChar;
  var result: RawUTF8; eol: PBoolean = nil); overload;
begin
  if eol<>nil then
    eol^ := true; // assume end of record until a separator proves otherwise
  if P=nil then
    result := ''
  else if P^=Quote then begin
    P := UnQuoteSQLStringVar(P,result);
    if P=nil then
      result := '' // the quoted value could not be decoded
    else begin
      if P^=#13 then
        inc(P); // skip the CR of a CRLF pair
      if P^=#0 then
        // end of text, possibly just after a trailing CR: never step over
        // the #0 terminator, and report exhaustion like the unquoted overload
        P := nil
      else begin
        if (P^=Sep) and (eol<>nil) then
          eol^ := false; // more values follow in this record
        inc(P); // skip the delimiter (Sep or LF)
      end;
    end;
  end else
    GetNextItemCRLF(P,Sep,result,eol);
end;

/// return the next unquoted value from P, stopping at Sep, LF or end of text
// - the returned string is the text between P and the delimiter, without a
// trailing CR (so that a CRLF line ending is trimmed as a whole)
// - eol^ (when eol is supplied) is set to false if the value was terminated
// by a Sep which is not a LF, and to true in every other case
// - P is moved just after the delimiter, or set to nil once the #0
// terminator has been reached; a nil P on input returns ''
function GetNextItemStringCRLF(var P: PChar; Sep: Char;
  eol: PBoolean = nil): String; overload;
var
  Stop, Last: PChar;
begin
  if Assigned(eol) then
    eol^ := true; // assume end of record until a separator proves otherwise
  if P = nil then
  begin
    result := '';
    exit;
  end;
  // locate the delimiter which terminates this value
  Stop := P;
  repeat
    if (Stop^ = #0) or (Stop^ = Sep) or (Stop^ = #10) then
      break;
    inc(Stop);
  until false;
  // do not include the CR of a CRLF pair in the returned value
  Last := Stop;
  if (Last > P) and ((Last - 1)^ = #13) then
    dec(Last);
  SetString(result, P, Last - P);
  if Stop^ = #0 then
  begin
    P := nil; // nothing left to parse
    exit;
  end;
  if Assigned(eol) and (Stop^ <> #10) then
    eol^ := false; // stopped on Sep: more values follow in this record
  P := Stop + 1;
end;

/// return the next value from P, with support for values enclosed in Quote
// - if P^ is Quote, the value is decoded with SysUtils.AnsiExtractQuotedStr();
// otherwise the call is forwarded to the unquoted GetNextItemStringCRLF()
// overload
// - eol^ (when eol is supplied) is set to false if the value was followed by
// Sep, and to true in every other case
// - P is moved just after the delimiter which follows the value, or set to
// nil when the end of the text was reached (same convention as the unquoted
// overload)
function GetNextItemStringCRLF(var P: PChar; Sep, Quote: Char;
  eol: PBoolean = nil): String; overload;
begin
  if eol<>nil then
    eol^ := true; // assume end of record until a separator proves otherwise
  if P=nil then
    result := ''
  else if P^=Quote then begin
    result := {SysUtils.}AnsiExtractQuotedStr(P,Quote);
    if P=nil then
      result := '' // defensive: no position left to continue from
    else begin
      if P^=#13 then
        inc(P); // skip the CR of a CRLF pair
      if P^=#0 then
        // end of text, possibly just after a trailing CR: never step over
        // the #0 terminator, and report exhaustion like the unquoted overload
        P := nil
      else begin
        if (P^=Sep) and (eol<>nil) then
          eol^ := false; // more values follow in this record
        inc(P); // skip the delimiter (Sep or LF)
      end;
    end;
  end else
    result := GetNextItemStringCRLF(P,Sep,eol);
end;

AnsiExtractQuotedStr was used for the String version since there was no matching version of UnQuoteSQLStringVar.

I would appreciate it if such a routine could be included in the SynCommons unit and in mormot.core.text, respectively.

Offline

Board footer

Powered by FluxBB