Fast Ansi/Unicode conversion

Sha · 2012-01-10 21:27:52

Hi, Arnaud.
Here is my module for fast conversion

//Sha 2012
unit SynConversionTables;

interface

uses
  Windows,  // GetACP
  SysUtils; // PWordArray

type
  PConversionTable= ^TConversionTable;
  // One Ansi<->Wide conversion table for a single code page.
  // Instances are filled by AddConversionTable and stored inside SynTables.
  TConversionTable= packed record
    // wide chars above 255 that have an Ansi code, stored in descending order
    // (the order FindAnsiChar's binary search relies on)
    WideToAnsiW: packed array[0..127] of word;
    // Ansi byte matching the entry of WideToAnsiW at the same index
    WideToAnsiA: packed array[0..127] of byte;
    // number of used entries in WideToAnsiW/WideToAnsiA
    WideToAnsiCount: integer;
    CodePage: integer;                     //table code page
    // source data the table was built from: wide chars for bytes 128, 129, ...
    pDataExt: pWordArray;
    // number of entries available in pDataExt (1..128)
    DataCount: integer;
    // wide char for every Ansi byte; bytes not covered by pDataExt map to themselves
    AnsiToWide:  packed array[0..255] of word;
    end;

var //READ ONLY VARS
  pSynTableDefault: PConversionTable= nil; //table for default code page
  pSynTable1252:    PConversionTable= nil; //win1252 Latin table
  pSynTable1251:    PConversionTable= nil; //win1251 Cyrillic table

type
  PtrInt= {$ifdef UNICODE} NativeInt {$else} integer {$endif};

//extended version of WinAnsiTableSortedFind
function FindAnsiChar(wc: cardinal; pTable: PConversionTable): PtrInt;

//get table by index
function GetSynTable(i: integer): PConversionTable;

//test of conversion tables
function TestSynTables: boolean;

implementation

const
  // Wide chars for win1252 bytes 128..159 (index 0 is byte 128).
  // Bytes 160..255 are not listed: AddConversionTable maps them to themselves.
  Win1252Ext: packed array[0..31] of word = (
    8364,  129, 8218,  402, 8222, 8230, 8224, 8225,  710, 8240,  352, 8249,  338,  141,  381,  143,
     144, 8216, 8217, 8220, 8221, 8226, 8211, 8212,  732, 8482,  353, 8250,  339,  157,  382,  376);
  // Wide chars for win1251 bytes 128..255 (index 0 is byte 128).
  // The last 64 entries are 1040..1103, i.e. bytes 192..255.
  Win1251Ext: packed array[0..127] of word = (
    1026, 1027, 8218, 1107, 8222, 8230, 8224, 8225, 8364, 8240, 1033, 8249, 1034, 1036, 1035, 1039,
    1106, 8216, 8217, 8220, 8221, 8226, 8211, 8212,  152, 8482, 1113, 8250, 1114, 1116, 1115, 1119,
     160, 1038, 1118, 1032,  164, 1168,  166,  167, 1025,  169, 1028,  171,  172,  173,  174, 1031,
     176,  177, 1030, 1110, 1169,  181,  182,  183, 1105, 8470, 1108,  187, 1112, 1029, 1109, 1111,
    1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
    1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071,
    1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087,
    1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103);

var
  // wide chars for bytes 128..255 of the default code page,
  // filled by MultiByteToWideChar in InitConversionTables when needed
  SynDefaultExt: packed array[0..127] of word;
  SynDefaultCP: integer= 0; //default code page
  // one inner array per registered code page; each inner array is the raw
  // storage of one TConversionTable (see AddConversionTable / GetSynTable)
  SynTables: array of array of word;

// Returns the Ansi code (128..255) of wide char wc in the code page described
// by pTable, or -1 when wc has no Ansi equivalent there.
// Only wide chars above 255 are stored in the searched table, so callers must
// handle wc<256 themselves (through pTable.AnsiToWide).
// A nil pTable is reported as "not found" instead of being dereferenced.
function FindAnsiChar(wc: cardinal; pTable: PConversionTable): PtrInt;
var
  Lo, Hi, Mid: PtrInt;
begin
  // no table: nothing can be converted (also prevents the win1251 fast path
  // below from matching when pSynTable1251 itself is still nil)
  if pTable=nil then begin
    Result:=-1;
    exit;
  end;
  // fast path for win1251: wide 1040..1071 map linearly to bytes 192..223
  if (pTable=pSynTable1251) and (cardinal(wc-1040)<32) then begin
    Result:=wc-848;
    exit;
  end;
  // binary search in WideToAnsiW, which is sorted in DESCENDING order:
  // on exit Hi is the first index whose wide char is below wc
  Hi:=pTable.WideToAnsiCount;
  Lo:=0;
  while Lo<Hi do begin
    Mid:=(Lo + Hi) shr 1;
    if wc>pTable.WideToAnsiW[Mid] then Hi:=Mid else Lo:=Mid + 1;
  end;
  // the candidate is the entry just before that position
  dec(Hi);
  if (Hi>=0) and (wc=pTable.WideToAnsiW[Hi])
  then Result:=pTable.WideToAnsiA[Hi]
  else Result:=-1;
end;

// Returns the i-th registered conversion table, or nil when i is out of range.
function GetSynTable(i: integer): PConversionTable;
begin
  Result:=nil;
  if (i<0) or (i>=Length(SynTables)) then exit;
  Result:=@SynTables[i,0];
end;

// Checks that every source wide char above 255 is found by FindAnsiChar at its
// own byte position (index+128).
// Returns the number of chars checked when all of them match; on the first
// mismatch returns minus the number of chars checked so far (failing one included).
function TestFindChars(pTable: PConversionTable): integer;
var
  idx, checked: integer;
  wide: word;
begin
  checked:=0;
  for idx:=0 to pTable.DataCount-1 do begin
    wide:=pTable.pDataExt[idx];
    if wide<=255 then continue; // chars below 256 are not part of the search table
    inc(checked);
    if FindAnsiChar(wide,pTable)<>idx+128 then begin
      Result:=-checked;
      exit;
    end;
  end;
  Result:=checked;
end;

// Counts how many wide chars in $100..$FFFF FindAnsiChar can convert with pTable.
function TestCountChars(pTable: PConversionTable): integer;
var
  wide: cardinal;
begin
  Result:=0;
  wide:=$100;
  repeat
    if FindAnsiChar(wide,pTable)>=0 then inc(Result);
    inc(wide);
  until wide>$FFFF;
end;

// Self-test of all registered tables: a table passes when every source char
// above 255 is found at its own position (TestFindChars>0) and no other wide
// char is found (TestCountChars gives the same number).
// Returns true when every table passes; all tables are always visited.
function TestSynTables: boolean;
var
  idx, found, counted: integer;
  tbl: PConversionTable;
begin
  Result:=true;
  for idx:=Low(SynTables) to High(SynTables) do begin
    tbl:=@SynTables[idx,0];
    found:=TestFindChars(tbl);
    counted:=TestCountChars(tbl);
    if (found<=0) or (counted<>found) then Result:=false;
  end;
end;

// Registers a conversion table for CodePage.
//   pDataExt  - wide chars for Ansi bytes 128, 129, ... (index 0 is byte 128)
//   DataCount - number of entries in pDataExt, 1..128
// Does nothing when the arguments are out of range or when a table for this
// code page is already registered. Also updates pSynTable1252, pSynTable1251
// and pSynTableDefault when CodePage matches.
procedure AddConversionTable(CodePage: integer; pDataExt: PWordArray; DataCount: integer);
var
  save: array[0..127] of cardinal;
  tmp: cardinal;
  i, len, max: integer;
  pTable: PConversionTable;
begin;
  if (CodePage<=0) or (DataCount<=0) or (DataCount>128) then exit;

  // skip code pages that are already registered
  len:=Length(SynTables);
  for i:=0 to len-1 do begin;
    pTable:=@SynTables[i,0];
    if pTable.CodePage=CodePage then exit;
    end;
  // append a new inner array of words large enough to hold one TConversionTable
  SetLength(SynTables,len+1);
  SetLength(SynTables[len], SizeOf(TConversionTable) div SizeOf(word));
  pTable:=@SynTables[len,0];
  pTable.CodePage:=CodePage;
  pTable.pDataExt:=pDataExt;
  pTable.DataCount:=DataCount;

  // start from the identity mapping; bytes covered by pDataExt are overwritten below
  for i:=0 to 255 do pTable.AnsiToWide[i]:=i;
  // count the wide chars above 255: only those go into the search table
  len:=0;
  for i:=0 to DataCount-1 do if pDataExt[i]>255 then inc(len);
  pTable.WideToAnsiCount:=len;
  len:=0;
  max:=0;
  for i:=DataCount-1 downto 0 do begin;
    pTable.AnsiToWide[i+128]:=pDataExt[i];
    if pDataExt[i]>255 then begin;
      // pack the pair as (wide shl 8) or ansi, so sorting the packed values
      // sorts by wide char
      save[len]:=integer(pDataExt[i]) shl 8 or (i+128);
      // remember where the largest packed value is
      if save[max]<save[len] then max:=len;
      inc(len);
      end;
    end;
  dec(len); // last index

  // insertion sort of save[0..len] in descending order;
  // the largest value is first swapped into save[0] so that it acts as a
  // sentinel: the inner repeat loop always stops before running below index 0
  tmp:=save[0]; save[0]:=save[max]; save[max]:=tmp;
  i:=1;
  while i<len do begin;
    inc(i);
    tmp:=save[i];
    if tmp>save[i-1] then begin;
      max:=i;
      repeat;
        save[max]:=save[max-1];
        dec(max);
        until tmp<=save[max-1];
      save[max]:=tmp;
      end;
    end;

  // unpack the sorted pairs into the two parallel search arrays
  for i:=0 to len do begin;
    pTable.WideToAnsiW[i]:=save[i] shr 8;
    pTable.WideToAnsiA[i]:=byte(save[i]);
    end;
  if CodePage=1252 then pSynTable1252:=pTable;
  if CodePage=1251 then pSynTable1251:=pTable;
  if CodePage=SynDefaultCP then pSynTableDefault:=pTable;
  end;

// Builds the built-in win1252 and win1251 tables and, when the system default
// Ansi code page is neither of them, a table for that code page as well.
// Returns true when a default table exists and all tables pass TestSynTables.
//
// The default table is only built for single-byte code pages: a table indexed
// by one Ansi byte cannot describe a multi-byte code page, and converting the
// 128 bytes in one call would then pair lead/trail bytes and yield a misaligned
// table. In that case pSynTableDefault stays nil and the function returns false.
function InitConversionTables: boolean;
var
  c: array[0..127] of byte;
  info: TCPInfo;
  i: integer;
begin
  SynDefaultCP:=GetACP;
  AddConversionTable(1252, @Win1252Ext[0], Length(Win1252Ext)); //Latin
  AddConversionTable(1251, @Win1251Ext[0], Length(Win1251Ext)); //Cyrillic
  if pSynTableDefault=nil then begin
    for i:=0 to 127 do c[i]:=i+128;
    // accept the result only if the code page is single-byte and exactly one
    // wide char was produced per input byte
    if GetCPInfo(SynDefaultCP,info) and (info.MaxCharSize=1)
    and (MultiByteToWideChar(SynDefaultCP,0,@c[0],128,@SynDefaultExt[0],128)=128)
    then AddConversionTable(SynDefaultCP, @SynDefaultExt[0], 128);
  end;
  Result:=(pSynTableDefault<>nil) and TestSynTables;
end;

// Releases all tables and resets the public table pointers and default code page.
procedure FinalConversionTables;
begin
  // drop the pointers first: they point into the storage released below
  pSynTable1251:=nil;
  pSynTable1252:=nil;
  pSynTableDefault:=nil;
  SynDefaultCP:=0;
  SetLength(SynTables,0);
end;

initialization

  InitConversionTables;

finalization

  FinalConversionTables;

end.

I suggest replacing WinAnsiTableSortedFind(wc) with FindAnsiChar(wc,pSynTable1252),
for example:

//Sha: new version
// Converts one wide char to its win1252 Ansi char.
// Returns a space when the char does not exist in the code page.
function WideCharToWinAnsiChar(wc: cardinal): AnsiChar;
begin
  result := ' '; // default answer for any char that cannot be converted
  if wc>255 then begin
    // chars above 255 are looked up in the sorted search table
    wc := FindAnsiChar(wc, pSynTable1252);
    if integer(wc)<0 then
      exit;
    result := AnsiChar(byte(wc));
  end else
  // chars below 256 are valid only if the byte maps to a wide char below 256
  // (e.g. #128 is not)
  if pSynTable1252.AnsiToWide[wc]<256 then
    result := AnsiChar(wc);
end;

//Sha: new version
// Converts one wide char to its win1252 Ansi code.
// Returns -1 when the char does not exist in the code page.
function WideCharToWinAnsi(wc: cardinal): integer;
begin
  if wc>=256 then begin
    // chars above 255 are looked up in the sorted search table
    result := FindAnsiChar(wc, pSynTable1252);
    exit;
  end;
  if pSynTable1252.AnsiToWide[wc]>=256 then
    result := -1 // invalid ansi char for this code page (e.g. #128)
  else
    result := wc;
end;

//Sha: new version
// Returns true when every char of the #0-terminated UTF-8 text can be stored in
// the win1252 code page. A nil pointer, or a text ending in the middle of a
// multi-byte sequence, is reported as true.
function IsWinAnsiU(UTF8Text: PUTF8Char): boolean;
var
  code: cardinal;
begin
  result := false;
  if UTF8Text<>nil then
    while true do begin
      code := byte(UTF8Text^);
      inc(UTF8Text);
      if code=0 then
        break; // end of text
      if code and $80=0 then
        continue; // 7 bit char: always valid
      if UTF8Text^=#0 then
        break; // text ends after a lead byte
      if code and $20=0 then begin
        // two byte sequence
        code := code shl 6+byte(UTF8Text^)-$00003080;
        inc(UTF8Text);
      end else begin
        // decoded as a three byte sequence
        code := code shl 6+byte(UTF8Text^);
        inc(UTF8Text);
        if UTF8Text^=#0 then
          break;
        code := code shl 6+byte(UTF8Text^)-$000E2080;
        inc(UTF8Text);
      end;
      if code>255 then begin
        if FindAnsiChar(code, pSynTable1252)<0 then
          exit; // invalid char in the WinAnsi code page
      end else
      if pSynTable1252.AnsiToWide[code]>255 then
        exit; // invalid char in the WinAnsi code page
    end;
  result := true;
end;

//Sha: new version
// Converts count bytes of UTF-8 text into win1252 chars written to dest.
// Returns the number of bytes stored in dest (no terminating #0 is appended).
//   - chars below 256 that are not valid in win1252 are stored as a space
//   - chars above 255 with no win1252 equivalent are skipped
//   - a sequence truncated by the end of the buffer is dropped
// Nothing is read or written when source is nil or count<=0.
function UTF8ToWinPChar(dest: PAnsiChar; source: PUTF8Char; count: integer): integer;
var c: cardinal;
    begd: PAnsiChar;
    endSource: PUTF8Char;
begin
  result := 0;
  // the loop below always consumes one byte before testing the end of the
  // buffer, so an empty input must be rejected here
  if (source=nil) or (count<=0) then exit;
  begd := dest;
  endSource := source+count;
  repeat
    c := byte(source^); inc(source);
    if byte(c) and $80=0 then begin
      // 7 bit char -> direct copy
      dest^ := AnsiChar(byte(c)); inc(dest);
      if source<endsource then continue else break;
    end else begin
      if source>=endsource then break;
      if c and $20=0 then begin
        // two byte sequence
        c := c shl 6+byte(source^)-$00003080; inc(source);
        if c and $ffffff00=0 then begin
          if pSynTable1252.AnsiToWide[c]>255 then
            dest^ := ' ' else // invalid char in the WinAnsi code page
            dest^ := AnsiChar(c);
          inc(dest);  // #128..#255 -> direct copy
          if source<endsource then continue else break;
        end;
      end else begin
        // decoded as a three byte sequence
        c := c shl 6+byte(source^); inc(source);
        if source>=endsource then break;
        c := c shl 6+byte(source^)-$000E2080; inc(source);
      end;
      // #256.. -> slower but accurate conversion
      c := FindAnsiChar(c, pSynTable1252);
      if integer(c)>=0 then begin
        dest^ := AnsiChar(Byte(c)); // don't add invalid wide char
        inc(dest);
      end;
      if source>=endsource then break;
    end;
  until false;
  result := dest-begd;
end;

//Sha: new version
// Converts a #0-terminated UTF-8 text into a win1252 shortstring.
// At most 255 chars are stored; chars that do not exist in win1252 are stored
// as '?'. A nil source gives an empty string; a text ending in the middle of a
// multi-byte sequence is truncated before that sequence.
procedure UTF8ToShortString(var dest: shortstring; source: PUTF8Char);
var c: cardinal;
    len: integer;
begin
  len := 0;
  if source<>nil then
  repeat
    c := byte(source^); inc(source);
    if c=0 then break else
    if c and $80=0 then begin
      // 7 bit char -> direct copy
      inc(len); dest[len] := AnsiChar(c);
      if len<255 then continue else break;
    end else begin
      if source^=#0 then break;
      if c and $20=0 then begin
        // two byte sequence
        c := c shl 6+byte(source^)-$00003080; inc(source);
      end else begin
        // decoded as a three byte sequence
        c := c shl 6+byte(source^); inc(source);
        if source^=#0 then break;
        c := c shl 6+byte(source^)-$000E2080; inc(source);
      end;
      inc(len);
      if c>255 then begin
        // #256.. -> slower but accurate conversion
        c := FindAnsiChar(c, pSynTable1252);
        if integer(c)<0 then
          c := ord('?');
      end else
      // #128..#255 -> direct copy when the byte is valid in the code page.
      // FindAnsiChar must not be used here: its table only holds chars above
      // 255, so it would turn every such char (e.g. U+00E9) into '?'
      if pSynTable1252.AnsiToWide[c]>255 then
        c := ord('?');
      dest[len] := AnsiChar(byte(c));
      if len<255 then continue else break;
    end;
  until false;
  dest[0] := AnsiChar(len);
end;

//Sha: new version
// Converts WideCharCount wide chars from source into win1252 chars in dest,
// one output char per input char. Chars that do not exist in win1252 are
// stored as a space.
procedure RawUnicodeToWinPChar(dest: PAnsiChar; source: PWideChar; WideCharCount: Integer);
var idx: integer;
    code: integer;
    ch: AnsiChar;
begin
  idx := 0;
  while idx<WideCharCount do begin
    code := integer(source[idx]);
    ch := ' '; // space for any char that cannot be converted
    if code<256 then begin
      // valid only if the byte maps to a wide char below 256
      if pSynTable1252.AnsiToWide[code]<256 then
        ch := AnsiChar(code);
    end else begin
      code := FindAnsiChar(code, pSynTable1252);
      if integer(code)>=0 then
        ch := AnsiChar(byte(code));
    end;
    dest[idx] := ch;
    inc(idx);
  end;
end;

It is easy to create new fast general-purpose functions that do not need the Windows API,
just by adding a table parameter, for example:

//Sha: new function
// Converts one wide char to its Ansi char in the code page described by pSynTable.
// Returns a space when the char does not exist in that code page.
function WideCharToSynAnsiChar(wc: cardinal; pSynTable: PConversionTable): AnsiChar;
begin
  result := ' '; // default answer for any char that cannot be converted
  if wc>255 then begin
    // chars above 255 are looked up in the sorted search table
    wc := FindAnsiChar(wc, pSynTable);
    if integer(wc)<0 then
      exit;
    result := AnsiChar(byte(wc));
  end else
  // chars below 256 are valid only if the byte maps to a wide char below 256
  if pSynTable.AnsiToWide[wc]<256 then
    result := AnsiChar(wc);
end;

Unit has internal full self-test:

//how to validate all tables in use
// Runs the self-test of all registered conversion tables and logs the verdict.
procedure TForm1.bValidateClick(Sender: TObject);
const
  Verdict: array[boolean] of string= ('failed', 'passed');
var
  ok: boolean;
begin
  ok:=TestSynTables;
  Memo1.Lines.Add('Test of conversion tables ' + Verdict[ok]);
end;

It is easy to add support for other code pages.
Just copy/paste data from TMemo to the unit.

//how to fill your default table
// Dumps the wide chars of bytes 128..255 of the default Ansi code page into
// Memo1, 16 values per line, each line prefixed with the first byte value in
// braces, so that the output can be pasted into the unit as a data table.
procedure TForm1.bShowWideClick(Sender: TObject);
var
  bytes: array[0..127] of byte;
  wide: array[0..127] of word;
  k, row, base: integer;
begin
  for k:=Low(bytes) to High(bytes) do bytes[k]:=k+128;
  MultiByteToWideChar(GetACP,0,@bytes[0],128,@wide[0],128);
  // 8 rows of 16 values
  for row:=0 to 7 do begin
    base:=row*16;
    Memo1.Lines.Add(Format('{%d:} %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, ',
                          [base+128,
                           wide[base+0], wide[base+1], wide[base+2], wide[base+3],
                           wide[base+4], wide[base+5], wide[base+6], wide[base+7],
                           wide[base+8], wide[base+9], wide[base+10],wide[base+11],
                           wide[base+12],wide[base+13],wide[base+14],wide[base+15]]));
  end;
end;

ab · 2012-01-23 07:26:09

Thanks.

I'll see how to add this without breaking the existing features.

But IMHO the Windows APIs are not so slow, when it deals with CP 1251 and such.
All framework core is already optimized for UTF-8 encoding, so those conversions will take place only before calling the UI part.
So I suspect the current implementation is not slow.

Sha · 2012-01-23 17:56:46

I have tested some new functions. They are faster than framework's ones even on WinAnsiStrings.
I hope I will write full set of Ansi/Unicode/UTF8 conversions in a week.

Last edited by Sha (2012-01-23 18:03:16)

Sha · 2012-02-04 18:45:44

Hi

New unicode functions are here
Some comments (in Russian) are here

Timings at E6850, table headers:
Charset 1251 - russian text
Charset 1252 - english (ASCII) text
Charset 1252* - english (ASCII) text, but first char is russian
Sha - functions from ShaUnicode.pas
Syn - functions from SynCommons.pas
WinAnsi - special functions from SynCommons.pas for WinAnsiString

   AnsiToUnicode time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         563   562   547
WinAnsi           766
Syn         734   734   735

   UnicodeToAnsi time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         500   531   515
WinAnsi           938
Syn         828  1703   813

   UnicodeToUtf8 time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         563   422   547
Syn         734   703   719

   AnsiToUtf8 time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         594   219   594
WinAnsi           593
Syn        1469  1438  1453

   Utf8ToUnicode time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         547   594   563
Syn         671   641   672

   Utf8ToAnsi time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         687   219   703
WinAnsi           594
Syn        1047  1015  1032

Results at i5-2300

   AnsiToUnicode time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         468   452   468
WinAnsi       0   718     0
Syn         686   702   702

   UnicodeToAnsi time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         390   375   390
WinAnsi       0   936     0
Syn         982  1685   983

   UnicodeToUtf8 time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         530   406   530
Syn         702   702   702

   AnsiToUtf8 time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         562   171   562
WinAnsi       0   546     0
Syn        1404  1388  1404

   Utf8ToUnicode time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         499   530   499
Syn         609   608   609

   Utf8ToAnsi time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         655   218   640
WinAnsi       0   562     0
Syn        1201  1185  1186

Some bugs fixed in ShaUnicode.pas 2012-02-05

Last edited by Sha (2012-02-06 10:38:17)

ab · 2012-02-06 09:16:14

Thanks a lot for sharing your code!

Some remarks:
- ShaAnsiToUnicode() will be correct only for 7-bit ASCII - so I guess this is about Ansi7ToString(), which is not often called, so I did not modify it;
- I've updated RawUnicodeToUtf8(), WinAnsiBufferToUtf8(), UTF8ToWideChar(), UTF8ToWinPChar() to handle any trailing 7 bit ASCII AnsiChars, by pairs - this is a very nice trick in practice;
- I did not introduce CP 1252 specific optimization yet, since I'd like to implement a clean class-driven approach here - I've added it to the mORMot roadmap.

See http://synopse.info/fossil/info/4be9156a97

Sha · 2012-02-06 10:32:39

ab wrote:

Thanks a lot for sharing your code!
Some remarks:
- ShaAnsiToUnicode() will be correct only for 7-bit ASCII - so I guess this is about Ansi7ToString(), which is not often called, so I did not modify it;
- I've updated RawUnicodeToUtf8(), WinAnsiBufferToUtf8(), UTF8ToWideChar(), UTF8ToWinPChar() to handle any trailing 7 bit ASCII AnsiChars, by pairs - this is a very nice trick in practice;
- I did not introduce CP 1252 specific optimization yet, since I'd like to implement a clean class-driven approach here - I've added it to the mORMot roadmap.
See http://synopse.info/fossil/info/4be9156a97

- ShaAnsiToUnicode() uses a pointer to the translation table, as do all the Ansi functions in ShaUnicode.pas, so it transforms any Ansi character correctly. You just need to call the function with pConvLatin or pConvDefault as the second parameter.

- I think it is simple to use new unicode functions from ShaUnicode.pas changing *all* calls in SynCommons.pas:
WinAnsiToRawUnicode/StringToRawUnicode/... --> ShaAnsiToUnicode
RawUnicodeToWinAnsi/RawUnicodeToString/... --> ShaUnicodeToAnsi
WinAnsiToUTF8/AnsiCharToUTF8/... --> ShaAnsiToUTF8
RawUnicodeToUTF8/... --> ShaUnicodeToUTF8
UTF8DecodeToRawUnicode/... --> ShaUTF8ToUnicode
UTF8ToString/... --> ShaUTF8ToAnsi

- Main optimization for win1252 and user default code page is done in ShaUnicode.pas. Of course, we always may do more.

ab · 2012-02-06 12:13:46

Sha wrote:

- ShaAnsiToUnicode() uses a pointer to the translation table, as do all the Ansi functions in ShaUnicode.pas, so it transforms any Ansi character correctly. You just need to call the function with pConvLatin or pConvDefault as the second parameter.

Indeed, I overlooked the code and did not see the 64 KB of lookup table defined.

The upcoming TSynAnsiConvert class will use similar tables.

ab · 2012-02-08 16:27:46

That's it.

I've done a huge code refactoring to include a generic optimized way of converting Ansi content using several charsets/codepages.
I tried to add all your speed ups (if not too specific) into the framework.

I've created two new TSynAnsiConvert and TSynAnsiFixedWidth classes, able to process Unicode to/from Ansi conversion in all possible code pages, with generic access methods and optimized handling of fixed width encodings.

See http://synopse.info/fossil/info/fef5fd8640

Code page 1251 will now be as fast as possible.
Due to this code refactoring, e.g. in SQLite3i18n, some methods have disappeared.

Sha · 2012-02-08 19:57:32

Thanks!
I will test it tomorrow.

One question.
Why TSynAnsiConvert.AnsiBufferToRawUTF8 calls Getmem, but RawUnicodeToUtf8 doesn't ?

Last edited by Sha (2012-02-08 20:12:19)

ab · 2012-02-09 08:19:17

It is just an implementation pattern.

GetMem() + FreeMem() with no try..finally blocks "could" be a little less expensive than a local string allocation, which always generates an implicit try...finally block by the compiler.
So if the stack buffer is used, there won't be any try...finally block generated.
Such a trick should be used only if you are sure that no exception will be raised within the process - or you may leak memory (FreeMem is not called if an exception is raised).

Nothing definitive. Just something I wanted to code like that.

Sha · 2012-02-09 08:46:12

Yes, I see.
The question is why the framework uses SetString (buffered result) in one case and SetLength (unbuffered result) in another case?

Last edited by Sha (2012-02-09 08:57:19)

ab · 2012-02-09 09:45:09

SetString() will create a new allocation buffer, whereas SetLength() will reuse an existing one.
It means that SetString() will never move data, whereas SetLength() could, if the string was already containing something.

Most of the time, we will overwrite the string content, so move() the data is just a time waste.
In case of a function returning a string, the "result" variable is in fact passed by reference: that is, the previous content is available. Therefore, SetLength() of a huge result could unnecessarily move a lot of data.

That's why I try:
- either to set the string to '' before SetLength()
- or to use SetString() with an existing buffer or with nil, which will in fact set it to '' and then call SetLength()

I've posted this on the blog - see http://blog.synopse.info/post/2012/02/0 … -SetString

Sha · 2012-02-09 11:14:35

Yes, I see :-)
But the question is
why sometimes the framework calculates data in the buffer and then copies data to result string,
and sometimes it calculates data in the result string and then reallocates it?

ab · 2012-02-09 12:47:30

Due to FastMM4 (the memory manager) implementation, reallocation is sometimes a no cost operation (it is a so-called in-place reallocation).

The safer is IMHO to use SetString() or set to '' then call SetLength().

Calculating data in the result string is not so bad, since it won't create any temporary try..finally block, whereas using a stack-based buffer will.

Like always, profiling on real application will make the difference, here...
But I suspect that most process will use the fixed stack-based temporary memory (for a text with up to 255 WideChars), and won't require any heap allocation.

About the new TSynAnsiConvert classes, what is nice with it is that it centralizes all OS-specific API calls within the class.
This will make conversion to Mac OS X (or Linux) easier.

Sha · 2012-02-09 18:41:51

Speed tests (2 times each test)
of new ShaUnicode functions (unbuffered version 2012-02-09)
and new framework functions at E6850

   AnsiToUnicode time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         562   563   562
WinAnsi           735
Syn         734   735   734
Sha         562   563   547
WinAnsi           734
Syn         750   735   734

   UnicodeToAnsi time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha         500   531   531
WinAnsi           813
Syn        1109  1109  1110
Sha         500   531   531
WinAnsi           813
Syn        1109  1109  1110

   UnicodeToUtf8 time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha        2266   359   485
Syn        2734   360   718
Sha        2266   359   469
Syn        2734   375   704

   AnsiToUtf8 time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha        2547   140   485
WinAnsi           218
Syn        3422   203   329
Sha        2546   141   484
WinAnsi           219
Syn        3422   203   328

   Utf8ToUnicode time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha        2109   453   406
Syn        2500   516   687
Sha        2110   453   406
Syn        2500   516   687

   Utf8ToAnsi time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha        2297   141   609
WinAnsi           219
Syn        3047   218   329
Sha        2250   156   609
WinAnsi           203
Syn        3063   219   328

Code page 1252 is very fast, but 1251 is not.

ab · 2012-02-09 19:03:04

The code for CP 1251 and 1252 is exactly the same: both use the TSynAnsiFixedWidth class, and the same algorithms.

I guess something is wrong... somewhere.

Could you provide some sample code?

Here are my tests results:

CodePage 1250 UTF8:95.83ms Unicode:44.02ms
CodePage 1251 UTF8:95.97ms Unicode:44.15ms
CodePage 1252 UTF8:94.41ms Unicode:43.99ms
CodePage 1253 UTF8:94.16ms Unicode:43.78ms
CodePage 1254 UTF8:94.15ms Unicode:44.53ms
CodePage 1255 UTF8:94.13ms Unicode:44.13ms
CodePage 1256 UTF8:94.27ms Unicode:44.02ms
CodePage 1257 UTF8:94.57ms Unicode:43.78ms

CodePage 1250 UTF8:94.39ms Unicode:44.08ms
CodePage 1251 UTF8:94.31ms Unicode:44.14ms
CodePage 1252 UTF8:94.49ms Unicode:44.00ms
CodePage 1253 UTF8:94.13ms Unicode:43.92ms
CodePage 1254 UTF8:94.22ms Unicode:43.95ms
CodePage 1255 UTF8:94.31ms Unicode:43.78ms
CodePage 1256 UTF8:94.94ms Unicode:44.09ms
CodePage 1257 UTF8:94.42ms Unicode:43.81ms

With the following code, which will convert, for all those code pages, some random string to/from UTF8/Unicode.
Each time, 10001 strings of 0..1250 characters long are converted.

// Benchmark: for each code page 1250..1257, converts 10001 random strings
// (0..1250 chars long) Ansi->UTF-8->Ansi and Ansi->Unicode->Ansi, checks the
// round trip and prints the elapsed time of each pass.
//
// The conversions are executed outside of Assert(): when assertions are
// disabled the compiler drops the whole Assert() call including its argument,
// so conversions placed inside it would not be executed (nor timed) at all.
procedure TestCP;
var T: TPrecisionTimer;
    C: TSynAnsiConvert;
    CP,i: Integer;
    ST: array[0..10000] of RawByteString;
    back: RawByteString; // round trip result
begin
  for i := 0 to high(ST) do
    ST[i] := TSynTestCase.RandomString(i shr 3);
  for CP := 1250 to 1257 do begin
    write('CodePage ',CP,' UTF8:');
    C := TSynAnsiConvert.Engine(CP);
    T.Start;
    for i := 0 to high(ST) do begin
      back := C.UTF8ToAnsi(C.AnsiToUTF8(ST[i]));
      Assert(back=ST[i]);
    end;
    Write(T.Stop,' Unicode:');
    T.Start;
    for i := 0 to high(ST) do begin
      back := C.RawUnicodeToAnsi(C.AnsiToRawUnicode(ST[i]));
      Assert(back=ST[i]);
    end;
    Writeln(T.Stop);
  end;
end;

You can see that the conversion speed is very consistent.
I guess there is something wrong in your test code.

Sha · 2012-02-09 20:20:19

I have used russian and english texts. Very simple tests. Some of them are here.

var
  //137 chars
  s1251: RawByteString='Лишь годные дятлы собираются в стаи, юникодом пугая мозги января. Их песни не стихнут, они не устанут. А елка как кактус беспокоит меня. ';
  s1252: RawByteString='Only woodpeckers gather in flights, they frighten with unicode the brains of January. They were not tired, their songs will not abate.   ';
  a1251, a1252, a255: RawByteString;
  u1251, u1252, u255: RawUnicode;
  t1251, t1252, t255: RawUTF8;

  Ticks: array[0..18] of cardinal;
  TicksLast: integer;
  Iterations: integer;

// Stores the current tick count in the next free slot of Ticks (if any is left)
// and returns the loop counter to use for the next measurement.
function NextTimer: integer;
begin
  Result:=Iterations;
  inc(TicksLast);
  if TicksLast>High(Ticks) then exit; // no slot left: only count the call
  Ticks[TicksLast]:=GetTickCount;
end;

// Restarts the measurement series: the tick count taken here goes into Ticks[0].
// Returns the loop counter to use for the first measurement.
function FirstTimer: integer;
begin
  TicksLast:=Low(Ticks)-1; // NextTimer increments before storing
  Result:=NextTimer;
end;

//--------------------------------------------------------------------------------------------------
// Prepares the benchmark data: builds the long Ansi test strings, derives their
// Unicode and UTF-8 forms, and checks that the Sha* conversions round-trip on
// the 1251 text.
procedure TForm1.FormCreate(Sender: TObject);
var
  pass: integer;
  roundTripOk: boolean;
begin
  Iterations:=10000;

  // 8 doublings plus one more copy = 257 copies of each sample (35209 chars)
  a1251:=s1251;
  a1252:=s1252;
  for pass:=1 to 8 do begin
    a1251:=a1251 + a1251;
    a1252:=a1252 + a1252;
  end;
  a1251:=a1251 + s1251;
  a1252:=a1252 + s1252;

  // same as the 1252 text, but with a non-ASCII first char
  a255:=a1252;
  a255[1]:=#255;

  u1251:=ShaAnsiToUnicode(a1251,pConvDefault);
  u1252:=ShaAnsiToUnicode(a1252,pConvDefault);
  u255 :=ShaAnsiToUnicode(a255, pConvDefault);

  t1251:=ShaUnicodeToUTF8(u1251);
  t1252:=ShaUnicodeToUTF8(u1252);
  t255 :=ShaUnicodeToUTF8(u255);

  roundTripOk:=(ShaUnicodeToAnsi(u1251,pConvDefault)=a1251)
           and (ShaUTF8ToUnicode(t1251)=u1251)
           and (ShaAnsiToUTF8(a1251,pConvDefault)=t1251)
           and (ShaUTF8ToAnsi(t1251,pConvDefault)=a1251);
  if not roundTripOk then ShowMessage('Error in FormCreate');
end;

//--------------------------------------------------------------------------------------------------
// Appends one result table to Memo1: a header with Title, then six rows
// (Sha, WinAnsi, Syn - twice), each showing three consecutive differences of
// the values stored in Ticks[0..18].
procedure TForm1.ShowResult(const Title: string);
const
  RowFormat: array[0..2] of string= (
    'Sha       %5d %5d %5d',
    'WinAnsi   %5d %5d %5d',
    'Syn       %5d %5d %5d');
var
  row, first: integer;
begin
  Memo1.Lines.Add('');
  Memo1.Lines.Add('   ' + Title + ' time, ms');
  Memo1.Lines.Add('============================');
  Memo1.Lines.Add('                Charset');
  Memo1.Lines.Add('Functions  1251  1252  1252*');
  Memo1.Lines.Add('----------------------------');
  for row:=0 to 5 do begin
    first:=row*3; // each row consumes three timer intervals
    Memo1.Lines.Add(Format(RowFormat[row mod 3],[Ticks[first+1]-Ticks[first],
                                                 Ticks[first+2]-Ticks[first+1],
                                                 Ticks[first+3]-Ticks[first+2]]));
  end;
end;

//--------------------------------------------------------------------------------------------------
//--------------------------------------------------------------------------------------------------
// Benchmark of Ansi -> UTF-8 conversion.
// One pass runs nine timed segments, in the order expected by ShowResult:
// Sha (1251, 1252, 1252*), WinAnsi (1251, 1252, 1252*), Syn (1251, 1252, 1252*).
// Each segment loops Iterations times and ends with NextTimer, which stores the
// tick count in the next slot of Ticks. The whole pass is executed twice
// (TicksLast is 9 after the first pass and 18 after the second).
procedure TForm1.bAnsiToUtf8Click(Sender: TObject);
var
  t: RawUTF8;
  i: integer;
begin;
  i:=FirstTimer;
  repeat;

    // Sha functions, two conversions per iteration
    repeat;
      t:=ShaAnsiToUtf8(a1251,pConvDefault);
      t:=ShaAnsiToUtf8(a1251,pConvDefault);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      t:=ShaAnsiToUtf8(a1252,pConvDefault);
      t:=ShaAnsiToUtf8(a1252,pConvDefault);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      t:=ShaAnsiToUtf8(a255,pConvDefault);
      t:=ShaAnsiToUtf8(a255,pConvDefault);
      dec(i); until i=0; i:=NextTimer;

    // WinAnsi functions: only the 1252 text is converted; the two empty loops
    // keep the timer slots of the 1251 and 1252* columns aligned
    repeat;
      dec(i); until i=0; i:=NextTimer;
    repeat;
      t:=WinAnsiToUtf8(a1252);
      t:=WinAnsiToUtf8(a1252);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      dec(i); until i=0; i:=NextTimer;

    // Syn functions: code page 1251 is passed for all three texts
    repeat;
      AnsiCharToUTF8(pointer(a1251),length(a1251),t,1251);
      AnsiCharToUTF8(pointer(a1251),length(a1251),t,1251);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      AnsiCharToUTF8(pointer(a1252),length(a1252),t,1251);
      AnsiCharToUTF8(pointer(a1252),length(a1252),t,1251);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      AnsiCharToUTF8(pointer(a255),length(a255),t,1251);
      AnsiCharToUTF8(pointer(a255),length(a255),t,1251);
      dec(i); until i=0; i:=NextTimer;

    until TicksLast>9;
  ShowResult('AnsiToUtf8');
  end;

//--------------------------------------------------------------------------------------------------
// Button handler: benchmarks UTF-8 -> ANSI conversion and prints the timings
// through ShowResult('Utf8ToAnsi').
// The outer loop is split into nine timed slots (3 implementations x 3 input
// strings: t1251, t1252, t255). Every slot counts i down to zero and then
// fetches a new count from NextTimer.
// NOTE(review): FirstTimer, NextTimer and TicksLast are declared outside this
// excerpt; presumably NextTimer stores a tick stamp for the finished slot and
// returns the iteration count for the next one - confirm against their source.
procedure TForm1.bUTF8ToAnsiClick(Sender: TObject);
var
  t: AnsiString;
  i: integer;
begin;
  i:=FirstTimer;
  repeat;

    // Slots 1-3: ShaUTF8ToAnsi with the pConvDefault table.
    // Each iteration converts twice, so one countdown step = two conversions.
    repeat;
      t:=ShaUTF8ToAnsi(t1251,pConvDefault);
      t:=ShaUTF8ToAnsi(t1251,pConvDefault);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      t:=ShaUTF8ToAnsi(t1252,pConvDefault);
      t:=ShaUTF8ToAnsi(t1252,pConvDefault);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      t:=ShaUTF8ToAnsi(t255,pConvDefault);
      t:=ShaUTF8ToAnsi(t255,pConvDefault);
      dec(i); until i=0; i:=NextTimer;

    // Slots 4-6: Utf8ToWinAnsi is only measured for t1252. The first and third
    // slots are empty countdowns (no conversion call), presumably kept so the
    // slot numbering stays aligned with the other rows - confirm in ShowResult.
    repeat;
      dec(i); until i=0; i:=NextTimer;
    repeat;
      t:=Utf8ToWinAnsi(t1252);
      t:=Utf8ToWinAnsi(t1252);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      dec(i); until i=0; i:=NextTimer;

    // Slots 7-9: Utf8ToString, with the result assigned to an AnsiString.
    repeat;
      t:=Utf8ToString(t1251);
      t:=Utf8ToString(t1251);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      t:=Utf8ToString(t1252);
      t:=Utf8ToString(t1252);
      dec(i); until i=0; i:=NextTimer;
    repeat;
      t:=Utf8ToString(t255);
      t:=Utf8ToString(t255);
      dec(i); until i=0; i:=NextTimer;

    // Repeat the whole nine-slot pass until more than 9 slots were recorded.
    until TicksLast>9;
  ShowResult('Utf8ToAnsi');
  end;

ab · 2012-02-09 20:40:29

Your code is a bit difficult to follow.
And I do not understand lines like this:

AnsiCharToUTF8(pointer(a1252),length(a1252),t,1251);

... 1252 or 1251?

In all cases, GetTickCount is not a very good idea for benchmarking.
You should better use a high resolution timer (like our TPrecisionTimer object).

You should better not use AnsiCharToUTF8() but e.g. directly CurrentAnsiConvert.AnsiToUTF8(a1251) if the current ansi code page is 1251.

Better speed will be achieved with AnsiToUTF8/UTF8ToAnsi and RawUnicodeToAnsi/AnsiToRawUnicode methods.
Those are the direct entry points of the framework.

Sha · 2012-02-09 20:46:34

My default code page is 1251. It can be used for test with all russian and english (ASCII) texts.

I think 15 ms resolution of GetTickCount is sufficient in our case.

Changing to CurrentAnsiConvert.AnsiToUTF8()/UTF8ToAnsi() shows the same speed.

   AnsiToUtf8 time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha        2547   140   594
WinAnsi       0   203     0
Syn        3422   203   313
Sha        2547   140   594
WinAnsi       0   203     0
Syn        3422   203   313

   Utf8ToAnsi time, ms
============================
                Charset
Functions  1251  1252  1252*
----------------------------
Sha        2297   141   609
WinAnsi       0   219     0
Syn        3047   203   344
Sha        2234   156   610
WinAnsi       0   203     0
Syn        3062   203   329

Last edited by Sha (2012-02-09 21:15:33)

ab · 2012-02-10 06:11:05

You are not measuring the same content.
So your benchmark is a bit difficult to read.

If you use plain random text, there is no diff between the code pages in our implementation.
It is very stable, and can handle any kind of content - it is always better to benchmark with random data instead of fixed data.
Of course, best pattern would be indeed real data (e.g. from a big text, like a whole Bible) in the real language.
The sample test I provide above (easy to follow) proves it.

Sha · 2012-02-10 07:19:47

I use texts of the same length in both languages (Russian and English). The measurements differ by a factor of 15.

So in real applications for long texts it is better to use win1251 coding but not utf8. It makes difficult using of JSON in my case.

ab · 2012-02-10 16:16:32

With the following code:

// Round-trip benchmark over code pages 1250..1257.
// For every code page, the same 10001 random strings are converted
// ANSI -> UTF-8 -> ANSI and ANSI -> UTF-16 -> ANSI with three converters:
//   - a TSynAnsiConvert instance built with Create(CP)   (printed as "WinAPI"),
//   - the Sha* routines using the table from GetShaConvTable(CP),
//   - the instance returned by TSynAnsiConvert.Engine(CP) (printed as "Syn").
// Each round trip is checked with Assert, and the elapsed time reported by
// TPrecisionTimer is written to the console.
// NOTE(review): the measured work lives inside Assert(); if assertions are
// compiled out ({$C-}) nothing would be executed or timed.
procedure TestCP;
var T: TPrecisionTimer;
    C: TSynAnsiConvert;
    CP,i: Integer;
    ST: array[0..10000] of RawByteString;
    Sha: PConvTable;
begin
  // Test data: string number i has i shr 3 characters (0..1250).
  for i := 0 to high(ST) do
    ST[i] := TSynTestCase.RandomString(i shr 3);
  for CP := 1250 to 1257 do begin
    writeln('CodePage ',CP);
    Write('      WinAPI UTF8:');
    C := TSynAnsiConvert.Create(CP);
    try // release the locally created converter even if an Assert raises
      T.Start;
      for i := 0 to high(ST) do
        Assert(C.UTF8ToAnsi(C.AnsiToUTF8(ST[i]))=ST[i]);
      Write(T.Stop,' Unicode:');
      T.Start;
      for i := 0 to high(ST) do
        Assert(C.RawUnicodeToAnsi(C.AnsiToRawUnicode(ST[i]))=ST[i]);
      Writeln(T.Stop);
    finally
      C.Free;
    end;
    Sha := GetShaConvTable(CP);
    write('         Sha UTF8:');
    T.Start;
    for i := 0 to high(ST) do
      Assert(ShaUTF8ToAnsi(ShaAnsiToUTF8(ST[i],Sha),Sha)=ST[i]);
    Write(T.Stop,' Unicode:');
    T.Start;
    for i := 0 to high(ST) do
      Assert(ShaUnicodeToAnsi(ShaAnsiToUnicode(ST[i],Sha),Sha)=ST[i]);
    writeln(T.Stop);
    write('         Syn UTF8:');
    // Not freed here, as in the original code; presumably Engine() returns an
    // instance owned by the framework - confirm before adding a Free.
    C := TSynAnsiConvert.Engine(CP);
    T.Start;
    for i := 0 to high(ST) do
      Assert(C.UTF8ToAnsi(C.AnsiToUTF8(ST[i]))=ST[i]);
    Write(T.Stop,' Unicode:');
    T.Start;
    for i := 0 to high(ST) do
      Assert(C.RawUnicodeToAnsi(C.AnsiToRawUnicode(ST[i]))=ST[i]);
    Writeln(T.Stop);
  end;
end;

I noticed that your routines are a bit faster than mine.
But also that it works only with (CP=1250) or (CP=1251) or (CP=1254) or (CP=1256).
I've got access violation otherwise. I suspect there are some issues in your UTF8 conversion code.

Here are the scores with random text (i.e. a more aggressive test than with true text):

CodePage 1250
      WinAPI UTF8:117.78ms Unicode:44.48ms
         Sha UTF8:79.80ms Unicode:18.98ms
         Syn UTF8:89.67ms Unicode:25.75ms
CodePage 1251
      WinAPI UTF8:116.97ms Unicode:44.54ms
         Sha UTF8:80.15ms Unicode:18.73ms
         Syn UTF8:90.04ms Unicode:25.75ms
CodePage 1252
      WinAPI UTF8:116.15ms Unicode:43.64ms
         Syn UTF8:89.62ms Unicode:25.63ms
CodePage 1253
      WinAPI UTF8:116.59ms Unicode:44.40ms
         Syn UTF8:89.23ms Unicode:25.98ms
CodePage 1254
      WinAPI UTF8:117.03ms Unicode:44.83ms
         Sha UTF8:79.82ms Unicode:18.66ms
         Syn UTF8:89.49ms Unicode:25.88ms
CodePage 1255
      WinAPI UTF8:117.06ms Unicode:44.42ms
         Syn UTF8:89.54ms Unicode:25.64ms
CodePage 1256
      WinAPI UTF8:116.54ms Unicode:44.47ms
         Sha UTF8:80.01ms Unicode:18.83ms
         Syn UTF8:88.97ms Unicode:25.94ms
CodePage 1257
      WinAPI UTF8:116.63ms Unicode:44.64ms
         Syn UTF8:89.59ms Unicode:26.04ms

For UTF8, Windows APIs are not so bad, after all... What was slow is Unicode/UTF8 encoding in this case - and this part is sharing the same encoding routine.
For Unicode, we are about 2 times faster than Windows API.

In all cases, UTF-8 is acceptable for French or such (with an accent every now and then, but mostly latin chars).
Of course, UTF-8 is more verbose with Code Page 1251, when most chars are not ASCII 7.
It still makes sense if your UTF-8 has some field names or spaces, like when an object is serialized.
In all cases, the speed bottleneck of our framework is clearly not in the UTF-8 encoding any more.

Sha · 2012-02-10 18:26:00

ab wrote:

I noticed that your routines are a bit faster than mine.
But also that it works only with (CP=1250) or (CP=1251) or (CP=1254) or (CP=1256).
I've got access violation otherwise. I suspect there are some issues in your UTF8 conversion code.

I have validated all my functions for CP 1251 using code

// Builds a random byte string for the validation tests.
// The length is 1+Random(MaxCharCount); every byte is drawn from #32..#255
// (printable ASCII plus the whole upper half of the code page).
// The buffer is filled from its last byte down to its first one.
function RandomString(MaxCharCount: Integer): RawByteString;
const
  FirstChar = 32;        //lowest generated char code
  RangeSize = 96 + 128;  //ASCII + #128..255 (use 96 + 32 for ASCII + #128..159)
var
  Len, Idx: integer;
  Buf: PAnsiChar;
begin
  Len := 1 + Random(MaxCharCount);
  SetString(Result, nil, Len);
  Buf := pointer(Result);
  for Idx := Len - 1 downto 0 do
    Buf[Idx] := AnsiChar(Random(RangeSize) + FirstChar);
end;

// Cross-checks the Sha* conversion routines against the Windows API for one
// ANSI string.
//   s      - ANSI input, interpreted in the system code page (GetACP).
//   Status - bit mask of tests to skip: test number k (0-based) is executed
//            only when bit k is clear. Pass the result of a previous call to
//            re-run just the failed tests (handy under a debugger).
// Returns the updated bit mask: bit k is set when test k passed or was
// skipped; all bits above the last test are always set, so the result is -1
// exactly when nothing failed.
function Validate(const s: RawByteString; Status: integer=0): integer;
var
  s2: RawByteString;
  u, u2: RawUnicode;
  t, t2: RawUTF8;
  Len, Len2, Len8, TestNo: integer;
begin;
  u:='';
  t:='';
  Len8:=0;

  // Reference results produced by the Windows API:
  // u = UTF-16 form of s, t = UTF-8 form of s, Len8 = byte length of t.
  Len:=Length(s);
  if Len>0 then begin;
    // NOTE(review): index Len+2 lies inside the buffer for Len>2 and that byte
    // is overwritten by MultiByteToWideChar below; the intent was possibly a
    // wide terminator - left unchanged, confirm.
    SetLength(u,Len*2); u[Len+2]:=#0;
    MultiByteToWideChar(GetACP, 0, pointer(s), Len, pointer(u), Len);
    Len8:=WideCharToMultiByte(CP_UTF8, 0, pointer(u), Len, nil, 0, nil, nil);
    SetLength(t, Len8);
    WideCharToMultiByte(CP_UTF8, 0, pointer(u), Len, pointer(t), Len8, nil, nil);
    end;

  // Test 0 (bit 1): ANSI -> UTF-16
  TestNo:=1;
  if TestNo and Status=0 then begin;
    u2:=ShaAnsiToUnicode(s, pConvDefault);
    if u2=u then Status:=Status or TestNo;
    end;

  // Test 1 (bit 2): UTF-16 -> UTF-8
  TestNo:=TestNo*2;
  if TestNo and Status=0 then begin;
    t2:=ShaUnicodeToUTF8(u);
    if t2=t then Status:=Status or TestNo;
    end;

  // Test 2 (bit 4): UTF-16 -> ANSI
  TestNo:=TestNo*2;
  if TestNo and Status=0 then begin;
    s2:=ShaUnicodeToAnsi(u, pConvDefault);
    if s2=s then Status:=Status or TestNo;
    end;

  // Test 3 (bit 8): UTF-8 -> UTF-16
  TestNo:=TestNo*2;
  if TestNo and Status=0 then begin;
    u2:=ShaUTF8ToUnicode(t);
    if u2=u then Status:=Status or TestNo;
    end;

  // Test 4 (bit 16): ANSI -> UTF-8
  TestNo:=TestNo*2;
  if TestNo and Status=0 then begin;
    t2:=ShaAnsiToUTF8(s, pConvDefault);
    if t2=t then Status:=Status or TestNo;
    end;

  // Test 5 (bit 32): UTF-8 -> ANSI
  TestNo:=TestNo*2;
  if TestNo and Status=0 then begin;
    s2:=ShaUTF8ToAnsi(t, pConvDefault);
    if s2=s then Status:=Status or TestNo;
    end;

  // Tests 6..8 use the pointer/length overloads, whose integer result is
  // compared with the expected length of the converted text.
  TestNo:=TestNo*2;
  if TestNo and Status=0 then begin;
    Len2:=ShaUnicodeToUTF8(pointer(u), Len);
    if Len2=Len8 then Status:=Status or TestNo;
    end;

  TestNo:=TestNo*2;
  if TestNo and Status=0 then begin;
    Len2:=ShaAnsiToUTF8(pointer(s), Len, pConvDefault);
    if Len2=Len8 then Status:=Status or TestNo;
    end;

  TestNo:=TestNo*2;
  if TestNo and Status=0 then begin;
    Len2:=ShaUTF8ToAnsi(pointer(t), Len8);
    if Len2=Len then Status:=Status or TestNo;
    end;

  // Set every bit above the last test so that "all passed" reads as -1.
  // Done with a mask instead of doubling TestNo until it wraps to zero, which
  // would raise EIntOverflow when overflow checking ({$Q+}) is enabled.
  Status:=Status or not (TestNo*2-1);
  Result:=Status;
  end;

// Button handler: runs Validate on 150001 random strings and logs the number
// of strings for which at least one check failed.
// Round 0 asks RandomString for length limit 0, rounds 1..100001 for 32 and
// the remaining rounds for 8*1024.
procedure TForm1.bValidateClick(Sender: TObject);
const
  LastRound      = 150000;
  LastShortRound = 100001;
var
  Sample: RawByteString;
  Round, MaxLen, Failures, Mask: integer;
begin
  Failures := 0;
  for Round := 0 to LastRound do begin
    if Round = 0 then
      MaxLen := 0
    else if Round <= LastShortRound then
      MaxLen := 32
    else
      MaxLen := 8*1024;

    Sample := RandomString(MaxLen);
    Mask := Validate(Sample);
    if Mask <> -1 then begin
      inc(Failures);
      // Second pass executes only the checks that failed (result ignored).
      Validate(Sample, Mask);
    end;
  end;
  Memo1.Lines.Add(Format('Validation done, %d errors',[Failures]));
end;

Also I have tested them for CP 1251 and 1252 as you do

// Returns a random string of exactly CharCount bytes.
// While more than 5 bytes remain, one Random(maxInt) value feeds a group of
// five: four bytes in #32..#159 (7 bits each) and a fifth byte equal to
// 65 plus the bits that are left. The last (at most 5) bytes are drawn one by
// one from #32..#255.
function TSynTestCaseRandomString(CharCount: Integer): RawByteString;
var
  Bits: cardinal;
  Dest: PAnsiChar;
  k: integer;
begin
  SetString(result,nil,CharCount);
  Dest := pointer(result);
  // groups of five chars: one random number per group
  while CharCount>5 do begin
    Bits := Random(maxInt);
    for k := 0 to 3 do begin
      Dest[k] := AnsiChar(32+(Bits and 127));
      Bits := Bits shr 7;
    end;
    Dest[4] := AnsiChar(65+Bits);
    inc(Dest,5);
    dec(CharCount,5);
  end;
  // tail: one random number per char
  while CharCount>0 do begin
    Dest^ := AnsiChar(32+Random(224));
    inc(Dest);
    dec(CharCount);
  end;
end;

// Round-trip check of the Sha* routines for code pages 1251 and 1252.
// 1251 is tested with pConvDefault, 1252 with pConvLatin. For each code page
// the first string that does not survive ANSI -> UTF-8 -> ANSI, and the first
// one that does not survive ANSI -> UTF-16 -> ANSI, is written to Form1.Memo1.
procedure TestCP;
var
  Samples: array[0..10000] of RawByteString;
  CodePage, Idx: Integer;
  Table: PConvTable;
begin
  for Idx := low(Samples) to high(Samples) do
    Samples[Idx] := TSynTestCaseRandomString(Idx shr 3);

  for CodePage := 1251 to 1252 do begin
    case CodePage of
      1251: Table := pConvDefault;
      else  Table := pConvLatin;
    end;

    // UTF-8 round trip: stop at the first mismatch
    Idx := 0;
    while Idx <= high(Samples) do begin
      if ShaUTF8ToAnsi(ShaAnsiToUTF8(Samples[Idx],Table),Table) <> Samples[Idx] then begin
        Form1.Memo1.Lines.Add('ShaAnsiToUTF8'+IntToStr(CodePage));
        Form1.Memo1.Lines.Add(Samples[Idx]);
        break;
      end;
      inc(Idx);
    end;

    // UTF-16 round trip: stop at the first mismatch
    Idx := 0;
    while Idx <= high(Samples) do begin
      if ShaUnicodeToAnsi(ShaAnsiToUnicode(Samples[Idx],Table),Table) <> Samples[Idx] then begin
        Form1.Memo1.Lines.Add('ShaAnsiToUnicode'+IntToStr(CodePage));
        Form1.Memo1.Lines.Add(Samples[Idx]);
        break;
      end;
      inc(Idx);
    end;
  end;
end;

// Button handler: runs TestCP and then logs a completion line to the memo.
procedure TForm1.Button1Click(Sender: TObject);
const
  DoneMessage = 'TestCP done';
begin
  TestCP;
  Form1.Memo1.Lines.Add(DoneMessage);
end;

No errors found in all tests.
Could you please give more details about the AV for CP 1252?

ab · 2012-02-11 08:29:56

Just use my above benchmark code, and you'll find out that it works only with (CP=1250) or (CP=1251) or (CP=1254) or (CP=1256).

Sha · 2012-02-11 09:05:11

ab wrote:

Just use my above benchmark code, and you'll find out that it works only with (CP=1250) or (CP=1251) or (CP=1254) or (CP=1256).

So maybe the problem is in your benchmark code above? ;-)
Or did you test my old functions?

I haven't such function

Sha := GetShaConvTable(CP);

as well as conversion tables for all these CPs.

Could you provide minimal code that I could reproduce AV?

ab · 2012-02-11 09:49:44

I do not have the GetShaConvTable() code here.

It is just something which will create a PConvTable instance corresponding to the CP.

The error is raised for some random text lengths.

I tested ShaUnicode_0.pas as far as I remember.

Sha · 2012-02-11 11:56:40

Ok. Then let's try my code.

I have tested the functions from the current version of ShaUnicode
using 1000 random CPs and 10000 random strings.

No errors. You can repeat this. All the code you need is below. You also need a Button and a Memo on the Form.

// Fills pTable^ with a random single-byte "code page" for testing.
//   - Bytes 0..127 map to the same UTF-16 value in both directions.
//   - Every UTF-16 value from 128 up has no ANSI equivalent at first and
//     converts to DefaultConvChar (space).
//   - Bytes 128..191 get distinct random values in $0100..$07FF (two-byte
//     UTF-8), bytes 192..255 distinct random values in $0800..$FFFF
//     (three-byte UTF-8).
//   - AnsiToUTF8Len[b] is 0, 1 or 2 for values <=$7F, <=$7FF and above,
//     i.e. the number of UTF-8 bytes beyond the first one.
// The surrogate range $D800..$DFFF is never chosen: those values are not
// characters and have no valid UTF-8 encoding, so a table containing them
// could not round-trip through a strict UTF-8 encoder/decoder.
procedure FillRandomConvTable(pTable: PConvTable);
const
  DefaultConvChar = 32;
var
  i, n: NInt;
begin;
  pTable.CodePage:=0;

  for i:=0 to 127 do begin;
    pTable.WideToAnsi[i]:=i;
    pTable.AnsiToWide[i]:=i;
    end;

  for i:=128 to $FFFF do pTable.WideToAnsi[i]:=DefaultConvChar;

  for i:=128 to 255 do begin;
    // draw again while the value is already taken or is a surrogate
    repeat;
      if i<128+64 then n:=Random($07FF -   255) +   256
                  else n:=Random($FFFF - $07FF) + $0800;
      until (pTable.WideToAnsi[n]=DefaultConvChar)
        and ((n<$D800) or (n>$DFFF));
    pTable.WideToAnsi[n]:=i;
    pTable.AnsiToWide[i]:=n;
    end;

  for i:=0 to 255 do begin;
    n:=pTable.AnsiToWide[i];
    if n<=127 then n:=0
    else if n<=$7FF then n:=1
    else n:=2;
    pTable.AnsiToUTF8Len[i]:=n;
    end;
  end;

// Returns a random string of exactly CharCount bytes.
// While more than 5 bytes remain, one Random(maxInt) value feeds a group of
// five: four bytes in #32..#159 (7 bits each) and a fifth byte equal to
// 65 plus the bits that are left. The last (at most 5) bytes are drawn one by
// one from #32..#255.
function TSynTestCaseRandomString(CharCount: Integer): RawByteString;
var
  Bits: cardinal;
  Dest: PAnsiChar;
  k: integer;
begin
  SetString(result,nil,CharCount);
  Dest := pointer(result);
  // groups of five chars: one random number per group
  while CharCount>5 do begin
    Bits := Random(maxInt);
    for k := 0 to 3 do begin
      Dest[k] := AnsiChar(32+(Bits and 127));
      Bits := Bits shr 7;
    end;
    Dest[4] := AnsiChar(65+Bits);
    inc(Dest,5);
    dec(CharCount,5);
  end;
  // tail: one random number per char
  while CharCount>0 do begin
    Dest^ := AnsiChar(32+Random(224));
    inc(Dest);
    dec(CharCount);
  end;
end;

// Round-trip check of the Sha* routines against 1000 randomly generated
// conversion tables (see FillRandomConvTable).
// For each table, the first string that does not survive
// ANSI -> UTF-8 -> ANSI, and the first one that does not survive
// ANSI -> UTF-16 -> ANSI, is written to Form1.Memo1 with the table number.
procedure TestCP;
var
  Samples: array[0..10000] of RawByteString;
  Round, Idx: integer;
  RandomTable: TConvTable;
begin
  for Idx := low(Samples) to high(Samples) do
    Samples[Idx] := TSynTestCaseRandomString(Idx shr 3);

  for Round := 0 to 999 do begin
    FillRandomConvTable(@RandomTable);

    // UTF-8 round trip: stop at the first mismatch
    Idx := 0;
    while Idx <= high(Samples) do begin
      if ShaUTF8ToAnsi(ShaAnsiToUTF8(Samples[Idx],@RandomTable),@RandomTable) <> Samples[Idx] then begin
        Form1.Memo1.Lines.Add('ShaAnsiToUTF8 '+IntToStr(Round));
        Form1.Memo1.Lines.Add(Samples[Idx]);
        break;
      end;
      inc(Idx);
    end;

    // UTF-16 round trip: stop at the first mismatch
    Idx := 0;
    while Idx <= high(Samples) do begin
      if ShaUnicodeToAnsi(ShaAnsiToUnicode(Samples[Idx],@RandomTable),@RandomTable) <> Samples[Idx] then begin
        Form1.Memo1.Lines.Add('ShaAnsiToUnicode '+IntToStr(Round));
        Form1.Memo1.Lines.Add(Samples[Idx]);
        break;
      end;
      inc(Idx);
    end;
  end;
end;

// Button handler: runs TestCP and then logs a completion line to the memo.
procedure TForm1.Button1Click(Sender: TObject);
const
  DoneMessage = 'TestCP done';
begin
  TestCP;
  Form1.Memo1.Lines.Add(DoneMessage);
end;

mORMot Open Source

#1 2012-01-10 21:27:52

Fast Ansi/Unicode conversion

#2 2012-01-23 07:26:09

Re: Fast Ansi/Unicode conversion

#3 2012-01-23 17:56:46

Re: Fast Ansi/Unicode conversion

#4 2012-02-04 18:45:44

Re: Fast Ansi/Unicode conversion

#5 2012-02-06 09:16:14

Re: Fast Ansi/Unicode conversion

#6 2012-02-06 10:32:39

Re: Fast Ansi/Unicode conversion

#7 2012-02-06 12:13:46

Re: Fast Ansi/Unicode conversion

#8 2012-02-08 16:27:46

Re: Fast Ansi/Unicode conversion

#9 2012-02-08 19:57:32

Re: Fast Ansi/Unicode conversion

#10 2012-02-09 08:19:17

Re: Fast Ansi/Unicode conversion

#11 2012-02-09 08:46:12

Re: Fast Ansi/Unicode conversion

#12 2012-02-09 09:45:09

Re: Fast Ansi/Unicode conversion

#13 2012-02-09 11:14:35

Re: Fast Ansi/Unicode conversion

#14 2012-02-09 12:47:30

Re: Fast Ansi/Unicode conversion

#15 2012-02-09 18:41:51

Re: Fast Ansi/Unicode conversion

#16 2012-02-09 19:03:04

Re: Fast Ansi/Unicode conversion

#17 2012-02-09 20:20:19

Re: Fast Ansi/Unicode conversion

#18 2012-02-09 20:40:29

Re: Fast Ansi/Unicode conversion

#19 2012-02-09 20:46:34

Re: Fast Ansi/Unicode conversion

#20 2012-02-10 06:11:05

Re: Fast Ansi/Unicode conversion

#21 2012-02-10 07:19:47

Re: Fast Ansi/Unicode conversion

#22 2012-02-10 16:16:32

Re: Fast Ansi/Unicode conversion

#23 2012-02-10 18:26:00

Re: Fast Ansi/Unicode conversion

#24 2012-02-11 08:29:56

Re: Fast Ansi/Unicode conversion

#25 2012-02-11 09:05:11

Re: Fast Ansi/Unicode conversion

#26 2012-02-11 09:49:44

Re: Fast Ansi/Unicode conversion

#27 2012-02-11 11:56:40

Re: Fast Ansi/Unicode conversion

Board footer