mORMot and Open Source friends
Check-in [ab50456505]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:{6308} Haswell CPUs will use simdjson AVX2 asm for IsValidUtf8() - backport from mORMot 2 - as proposed by https://github.com/synopse/mORMot/pull/400
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ab50456505559dd8a9bcdcf6e9c45b4d7c2992b4
User & Date: ab 2021-07-31 14:41:28
Context
2021-08-08
06:39
{6309} fixed Android compilation check-in: ae98eeeb70 user: ab tags: trunk
2021-07-31
14:41
{6308} Haswell CPUs will use simdjson AVX2 asm for IsValidUtf8() - backport from mORMot 2 - as proposed by https://github.com/synopse/mORMot/pull/400 check-in: ab50456505 user: ab tags: trunk
2021-07-24
12:06
{6307} introducing TTestLowLevelTypes.JSONBenchmark check-in: 7a58b38729 user: ab tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to SynCommons.pas.

518
519
520
521
522
523
524
























525
526
527
528
529
530
531
....
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
.....
16929
16930
16931
16932
16933
16934
16935
16936
16937
16938
16939
16940
16941
16942
16943
16944
16945
16946
16947
16948
16949
16950
16951
16952
16953
16954
16955
16956
16957
16958
16959
16960
16961
16962
16963
16964
16965
16966
.....
18425
18426
18427
18428
18429
18430
18431
18432
18433
18434
18435
18436
18437
18438
18439
18440
18441
18442
18443
18444
18445
18446
18447
18448
18449
18450
18451
18452
18453
18454
18455
18456
18457
18458
18459
18460
18461
18462
18463
18464
18465
18466
18467
18468
18469
18470
18471
18472
18473
18474
18475
18476
18477
18478
18479
18480
18481
18482
18483
18484
18485
18486
18487
18488
  /// class-reference type (metaclass) of a TInterfacedObject
  TInterfacedObjectClass = class of TInterfacedObject;


{ ************ fast UTF-8 / Unicode / Ansi types and conversion routines **** }

























type
  /// kind of adding in a TTextWriter
  TTextWriterKind = (twNone, twJSONEscape, twOnSameLine);

  /// an abstract class to handle Ansi to/from Unicode translation
  // - implementations of this class will handle efficiently all Code Pages
  // - this default implementation will use the Operating System APIs
................................................................................
  MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean=false): PtrInt; overload;

/// calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^
// - count may not match the UCS4 glyphs number, in case of UTF-16 surrogates
// - faster than System.UTF8ToUnicode with dest=nil
function Utf8ToUnicodeLength(source: PUTF8Char): PtrUInt;

/// returns TRUE if the supplied buffer has valid UTF-8 encoding
// - will stop when the buffer contains #0
function IsValidUTF8(source: PUTF8Char): Boolean; overload;

/// returns TRUE if the supplied buffer has valid UTF-8 encoding
// - will also refuse #0 characters within the buffer
function IsValidUTF8(source: PUTF8Char; sourcelen: PtrInt): Boolean; overload;

/// returns TRUE if the supplied buffer has valid UTF-8 encoding
// - will also refuse #0 characters within the buffer
function IsValidUTF8(const source: RawUTF8): Boolean; overload;

/// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #1..#31
// control characters
// - supplied input is a pointer to a #0 ended text buffer
function IsValidUTF8WithoutControlChars(source: PUTF8Char): Boolean; overload;

/// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #0..#31
// control characters
................................................................................

{ ************ some fast UTF-8 / Unicode / Ansi conversion routines }

var
  // internal list of TSynAnsiConvert instances
  SynAnsiConvertList: TSynObjectList = nil;

// some constants used for UTF-8 conversion, including surrogates
const
  // bounds of the UTF-16 high (leading) surrogate code units
  UTF16_HISURROGATE_MIN = $d800;
  UTF16_HISURROGATE_MAX = $dbff;
  // bounds of the UTF-16 low (trailing) surrogate code units
  UTF16_LOSURROGATE_MIN = $dc00;
  UTF16_LOSURROGATE_MAX = $dfff;
  // number of bytes expected after a given $80..$ff UTF-8 byte
  // - 0 for $80..$bf and $fe..$ff, which IsValidUTF8() rejects as an
  // invalid leading byte
  // - 1 for $c0..$df, 2 for $e0..$ef, 3 for $f0..$f7, 4 for $f8..$fb and
  // 5 for $fc..$fd
  UTF8_EXTRABYTES: array[$80..$ff] of byte = (
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0);
  // decoding values, presumably indexed by the number of extra bytes
  // - offset matches the leading/continuation marker bits accumulated while
  // shifting the raw bytes together (e.g. $c0 shl 6 + $80 = $3080)
  // - minimum looks like the smallest code point allowed for this length
  // - NOTE(review): meaning deduced from the values and the linked article,
  // to be confirmed against the decoding routines using this table
  UTF8_EXTRA: array[0..6] of record
    offset, minimum: cardinal;
  end = ( // http://floodyberry.wordpress.com/2007/04/14/utf-8-conversion-tricks
    (offset: $00000000;  minimum: $00010000),
    (offset: $00003080;  minimum: $00000080),
    (offset: $000e2080;  minimum: $00000800),
    (offset: $03c82080;  minimum: $00010000),
    (offset: $fa082080;  minimum: $00200000),
    (offset: $82082080;  minimum: $04000000),
    (offset: $00000000;  minimum: $04000000));
  // presumably the extra bytes count of the sequences which need a UTF-16
  // surrogate pair - TODO confirm at the call sites
  UTF8_EXTRA_SURROGATE = 3;
  // leading byte marker bits of a UTF-8 sequence of 2..6 bytes
  UTF8_FIRSTBYTE: array[2..6] of byte = ($c0,$e0,$f0,$f8,$fc);

{$ifdef HASINLINE}
{$ifdef USE_VTYPE_STATIC} // circumvent weird bug on BSD + ARM (Alfred)
procedure VarClear(var v: variant); // defined here for proper inlining
const VTYPE_STATIC = $BFE8; // bitmask to avoid remote VarClearProc call
var p: PInteger; // more efficient generated asm with an explicit temp variable
begin
  p := @v;
................................................................................
Quit:
  result := PtrUInt(dest)-PtrUInt(begd); // dest-begd returns bytes length
NoSource:
  if not NoTrailingZero then
    dest^ := #0; // always append a WideChar(0) to the end of the buffer
end;

// check that a #0 terminated buffer only contains well-formed UTF-8 sequences
// - a nil pointer is reported as valid
// - only verifies the leading byte and the count of $80..$bf continuation
// bytes, as stored in UTF8_EXTRABYTES[]
function IsValidUTF8(source: PUTF8Char): Boolean;
var lead: cardinal;
    pending: integer;
begin
  result := true;
  if source=nil then
    exit; // nothing to validate
  result := false; // any exit below reports some invalid content
  while true do begin
    lead := byte(source^);
    inc(source);
    if lead=0 then
      break; // reached the end of the text
    if lead<$80 then
      continue; // plain 7-bit character
    pending := UTF8_EXTRABYTES[lead];
    if pending=0 then
      exit; // this byte can not start a sequence
    while pending>0 do begin
      if (byte(source^) and $c0)<>$80 then
        exit; // not a continuation byte (this also catches a premature #0)
      inc(source);
      dec(pending);
    end;
  end;
  result := true;
end;

// validate a whole RawUTF8 string, by forwarding its buffer pointer and its
// length in bytes to the (pointer,length) overload
function IsValidUTF8(const source: RawUTF8): Boolean;
begin
  result := IsValidUTF8(pointer(Source),length(Source));
end;

// check that sourcelen bytes at source only contain well-formed UTF-8
// - a nil pointer is reported as valid
// - any #0 byte within the buffer is reported as invalid
// - a sequence truncated by the end of the buffer is reported as invalid
function IsValidUTF8(source: PUTF8Char; sourcelen: PtrInt): Boolean;
var lead: cardinal;
    pending: integer;
begin
  inc(sourcelen,PtrInt(source)); // sourcelen now holds the end address
  result := true;
  if source=nil then
    exit; // nothing to validate
  result := false; // any exit below reports some invalid content
  while PtrInt(PtrUInt(source))<sourcelen do begin
    lead := byte(source^);
    inc(source);
    if lead=0 then
      exit; // #0 is refused within the buffer
    if lead<$80 then
      continue; // plain 7-bit character
    pending := UTF8_EXTRABYTES[lead];
    if pending=0 then
      exit; // this byte can not start a sequence
    repeat
      if PtrInt(PtrUInt(source))>=sourcelen then
        exit; // sequence is cut by the end of the buffer
      if (byte(source^) and $c0)<>$80 then
        exit; // not a continuation byte
      inc(source);
      dec(pending);
    until pending=0;
  end;
  result := true;
end;

function IsValidUTF8WithoutControlChars(source: PUTF8Char): Boolean;
var extra, i: integer;
    c: cardinal;
begin
  result := false;
  if source<>nil then
  repeat






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







<
<
<
<
<
<
<
<
<
<
<
<







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
....
1160
1161
1162
1163
1164
1165
1166












1167
1168
1169
1170
1171
1172
1173
.....
16941
16942
16943
16944
16945
16946
16947
























16948
16949
16950
16951
16952
16953
16954
.....
18413
18414
18415
18416
18417
18418
18419


















































18420
18421
18422
18423
18424
18425
18426
  /// class-reference type (metaclass) of a TInterfacedObject
  TInterfacedObjectClass = class of TInterfacedObject;


{ ************ fast UTF-8 / Unicode / Ansi types and conversion routines **** }

// some constants used for UTF-8 conversion, including surrogates
const
  // bounds of the UTF-16 high (leading) surrogate code units
  UTF16_HISURROGATE_MIN = $d800;
  UTF16_HISURROGATE_MAX = $dbff;
  // bounds of the UTF-16 low (trailing) surrogate code units
  UTF16_LOSURROGATE_MIN = $dc00;
  UTF16_LOSURROGATE_MAX = $dfff;
  // number of bytes expected after a given $80..$ff UTF-8 byte
  // - 0 for $80..$bf and $fe..$ff, which are not usable as leading byte
  // - 1 for $c0..$df, 2 for $e0..$ef, 3 for $f0..$f7, 4 for $f8..$fb and
  // 5 for $fc..$fd
  UTF8_EXTRABYTES: array[$80..$ff] of byte = (
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0);
  // decoding values, presumably indexed by the number of extra bytes
  // - offset matches the leading/continuation marker bits accumulated while
  // shifting the raw bytes together (e.g. $c0 shl 6 + $80 = $3080)
  // - minimum looks like the smallest code point allowed for this length
  // - NOTE(review): meaning deduced from the values and the linked article,
  // to be confirmed against the decoding routines using this table
  UTF8_EXTRA: array[0..6] of record
    offset, minimum: cardinal;
  end = ( // http://floodyberry.wordpress.com/2007/04/14/utf-8-conversion-tricks
    (offset: $00000000;  minimum: $00010000),
    (offset: $00003080;  minimum: $00000080),
    (offset: $000e2080;  minimum: $00000800),
    (offset: $03c82080;  minimum: $00010000),
    (offset: $fa082080;  minimum: $00200000),
    (offset: $82082080;  minimum: $04000000),
    (offset: $00000000;  minimum: $04000000));
  // presumably the extra bytes count of the sequences which need a UTF-16
  // surrogate pair - TODO confirm at the call sites
  UTF8_EXTRA_SURROGATE = 3;
  // leading byte marker bits of a UTF-8 sequence of 2..6 bytes
  UTF8_FIRSTBYTE: array[2..6] of byte = ($c0,$e0,$f0,$f8,$fc);

type
  /// kind of adding in a TTextWriter
  TTextWriterKind = (twNone, twJSONEscape, twOnSameLine);

  /// an abstract class to handle Ansi to/from Unicode translation
  // - implementations of this class will handle efficiently all Code Pages
  // - this default implementation will use the Operating System APIs
................................................................................
  MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean=false): PtrInt; overload;

/// calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^
// - count may not match the UCS4 glyphs number, in case of UTF-16 surrogates
// - faster than System.UTF8ToUnicode with dest=nil
function Utf8ToUnicodeLength(source: PUTF8Char): PtrUInt;













/// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #1..#31
// control characters
// - supplied input is a pointer to a #0 ended text buffer
function IsValidUTF8WithoutControlChars(source: PUTF8Char): Boolean; overload;

/// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #0..#31
// control characters
................................................................................

{ ************ some fast UTF-8 / Unicode / Ansi conversion routines }

var
  // internal list of TSynAnsiConvert instances
  SynAnsiConvertList: TSynObjectList = nil;

























{$ifdef HASINLINE}
{$ifdef USE_VTYPE_STATIC} // circumvent weird bug on BSD + ARM (Alfred)
procedure VarClear(var v: variant); // defined here for proper inlining
const VTYPE_STATIC = $BFE8; // bitmask to avoid remote VarClearProc call
var p: PInteger; // more efficient generated asm with an explicit temp variable
begin
  p := @v;
................................................................................
Quit:
  result := PtrUInt(dest)-PtrUInt(begd); // dest-begd returns bytes length
NoSource:
  if not NoTrailingZero then
    dest^ := #0; // always append a WideChar(0) to the end of the buffer
end;



















































function IsValidUTF8WithoutControlChars(source: PUTF8Char): Boolean;
var extra, i: integer;
    c: cardinal;
begin
  result := false;
  if source<>nil then
  repeat

Changes to SynTable.pas.

304
305
306
307
308
309
310




















311
312
313
314
315
316
317
....
9110
9111
9112
9113
9114
9115
9116








































































































































































































































































































9117
9118
9119
9120
9121
9122
9123
.....
18164
18165
18166
18167
18168
18169
18170










18171
18172
18173
18174
18175
18176
18177
  /// number of bits to use for each interesting soundex char
  // - default is to use 8 bits, i.e. 4 soundex chars, which is the
  // standard approach
  // - for a more detailed soundex, use 4 bits resolution, which will
  // compute up to 7 soundex chars in a cardinal (that's our choice)
  SOUNDEX_BITS = 4;






















{ ************ filtering and validation classes and functions ************** }

/// convert an IPv4 'x.x.x.x' text into its 32-bit value
// - returns TRUE if the text was a valid IPv4 text, unserialized as 32-bit aValue
// - returns FALSE on parsing error, also setting aValue=0
// - '' or '127.0.0.1' will also return false
................................................................................
  if result<>0 then begin
    dec(result,ord('A')-1);   // first Soundex char is first char
    SoundExComputeUTF8(U,result,SOUNDEXVALUES[Lang]);
  end;
  if next<>nil then
    next^ := FindNextUTF8WordBegin(U);
end;










































































































































































































































































































{ ************ filtering and validation classes and functions *************** }

function IPToCardinal(P: PUTF8Char; out aValue: cardinal): boolean;
var i,c: cardinal;
    b: array[0..3] of byte;
................................................................................
  EMOJI_AFTERDOTS['D'] := eLaughing;
  EMOJI_AFTERDOTS['o'] := eOpen_mouth;
  EMOJI_AFTERDOTS['O'] := eOpen_mouth;
  EMOJI_AFTERDOTS['p'] := eYum;
  EMOJI_AFTERDOTS['P'] := eYum;
  EMOJI_AFTERDOTS['s'] := eScream;
  EMOJI_AFTERDOTS['S'] := eScream;










end;


initialization
  Assert(SizeOf(TSynTableFieldType)=1); // as expected by TSynTableFieldProperties
  Assert(SizeOf(TSynTableFieldOptions)=1);
  {$ifndef NOVARIANTS}






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>







304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
....
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
.....
18480
18481
18482
18483
18484
18485
18486
18487
18488
18489
18490
18491
18492
18493
18494
18495
18496
18497
18498
18499
18500
18501
18502
18503
  /// number of bits to use for each interesting soundex char
  // - default is to use 8 bits, i.e. 4 soundex chars, which is the
  // standard approach
  // - for a more detailed soundex, use 4 bits resolution, which will
  // compute up to 7 soundex chars in a cardinal (that's our choice)
  SOUNDEX_BITS = 4;

var
  /// the implementation called by IsValidUTF8(source)
  // - set to IsValidUTF8Pas by this unit's setup code, or to IsValidUTF8Avx2
  // when ASMX64AVX is defined and the CPU reports the expected features
  DoIsValidUTF8: function(source: PUTF8Char): Boolean;
  /// the implementation called by IsValidUTF8(source,sourcelen) and
  // IsValidUTF8(RawUTF8)
  // - set to IsValidUTF8LenPas by this unit's setup code, or to
  // IsValidUTF8LenAvx2 when ASMX64AVX is defined and the CPU reports the
  // expected features
  DoIsValidUTF8Len: function(source: PUTF8Char; sourcelen: PtrInt): Boolean;

/// returns TRUE if the supplied buffer has valid UTF-8 encoding
// - will stop when the buffer contains #0
// - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM
function IsValidUTF8(source: PUTF8Char): Boolean; overload; {$ifdef HASINLINE}inline;{$endif}

/// returns TRUE if the supplied buffer has valid UTF-8 encoding
// - will also refuse #0 characters within the buffer
// - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM
// - NOTE(review): only the pascal version exits on #0 - the AVX2 asm skips
// all checks for blocks with no byte >= $80, so it seems to accept #0 bytes
function IsValidUTF8(source: PUTF8Char; sourcelen: PtrInt): Boolean; overload; {$ifdef HASINLINE}inline;{$endif}

/// returns TRUE if the supplied buffer has valid UTF-8 encoding
// - will also refuse #0 characters within the buffer
// - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM
// - NOTE(review): only the pascal version exits on #0 - the AVX2 asm skips
// all checks for blocks with no byte >= $80, so it seems to accept #0 bytes
function IsValidUTF8(const source: RawUTF8): Boolean; overload;



{ ************ filtering and validation classes and functions ************** }

/// convert an IPv4 'x.x.x.x' text into its 32-bit value
// - returns TRUE if the text was a valid IPv4 text, unserialized as 32-bit aValue
// - returns FALSE on parsing error, also setting aValue=0
// - '' or '127.0.0.1' will also return false
................................................................................
  if result<>0 then begin
    dec(result,ord('A')-1);   // first Soundex char is first char
    SoundExComputeUTF8(U,result,SOUNDEXVALUES[Lang]);
  end;
  if next<>nil then
    next^ := FindNextUTF8WordBegin(U);
end;

{$ifdef ASMX64AVX} // AVX2 ASM not available on Delphi yet
// adapted from https://github.com/simdjson/simdjson - Apache License 2.0
// validate sourcelen bytes at source as UTF-8, using AVX2 instructions
// - returns true (al=1) if the error accumulator is still zero at the end
// - the input is processed by blocks of 64 bytes: a block with no byte >= $80
// only merges the pending state and skips the multi-byte checks, so #0 bytes
// are not rejected by this code path
// - the remaining bytes (at most 64) are copied into a 64-byte stack buffer
// pre-filled with spaces ($20), then checked by the @last code
// - the stack is re-aligned to 32 bytes, and rbp is restored by "leave"
// - NOTE(review): sourcelen is compared as unsigned (cmovnc), so a negative
// value would be handled as a huge length - confirm callers never pass one
// - NOTE(review): rsi, rdi and xmm6..xmm15 are overwritten without being
// saved, whereas the Win64 calling convention defines them as callee-saved -
// please confirm that ASMX64AVX is never defined when targeting Win64
function IsValidUtf8LenAvx2(source: PUtf8Char; sourcelen: PtrInt): boolean;
  {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        push    rbp
        mov     r8, source
        mov     rdx, sourcelen
        mov     rsi, r8
        mov     ecx, 64
        mov     rax, rsi
        mov     rdi, rdx
        mov     rbp, rsp
        and     rsp, 0FFFFFFFFFFFFFFE0H // align stack at 32 bytes
        sub     rsp, 160
        // rcx = max(sourcelen, 64) - 64 = offset at which the main loop stops
        cmp     rdx, 64
        cmovnc  rcx, rdx
        sub     rcx, 64
        je      @small
        vpxor   xmm3, xmm3, xmm3
        vmovdqa ymm7,  ymmword ptr [rip + @0f]
        vmovdqa ymm15, ymmword ptr [rip + @_6]
        xor     esi, esi
        vmovdqa ymm14, ymmword ptr [rip + @_7]
        vmovdqa ymm13, ymmword ptr [rip + @_8]
        vmovdqa ymm5, ymm3
        vmovdqa ymm2, ymm3
        // main processing loop, 64 bytes per iteration
        // rax = source, rsi = current offset, rdi = sourcelen
        align 16
@loop:  vmovdqu xmm6, xmmword ptr [rax + rsi]
        vinserti128 ymm0, ymm6, xmmword ptr [rax + rsi + 10H], 01H
        vmovdqu xmm6, xmmword ptr [rax + rsi + 20H]
        vinserti128 ymm1, ymm6, xmmword ptr [rax + rsi + 30H], 01H
        add     rsi, 64
        vpor    ymm4, ymm1, ymm0
        vpmovmskb rdx, ymm4 // check set MSB of each 64 bytes
        test    edx, edx
        jne     @check
        vpor    ymm2, ymm5, ymm2
        vmovdqa ymm4, ymm2
        cmp     rcx, rsi
        ja      @loop
        // process the trailing bytes, i.e. at most 64 bytes
        // (nothing to process only when sourcelen = 0)
@trail: sub     rdi, rsi
        jz      @ended
        add     rsi, rax
        vmovdqa xmm0, xmmword ptr [rip + @20]
        lea     rdx, qword ptr [rsp + 60H] // copy on stack with space padding
        sub     rsi, rdx
        vmovdqa xmmword ptr [rdx], xmm0
        vmovdqa xmmword ptr [rdx + 10H], xmm0
        vmovdqa xmmword ptr [rdx + 20H], xmm0
        vmovdqa xmmword ptr [rdx + 30H], xmm0
@by8:   sub     rdi, 8
        jb      @by1
        mov     rax, qword ptr [rsi + rdx]
        mov     qword ptr [rdx], rax
        add     rdx, 8 // in-order copy to preserve UTF-8 encoding
        jmp     @by8
@by1:   add     rdi, 8
        jz      @0
@sml:   mov     al, byte ptr [rsi + rdx]
        mov     byte ptr [rdx], al
        add     rdx, 1
        sub     rdi, 1
        jnz     @sml
@0:     vmovdqa ymm1, ymmword ptr [rsp + 60H]
        vmovdqa ymm2, ymmword ptr [rsp + 80H]
        vpor    ymm0, ymm1, ymm2
        vpmovmskb rax, ymm0 // check any set MSB
        test    eax, eax
        jne     @last
        // result is true only if no error bit was accumulated
@ended: vpor    ymm5, ymm5, ymm4
        vptest  ymm5, ymm5
        sete    al
        vzeroupper
        leave
        ret
        // less than 64 bytes of input (or exactly 64): only run the trailing code
@small: vpxor   xmm4, xmm4, xmm4
        xor     esi, esi
        vmovdqa ymm3, ymm4
        vmovdqa ymm5, ymm4
        jmp     @trail
        // validate UTF-8 extra bytes from main loop
        align 8
@check: vpsrlw  ymm9, ymm0, 4
        vpsrlw  ymm12, ymm1, 4
        vperm2i128 ymm3, ymm3, ymm0, 21H
        vpalignr ymm5, ymm0, ymm3, 0FH
        vpalignr ymm11, ymm0, ymm3, 0EH
        vpsubusb ymm11, ymm11, ymmword ptr [rip + @_9]
        vpalignr ymm3, ymm0, ymm3, 0DH
        vperm2i128 ymm0, ymm0, ymm1, 21H
        vpsubusb ymm3, ymm3, ymmword ptr [rip + @_10]
        vpalignr ymm8, ymm1, ymm0, 0FH
        vpsrlw  ymm10, ymm5, 4
        vpand   ymm5, ymm7, ymm5
        vpsrlw  ymm6, ymm8, 4
        vpalignr ymm4, ymm1, ymm0, 0EH
        vpsubusb ymm4, ymm4, ymmword ptr [rip + @_9]
        vpalignr ymm0, ymm1, ymm0, 0DH
        vpsubusb ymm0, ymm0, ymmword ptr [rip + @_10]
        vpand   ymm10, ymm10, ymm7
        vpand   ymm6, ymm6, ymm7
        vpand   ymm8, ymm7, ymm8
        vpor    ymm3, ymm3, ymm11
        vpor    ymm0, ymm4, ymm0
        vpxor   xmm11, xmm11, xmm11
        vpshufb ymm10, ymm15, ymm10
        vpshufb ymm5, ymm14, ymm5
        vpand   ymm9, ymm9, ymm7
        vpshufb ymm6, ymm15, ymm6
        vpshufb ymm8, ymm14, ymm8
        vpand   ymm12, ymm12, ymm7
        vpand   ymm5, ymm5, ymm10
        vpcmpgtb ymm3, ymm3, ymm11
        vpcmpgtb ymm0, ymm0, ymm11
        vpshufb ymm9, ymm13, ymm9
        vpand   ymm3, ymm3, ymmword ptr [rip + @_11]
        vpand   ymm0, ymm0, ymmword ptr [rip + @_11]
        vpshufb ymm12, ymm13, ymm12
        vpand   ymm6, ymm6, ymm8
        vpand   ymm9, ymm5, ymm9
        vpsubusb ymm5, ymm1, ymmword ptr [rip + @_12]
        vpand   ymm12, ymm6, ymm12
        vpxor   ymm9, ymm3, ymm9
        vmovdqa ymm3, ymm1
        vpxor   ymm12, ymm0, ymm12
        vpor    ymm9, ymm9, ymm12
        vpor    ymm2, ymm9, ymm2
        vmovdqa ymm4, ymm2
        cmp     rcx, rsi
        ja      @loop
        jmp     @trail
        // validate UTF-8 extra bytes from input ending
        align 8
@last:  vmovdqa ymm5, ymmword ptr [rip + @0f]
        vperm2i128 ymm3, ymm3, ymm1, 21H
        vmovdqa ymm9, ymmword ptr [rip + @_7]
        vpsrlw  ymm11, ymm1, 4
        vpalignr ymm0, ymm1, ymm3, 0FH
        vmovdqa ymm13, ymmword ptr [rip + @_10]
        vmovdqa ymm14, ymmword ptr [rip + @_9]
        vpsrlw  ymm6, ymm0, 4
        vpand   ymm0, ymm5, ymm0
        vpand   ymm11, ymm11, ymm5
        vmovdqa ymm7, ymmword ptr [rip + @_6]
        vpshufb ymm10, ymm9, ymm0
        vpalignr ymm0, ymm1, ymm3, 0EH
        vpand   ymm6, ymm6, ymm5
        vmovdqa ymm8, ymmword ptr [rip + @_8]
        vpalignr ymm3, ymm1, ymm3, 0DH
        vperm2i128 ymm1, ymm1, ymm2, 21H
        vpsubusb ymm0, ymm0, ymm14
        vpsubusb ymm12, ymm3, ymm13
        vpalignr ymm3, ymm2, ymm1, 0FH
        vpshufb ymm6, ymm7, ymm6
        vpsrlw  ymm15, ymm3, 4
        vpand   ymm3, ymm5, ymm3
        vpor    ymm0, ymm0, ymm12
        vpshufb ymm9, ymm9, ymm3
        vpsrlw  ymm3, ymm2, 4
        vpand   ymm15, ymm15, ymm5
        vpand   ymm5, ymm3, ymm5
        vpalignr ymm3, ymm2, ymm1, 0EH
        vpxor   xmm12, xmm12, xmm12
        vpalignr ymm1, ymm2, ymm1, 0DH
        vpsubusb ymm3, ymm3, ymm14
        vpshufb ymm11, ymm8, ymm11
        vpsubusb ymm1, ymm1, ymm13
        vpcmpgtb ymm0, ymm0, ymm12
        vpshufb ymm7, ymm7, ymm15
        vpor    ymm1, ymm3, ymm1
        vpshufb ymm8, ymm8, ymm5
        vpsubusb ymm5, ymm2, ymmword ptr [rip + @_12]
        vmovdqa ymm2, ymmword ptr [rip + @_11]
        vpcmpgtb ymm1, ymm1, ymm12
        vpand   ymm6, ymm6, ymm10
        vpand   ymm7, ymm7, ymm9
        vpand   ymm0, ymm0, ymm2
        vpand   ymm11, ymm6, ymm11
        vpand   ymm8, ymm7, ymm8
        vpxor   ymm0, ymm0, ymm11
        vpor    ymm5, ymm4, ymm5
        vpand   ymm1, ymm1, ymm2
        vpxor   ymm1, ymm1, ymm8
        vpor    ymm0, ymm0, ymm1
        vpor    ymm5, ymm0, ymm5
        vptest  ymm5, ymm5
        sete    al
        vzeroupper
        leave
        ret
        // 16 spaces, used to pre-fill the stack copy of the trailing bytes
        align 16
@20:    dq 2020202020202020H
        dq 2020202020202020H
        // 32-byte constants, read with aligned loads (vmovdqa / memory operands)
        // @0f = low nibble mask; @_6, @_7, @_8 = vpshufb lookup tables indexed by
        // a nibble - presumably the simdjson classification tables, to be
        // confirmed against the upstream source
        align 32
@0f:    dq 0F0F0F0F0F0F0F0FH
        dq 0F0F0F0F0F0F0F0FH
        dq 0F0F0F0F0F0F0F0FH
        dq 0F0F0F0F0F0F0F0FH
@_6:    dq 0202020202020202H
        dq 4915012180808080H
        dq 0202020202020202H
        dq 4915012180808080H
@_7:    dq 0CBCBCB8B8383A3E7H
        dq 0CBCBDBCBCBCBCBCBH
        dq 0CBCBCB8B8383A3E7H
        dq 0CBCBDBCBCBCBCBCBH
@_8:    dq 0101010101010101H
        dq 01010101BABAAEE6H
        dq 0101010101010101H
        dq 01010101BABAAEE6H
        // saturated subtraction of $df / $ef leaves a non-zero byte only for
        // values >= $e0 / >= $f0
@_9:    dq 0DFDFDFDFDFDFDFDFH
        dq 0DFDFDFDFDFDFDFDFH
        dq 0DFDFDFDFDFDFDFDFH
        dq 0DFDFDFDFDFDFDFDFH
@_10:   dq 0EFEFEFEFEFEFEFEFH
        dq 0EFEFEFEFEFEFEFEFH
        dq 0EFEFEFEFEFEFEFEFH
        dq 0EFEFEFEFEFEFEFEFH
@_11:   dq 8080808080808080H
        dq 8080808080808080H
        dq 8080808080808080H
        dq 8080808080808080H
        // only the last 3 bytes can survive the saturated subtraction: presumably
        // flags a multi-byte sequence left incomplete at the end of a block
@_12:   db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
        db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
        db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
        db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0EFH, 0DFH, 0BFH
end;

// AVX2 validation of a #0 terminated buffer
// - the length is first computed using StrLen(), then the whole text is
// supplied to IsValidUTF8LenAvx2()
// - NOTE(review): relies on StrLen(nil) being safe - confirm
function IsValidUTF8Avx2(source: PUTF8Char): Boolean;
begin
  result := IsValidUTF8LenAvx2(source,StrLen(source));
end;
{$endif ASMX64AVX}

// pure pascal validation of a #0 terminated UTF-8 buffer
// - a nil pointer is reported as valid
// - only verifies the leading byte and the count of $80..$bf continuation
// bytes, as stored in UTF8_EXTRABYTES[]
function IsValidUTF8Pas(source: PUTF8Char): Boolean;
var lead: cardinal;
    pending: integer;
begin
  result := true;
  if source=nil then
    exit; // nothing to validate
  result := false; // any exit below reports some invalid content
  while true do begin
    lead := byte(source^);
    inc(source);
    if lead=0 then
      break; // reached the end of the text
    if lead<$80 then
      continue; // plain 7-bit character
    pending := UTF8_EXTRABYTES[lead];
    if pending=0 then
      exit; // this byte can not start a sequence
    while pending>0 do begin
      if (byte(source^) and $c0)<>$80 then
        exit; // not a continuation byte (this also catches a premature #0)
      inc(source);
      dec(pending);
    end;
  end;
  result := true;
end;

// pure pascal validation of sourcelen bytes of UTF-8 content
// - a nil pointer is reported as valid
// - any #0 byte within the buffer is reported as invalid
// - a sequence truncated by the end of the buffer is reported as invalid
function IsValidUTF8LenPas(source: PUTF8Char; sourcelen: PtrInt): Boolean;
var lead: cardinal;
    pending: integer;
begin
  inc(sourcelen,PtrInt(source)); // sourcelen now holds the end address
  result := true;
  if source=nil then
    exit; // nothing to validate
  result := false; // any exit below reports some invalid content
  while PtrInt(PtrUInt(source))<sourcelen do begin
    lead := byte(source^);
    inc(source);
    if lead=0 then
      exit; // #0 is refused within the buffer
    if lead<$80 then
      continue; // plain 7-bit character
    pending := UTF8_EXTRABYTES[lead];
    if pending=0 then
      exit; // this byte can not start a sequence
    repeat
      if PtrInt(PtrUInt(source))>=sourcelen then
        exit; // sequence is cut by the end of the buffer
      if (byte(source^) and $c0)<>$80 then
        exit; // not a continuation byte
      inc(source);
      dec(pending);
    until pending=0;
  end;
  result := true;
end;

// validate a #0 terminated buffer via the DoIsValidUTF8 function pointer
// - NOTE(review): DoIsValidUTF8 is only assigned by this unit's setup code,
// so confirm nothing may call this function before it has been executed
function IsValidUTF8(source: PUTF8Char): Boolean;
begin
  result := DoIsValidUTF8(source);
end;

// validate sourcelen bytes via the DoIsValidUTF8Len function pointer
// - NOTE(review): DoIsValidUTF8Len is only assigned by this unit's setup
// code, so confirm nothing may call this function before it has been executed
function IsValidUTF8(source: PUTF8Char; sourcelen: PtrInt): Boolean;
begin
  result := DoIsValidUTF8Len(source,sourcelen);
end;

// validate a whole RawUTF8 string via the DoIsValidUTF8Len function pointer
// - the string buffer pointer and its length in bytes are supplied as such
function IsValidUTF8(const source: RawUTF8): Boolean;
begin
  result := DoIsValidUTF8Len(pointer(Source),length(Source));
end;


{ ************ filtering and validation classes and functions *************** }

function IPToCardinal(P: PUTF8Char; out aValue: cardinal): boolean;
var i,c: cardinal;
    b: array[0..3] of byte;
................................................................................
  EMOJI_AFTERDOTS['D'] := eLaughing;
  EMOJI_AFTERDOTS['o'] := eOpen_mouth;
  EMOJI_AFTERDOTS['O'] := eOpen_mouth;
  EMOJI_AFTERDOTS['p'] := eYum;
  EMOJI_AFTERDOTS['P'] := eYum;
  EMOJI_AFTERDOTS['s'] := eScream;
  EMOJI_AFTERDOTS['S'] := eScream;
  DoIsValidUTF8 := IsValidUTF8Pas;
  DoIsValidUTF8Len := IsValidUTF8LenPas;
  {$ifdef ASMX64AVX}
  if CpuFeatures * [cfAVX2, cfSSE42, cfBMI1, cfBMI2, cfCLMUL] =
                   [cfAVX2, cfSSE42, cfBMI1, cfBMI2, cfCLMUL] then begin
    // Haswell CPUs can use simdjson AVX2 asm for IsValidUtf8()
    DoIsValidUTF8 := IsValidUTF8Avx2;
    DoIsValidUTF8Len := IsValidUTF8LenAvx2;
  end;
  {$endif ASMX64AVX}
end;


initialization
  Assert(SizeOf(TSynTableFieldType)=1); // as expected by TSynTableFieldProperties
  Assert(SizeOf(TSynTableFieldOptions)=1);
  {$ifndef NOVARIANTS}

Changes to Synopse.inc.

325
326
327
328
329
330
331

332
333
334
335
336
337
338
    {$define FPC_64}
    {$define PUREPASCAL} // e.g. x64, AARCH64
    {$ifdef CPUX64}
      {$define CPUINTEL}
      {$define FPC_CPUINTEL}
      {$ifndef BSD}
        {$define CPUX64ASM} // Delphi XE4 or Darwin asm are buggy :(

        {$define HASAESNI}  // SynCrypto rejected by Darwin asm
      {$endif BSD}
      {$define FPC_X64}   // supports AVX/AVX2/AVX512 - which Delphi doesn't
      {$ASMMODE INTEL}    // to share asm code with Delphi
    {$endif CPUX64}
    {$ifdef CPUAARCH64}
      {$define CPUARM3264}






>







325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
    {$define FPC_64}
    {$define PUREPASCAL} // e.g. x64, AARCH64
    {$ifdef CPUX64}
      {$define CPUINTEL}
      {$define FPC_CPUINTEL}
      {$ifndef BSD}
        {$define CPUX64ASM} // Delphi XE4 or Darwin asm are buggy :(
        {$define ASMX64AVX} // only FPC supports AVX/AVX2/AVX512
        {$define HASAESNI}  // SynCrypto rejected by Darwin asm
      {$endif BSD}
      {$define FPC_X64}   // supports AVX/AVX2/AVX512 - which Delphi doesn't
      {$ASMMODE INTEL}    // to share asm code with Delphi
    {$endif CPUX64}
    {$ifdef CPUAARCH64}
      {$define CPUARM3264}

Changes to SynopseCommit.inc.

1
'1.18.6307'
|
1
'1.18.6308'