Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Comment: | {6308} Haswell CPUs will use simdjson AVX2 asm for IsValidUtf8() - backport from mORMot 2 - as proposed by https://github.com/synopse/mORMot/pull/400 |
---|---|
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
ab50456505559dd8a9bcdcf6e9c45b4d |
User & Date: | ab 2021-07-31 14:41:28 |
2021-08-08
| ||
06:39 | {6309} fixed Android compilation check-in: ae98eeeb70 user: ab tags: trunk | |
2021-07-31
| ||
14:41 | {6308} Haswell CPUs will use simdjson AVX2 asm for IsValidUtf8() - backport from mORMot 2 - as proposed by https://github.com/synopse/mORMot/pull/400 check-in: ab50456505 user: ab tags: trunk | |
2021-07-24
| ||
12:06 | {6307} introducing TTestLowLevelTypes.JSONBenchmark check-in: 7a58b38729 user: ab tags: trunk | |
Changes to SynCommons.pas.
518 519 520 521 522 523 524 525 526 527 528 529 530 531 .... 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 ..... 16929 16930 16931 16932 16933 16934 16935 16936 16937 16938 16939 16940 16941 16942 16943 16944 16945 16946 16947 16948 16949 16950 16951 16952 16953 16954 16955 16956 16957 16958 16959 16960 16961 16962 16963 16964 16965 16966 ..... 18425 18426 18427 18428 18429 18430 18431 18432 18433 18434 18435 18436 18437 18438 18439 18440 18441 18442 18443 18444 18445 18446 18447 18448 18449 18450 18451 18452 18453 18454 18455 18456 18457 18458 18459 18460 18461 18462 18463 18464 18465 18466 18467 18468 18469 18470 18471 18472 18473 18474 18475 18476 18477 18478 18479 18480 18481 18482 18483 18484 18485 18486 18487 18488 |
/// class-reference type (metaclass) of a TInterfacedObject TInterfacedObjectClass = class of TInterfacedObject; { ************ fast UTF-8 / Unicode / Ansi types and conversion routines **** } type /// kind of adding in a TTextWriter TTextWriterKind = (twNone, twJSONEscape, twOnSameLine); /// an abstract class to handle Ansi to/from Unicode translation // - implementations of this class will handle efficiently all Code Pages // - this default implementation will use the Operating System APIs ................................................................................ MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean=false): PtrInt; overload; /// calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^ // - count may not match the UCS4 glyphs number, in case of UTF-16 surrogates // - faster than System.UTF8ToUnicode with dest=nil function Utf8ToUnicodeLength(source: PUTF8Char): PtrUInt; /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will stop when the buffer contains #0 function IsValidUTF8(source: PUTF8Char): Boolean; overload; /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will also refuse #0 characters within the buffer function IsValidUTF8(source: PUTF8Char; sourcelen: PtrInt): Boolean; overload; /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will also refuse #0 characters within the buffer function IsValidUTF8(const source: RawUTF8): Boolean; overload; /// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #1..#31 // control characters // - supplied input is a pointer to a #0 ended text buffer function IsValidUTF8WithoutControlChars(source: PUTF8Char): Boolean; overload; /// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #0..#31 // control characters ................................................................................ 
{ ************ some fast UTF-8 / Unicode / Ansi conversion routines } var // internal list of TSynAnsiConvert instances SynAnsiConvertList: TSynObjectList = nil; // some constants used for UTF-8 conversion, including surrogates const UTF16_HISURROGATE_MIN = $d800; UTF16_HISURROGATE_MAX = $dbff; UTF16_LOSURROGATE_MIN = $dc00; UTF16_LOSURROGATE_MAX = $dfff; UTF8_EXTRABYTES: array[$80..$ff] of byte = ( 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0); UTF8_EXTRA: array[0..6] of record offset, minimum: cardinal; end = ( // http://floodyberry.wordpress.com/2007/04/14/utf-8-conversion-tricks (offset: $00000000; minimum: $00010000), (offset: $00003080; minimum: $00000080), (offset: $000e2080; minimum: $00000800), (offset: $03c82080; minimum: $00010000), (offset: $fa082080; minimum: $00200000), (offset: $82082080; minimum: $04000000), (offset: $00000000; minimum: $04000000)); UTF8_EXTRA_SURROGATE = 3; UTF8_FIRSTBYTE: array[2..6] of byte = ($c0,$e0,$f0,$f8,$fc); {$ifdef HASINLINE} {$ifdef USE_VTYPE_STATIC} // circumvent weird bug on BSD + ARM (Alfred) procedure VarClear(var v: variant); // defined here for proper inlining const VTYPE_STATIC = $BFE8; // bitmask to avoid remote VarClearProc call var p: PInteger; // more efficient generated asm with an explicit temp variable begin p := @v; ................................................................................ 
Quit: result := PtrUInt(dest)-PtrUInt(begd); // dest-begd returns bytes length NoSource: if not NoTrailingZero then dest^ := #0; // always append a WideChar(0) to the end of the buffer end; function IsValidUTF8(source: PUTF8Char): Boolean; var extra, i: integer; c: cardinal; begin result := false; if source<>nil then repeat c := byte(source^); inc(source); if c=0 then break else if c and $80<>0 then begin extra := UTF8_EXTRABYTES[c]; if extra=0 then exit else // invalid leading byte for i := 1 to extra do if byte(source^) and $c0<>$80 then exit else inc(source); // check valid UTF-8 content end; until false; result := true; end; function IsValidUTF8(const source: RawUTF8): Boolean; begin result := IsValidUTF8(pointer(Source),length(Source)); end; function IsValidUTF8(source: PUTF8Char; sourcelen: PtrInt): Boolean; var extra, i: integer; c: cardinal; begin result := false; inc(sourcelen,PtrInt(source)); if source<>nil then while PtrInt(PtrUInt(source))<sourcelen do begin c := byte(source^); inc(source); if c=0 then exit else if c and $80<>0 then begin extra := UTF8_EXTRABYTES[c]; if extra=0 then exit else // invalid leading byte for i := 1 to extra do if (PtrInt(PtrUInt(source))>=sourcelen) or (byte(source^) and $c0<>$80) then exit else inc(source); // check valid UTF-8 content end; end; result := true; end; function IsValidUTF8WithoutControlChars(source: PUTF8Char): Boolean; var extra, i: integer; c: cardinal; begin result := false; if source<>nil then repeat |
> > > > > > > > > > > > > > > > > > > > > > > > < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < |
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 .... 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 ..... 16941 16942 16943 16944 16945 16946 16947 16948 16949 16950 16951 16952 16953 16954 ..... 18413 18414 18415 18416 18417 18418 18419 18420 18421 18422 18423 18424 18425 18426 |
/// class-reference type (metaclass) of a TInterfacedObject TInterfacedObjectClass = class of TInterfacedObject; { ************ fast UTF-8 / Unicode / Ansi types and conversion routines **** } // some constants used for UTF-8 conversion, including surrogates const UTF16_HISURROGATE_MIN = $d800; UTF16_HISURROGATE_MAX = $dbff; UTF16_LOSURROGATE_MIN = $dc00; UTF16_LOSURROGATE_MAX = $dfff; UTF8_EXTRABYTES: array[$80..$ff] of byte = ( 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0); UTF8_EXTRA: array[0..6] of record offset, minimum: cardinal; end = ( // http://floodyberry.wordpress.com/2007/04/14/utf-8-conversion-tricks (offset: $00000000; minimum: $00010000), (offset: $00003080; minimum: $00000080), (offset: $000e2080; minimum: $00000800), (offset: $03c82080; minimum: $00010000), (offset: $fa082080; minimum: $00200000), (offset: $82082080; minimum: $04000000), (offset: $00000000; minimum: $04000000)); UTF8_EXTRA_SURROGATE = 3; UTF8_FIRSTBYTE: array[2..6] of byte = ($c0,$e0,$f0,$f8,$fc); type /// kind of adding in a TTextWriter TTextWriterKind = (twNone, twJSONEscape, twOnSameLine); /// an abstract class to handle Ansi to/from Unicode translation // - implementations of this class will handle efficiently all Code Pages // - this default implementation will use the Operating System APIs ................................................................................ 
MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean=false): PtrInt; overload; /// calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^ // - count may not match the UCS4 glyphs number, in case of UTF-16 surrogates // - faster than System.UTF8ToUnicode with dest=nil function Utf8ToUnicodeLength(source: PUTF8Char): PtrUInt; /// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #1..#31 // control characters // - supplied input is a pointer to a #0 ended text buffer function IsValidUTF8WithoutControlChars(source: PUTF8Char): Boolean; overload; /// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #0..#31 // control characters ................................................................................ { ************ some fast UTF-8 / Unicode / Ansi conversion routines } var // internal list of TSynAnsiConvert instances SynAnsiConvertList: TSynObjectList = nil; {$ifdef HASINLINE} {$ifdef USE_VTYPE_STATIC} // circumvent weird bug on BSD + ARM (Alfred) procedure VarClear(var v: variant); // defined here for proper inlining const VTYPE_STATIC = $BFE8; // bitmask to avoid remote VarClearProc call var p: PInteger; // more efficient generated asm with an explicit temp variable begin p := @v; ................................................................................ Quit: result := PtrUInt(dest)-PtrUInt(begd); // dest-begd returns bytes length NoSource: if not NoTrailingZero then dest^ := #0; // always append a WideChar(0) to the end of the buffer end; function IsValidUTF8WithoutControlChars(source: PUTF8Char): Boolean; var extra, i: integer; c: cardinal; begin result := false; if source<>nil then repeat |
Changes to SynTable.pas.
304 305 306 307 308 309 310 311 312 313 314 315 316 317 .... 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 ..... 18164 18165 18166 18167 18168 18169 18170 18171 18172 18173 18174 18175 18176 18177 |
/// number of bits to use for each interresting soundex char // - default is to use 8 bits, i.e. 4 soundex chars, which is the // standard approach // - for a more detailled soundex, use 4 bits resolution, which will // compute up to 7 soundex chars in a cardinal (that's our choice) SOUNDEX_BITS = 4; { ************ filtering and validation classes and functions ************** } /// convert an IPv4 'x.x.x.x' text into its 32-bit value // - returns TRUE if the text was a valid IPv4 text, unserialized as 32-bit aValue // - returns FALSE on parsing error, also setting aValue=0 // - '' or '127.0.0.1' will also return false ................................................................................ if result<>0 then begin dec(result,ord('A')-1); // first Soundex char is first char SoundExComputeUTF8(U,result,SOUNDEXVALUES[Lang]); end; if next<>nil then next^ := FindNextUTF8WordBegin(U); end; { ************ filtering and validation classes and functions *************** } function IPToCardinal(P: PUTF8Char; out aValue: cardinal): boolean; var i,c: cardinal; b: array[0..3] of byte; ................................................................................ EMOJI_AFTERDOTS['D'] := eLaughing; EMOJI_AFTERDOTS['o'] := eOpen_mouth; EMOJI_AFTERDOTS['O'] := eOpen_mouth; EMOJI_AFTERDOTS['p'] := eYum; EMOJI_AFTERDOTS['P'] := eYum; EMOJI_AFTERDOTS['s'] := eScream; EMOJI_AFTERDOTS['S'] := eScream; end; initialization Assert(SizeOf(TSynTableFieldType)=1); // as expected by TSynTableFieldProperties Assert(SizeOf(TSynTableFieldOptions)=1); {$ifndef NOVARIANTS} |
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > |
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 .... 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 ..... 18480 18481 18482 18483 18484 18485 18486 18487 18488 18489 18490 18491 18492 18493 18494 18495 18496 18497 18498 18499 18500 18501 18502 18503 |
/// number of bits to use for each interresting soundex char // - default is to use 8 bits, i.e. 4 soundex chars, which is the // standard approach // - for a more detailled soundex, use 4 bits resolution, which will // compute up to 7 soundex chars in a cardinal (that's our choice) SOUNDEX_BITS = 4; var DoIsValidUTF8: function(source: PUTF8Char): Boolean; DoIsValidUTF8Len: function(source: PUTF8Char; sourcelen: PtrInt): Boolean; /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will stop when the buffer contains #0 // - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM function IsValidUTF8(source: PUTF8Char): Boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will also refuse #0 characters within the buffer // - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM function IsValidUTF8(source: PUTF8Char; sourcelen: PtrInt): Boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will also refuse #0 characters within the buffer // - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM function IsValidUTF8(const source: RawUTF8): Boolean; overload; { ************ filtering and validation classes and functions ************** } /// convert an IPv4 'x.x.x.x' text into its 32-bit value // - returns TRUE if the text was a valid IPv4 text, unserialized as 32-bit aValue // - returns FALSE on parsing error, also setting aValue=0 // - '' or '127.0.0.1' will also return false ................................................................................ 
if result<>0 then begin dec(result,ord('A')-1); // first Soundex char is first char SoundExComputeUTF8(U,result,SOUNDEXVALUES[Lang]); end; if next<>nil then next^ := FindNextUTF8WordBegin(U); end; {$ifdef ASMX64AVX} // AVX2 ASM not available on Delphi yet // adapted from https://github.com/simdjson/simdjson - Apache License 2.0 function IsValidUtf8LenAvx2(source: PUtf8Char; sourcelen: PtrInt): boolean; {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC} push rbp mov r8, source mov rdx, sourcelen mov rsi, r8 mov ecx, 64 mov rax, rsi mov rdi, rdx mov rbp, rsp and rsp, 0FFFFFFFFFFFFFFE0H // align stack at 32 bytes sub rsp, 160 cmp rdx, 64 cmovnc rcx, rdx sub rcx, 64 je @small vpxor xmm3, xmm3, xmm3 vmovdqa ymm7, ymmword ptr [rip + @0f] vmovdqa ymm15, ymmword ptr [rip + @_6] xor esi, esi vmovdqa ymm14, ymmword ptr [rip + @_7] vmovdqa ymm13, ymmword ptr [rip + @_8] vmovdqa ymm5, ymm3 vmovdqa ymm2, ymm3 // main processing loop, 64 bytes per iteration align 16 @loop: vmovdqu xmm6, xmmword ptr [rax + rsi] vinserti128 ymm0, ymm6, xmmword ptr [rax + rsi + 10H], 01H vmovdqu xmm6, xmmword ptr [rax + rsi + 20H] vinserti128 ymm1, ymm6, xmmword ptr [rax + rsi + 30H], 01H add rsi, 64 vpor ymm4, ymm1, ymm0 vpmovmskb rdx, ymm4 // check set MSB of each 64 bytes test edx, edx jne @check vpor ymm2, ymm5, ymm2 vmovdqa ymm4, ymm2 cmp rcx, rsi ja @loop // process trailing 0..63 bytes @trail: sub rdi, rsi jz @ended add rsi, rax vmovdqa xmm0, xmmword ptr [rip + @20] lea rdx, qword ptr [rsp + 60H] // copy on stack with space padding sub rsi, rdx vmovdqa xmmword ptr [rdx], xmm0 vmovdqa xmmword ptr [rdx + 10H], xmm0 vmovdqa xmmword ptr [rdx + 20H], xmm0 vmovdqa xmmword ptr [rdx + 30H], xmm0 @by8: sub rdi, 8 jb @by1 mov rax, qword ptr [rsi + rdx] mov qword ptr [rdx], rax add rdx, 8 // in-order copy to preserve UTF-8 encoding jmp @by8 @by1: add rdi, 8 jz @0 @sml: mov al, byte ptr [rsi + rdx] mov byte ptr [rdx], al add rdx, 1 sub rdi, 1 jnz @sml @0: vmovdqa ymm1, ymmword 
ptr [rsp + 60H] vmovdqa ymm2, ymmword ptr [rsp + 80H] vpor ymm0, ymm1, ymm2 vpmovmskb rax, ymm0 // check any set MSB test eax, eax jne @last @ended: vpor ymm5, ymm5, ymm4 vptest ymm5, ymm5 sete al vzeroupper leave ret @small: vpxor xmm4, xmm4, xmm4 xor esi, esi vmovdqa ymm3, ymm4 vmovdqa ymm5, ymm4 jmp @trail // validate UTF-8 extra bytes from main loop align 8 @check: vpsrlw ymm9, ymm0, 4 vpsrlw ymm12, ymm1, 4 vperm2i128 ymm3, ymm3, ymm0, 21H vpalignr ymm5, ymm0, ymm3, 0FH vpalignr ymm11, ymm0, ymm3, 0EH vpsubusb ymm11, ymm11, ymmword ptr [rip + @_9] vpalignr ymm3, ymm0, ymm3, 0DH vperm2i128 ymm0, ymm0, ymm1, 21H vpsubusb ymm3, ymm3, ymmword ptr [rip + @_10] vpalignr ymm8, ymm1, ymm0, 0FH vpsrlw ymm10, ymm5, 4 vpand ymm5, ymm7, ymm5 vpsrlw ymm6, ymm8, 4 vpalignr ymm4, ymm1, ymm0, 0EH vpsubusb ymm4, ymm4, ymmword ptr [rip + @_9] vpalignr ymm0, ymm1, ymm0, 0DH vpsubusb ymm0, ymm0, ymmword ptr [rip + @_10] vpand ymm10, ymm10, ymm7 vpand ymm6, ymm6, ymm7 vpand ymm8, ymm7, ymm8 vpor ymm3, ymm3, ymm11 vpor ymm0, ymm4, ymm0 vpxor xmm11, xmm11, xmm11 vpshufb ymm10, ymm15, ymm10 vpshufb ymm5, ymm14, ymm5 vpand ymm9, ymm9, ymm7 vpshufb ymm6, ymm15, ymm6 vpshufb ymm8, ymm14, ymm8 vpand ymm12, ymm12, ymm7 vpand ymm5, ymm5, ymm10 vpcmpgtb ymm3, ymm3, ymm11 vpcmpgtb ymm0, ymm0, ymm11 vpshufb ymm9, ymm13, ymm9 vpand ymm3, ymm3, ymmword ptr [rip + @_11] vpand ymm0, ymm0, ymmword ptr [rip + @_11] vpshufb ymm12, ymm13, ymm12 vpand ymm6, ymm6, ymm8 vpand ymm9, ymm5, ymm9 vpsubusb ymm5, ymm1, ymmword ptr [rip + @_12] vpand ymm12, ymm6, ymm12 vpxor ymm9, ymm3, ymm9 vmovdqa ymm3, ymm1 vpxor ymm12, ymm0, ymm12 vpor ymm9, ymm9, ymm12 vpor ymm2, ymm9, ymm2 vmovdqa ymm4, ymm2 cmp rcx, rsi ja @loop jmp @trail // validate UTF-8 extra bytes from input ending align 8 @last: vmovdqa ymm5, ymmword ptr [rip + @0f] vperm2i128 ymm3, ymm3, ymm1, 21H vmovdqa ymm9, ymmword ptr [rip + @_7] vpsrlw ymm11, ymm1, 4 vpalignr ymm0, ymm1, ymm3, 0FH vmovdqa ymm13, ymmword ptr [rip + @_10] vmovdqa ymm14, 
ymmword ptr [rip + @_9] vpsrlw ymm6, ymm0, 4 vpand ymm0, ymm5, ymm0 vpand ymm11, ymm11, ymm5 vmovdqa ymm7, ymmword ptr [rip + @_6] vpshufb ymm10, ymm9, ymm0 vpalignr ymm0, ymm1, ymm3, 0EH vpand ymm6, ymm6, ymm5 vmovdqa ymm8, ymmword ptr [rip + @_8] vpalignr ymm3, ymm1, ymm3, 0DH vperm2i128 ymm1, ymm1, ymm2, 21H vpsubusb ymm0, ymm0, ymm14 vpsubusb ymm12, ymm3, ymm13 vpalignr ymm3, ymm2, ymm1, 0FH vpshufb ymm6, ymm7, ymm6 vpsrlw ymm15, ymm3, 4 vpand ymm3, ymm5, ymm3 vpor ymm0, ymm0, ymm12 vpshufb ymm9, ymm9, ymm3 vpsrlw ymm3, ymm2, 4 vpand ymm15, ymm15, ymm5 vpand ymm5, ymm3, ymm5 vpalignr ymm3, ymm2, ymm1, 0EH vpxor xmm12, xmm12, xmm12 vpalignr ymm1, ymm2, ymm1, 0DH vpsubusb ymm3, ymm3, ymm14 vpshufb ymm11, ymm8, ymm11 vpsubusb ymm1, ymm1, ymm13 vpcmpgtb ymm0, ymm0, ymm12 vpshufb ymm7, ymm7, ymm15 vpor ymm1, ymm3, ymm1 vpshufb ymm8, ymm8, ymm5 vpsubusb ymm5, ymm2, ymmword ptr [rip + @_12] vmovdqa ymm2, ymmword ptr [rip + @_11] vpcmpgtb ymm1, ymm1, ymm12 vpand ymm6, ymm6, ymm10 vpand ymm7, ymm7, ymm9 vpand ymm0, ymm0, ymm2 vpand ymm11, ymm6, ymm11 vpand ymm8, ymm7, ymm8 vpxor ymm0, ymm0, ymm11 vpor ymm5, ymm4, ymm5 vpand ymm1, ymm1, ymm2 vpxor ymm1, ymm1, ymm8 vpor ymm0, ymm0, ymm1 vpor ymm5, ymm0, ymm5 vptest ymm5, ymm5 sete al vzeroupper leave ret align 16 @20: dq 2020202020202020H dq 2020202020202020H align 32 @0f: dq 0F0F0F0F0F0F0F0FH dq 0F0F0F0F0F0F0F0FH dq 0F0F0F0F0F0F0F0FH dq 0F0F0F0F0F0F0F0FH @_6: dq 0202020202020202H dq 4915012180808080H dq 0202020202020202H dq 4915012180808080H @_7: dq 0CBCBCB8B8383A3E7H dq 0CBCBDBCBCBCBCBCBH dq 0CBCBCB8B8383A3E7H dq 0CBCBDBCBCBCBCBCBH @_8: dq 0101010101010101H dq 01010101BABAAEE6H dq 0101010101010101H dq 01010101BABAAEE6H @_9: dq 0DFDFDFDFDFDFDFDFH dq 0DFDFDFDFDFDFDFDFH dq 0DFDFDFDFDFDFDFDFH dq 0DFDFDFDFDFDFDFDFH @_10: dq 0EFEFEFEFEFEFEFEFH dq 0EFEFEFEFEFEFEFEFH dq 0EFEFEFEFEFEFEFEFH dq 0EFEFEFEFEFEFEFEFH @_11: dq 8080808080808080H dq 8080808080808080H dq 8080808080808080H dq 8080808080808080H @_12: db 0FFH, 0FFH, 0FFH, 
0FFH, 0FFH, 0FFH, 0FFH, 0FFH db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0EFH, 0DFH, 0BFH end; function IsValidUTF8Avx2(source: PUTF8Char): Boolean; begin result := IsValidUTF8LenAvx2(source,StrLen(source)); end; {$endif ASMX64AVX} function IsValidUTF8Pas(source: PUTF8Char): Boolean; var extra, i: integer; c: cardinal; begin result := false; if source<>nil then repeat c := byte(source^); inc(source); if c=0 then break else if c and $80<>0 then begin extra := UTF8_EXTRABYTES[c]; if extra=0 then exit else // invalid leading byte for i := 1 to extra do if byte(source^) and $c0<>$80 then exit else inc(source); // check valid UTF-8 content end; until false; result := true; end; function IsValidUTF8LenPas(source: PUTF8Char; sourcelen: PtrInt): Boolean; var extra, i: integer; c: cardinal; begin result := false; inc(sourcelen,PtrInt(source)); if source<>nil then while PtrInt(PtrUInt(source))<sourcelen do begin c := byte(source^); inc(source); if c=0 then exit else if c and $80<>0 then begin extra := UTF8_EXTRABYTES[c]; if extra=0 then exit else // invalid leading byte for i := 1 to extra do if (PtrInt(PtrUInt(source))>=sourcelen) or (byte(source^) and $c0<>$80) then exit else inc(source); // check valid UTF-8 content end; end; result := true; end; function IsValidUTF8(source: PUTF8Char): Boolean; begin result := DoIsValidUTF8(source); end; function IsValidUTF8(source: PUTF8Char; sourcelen: PtrInt): Boolean; begin result := DoIsValidUTF8Len(source,sourcelen); end; function IsValidUTF8(const source: RawUTF8): Boolean; begin result := DoIsValidUTF8Len(pointer(Source),length(Source)); end; { ************ filtering and validation classes and functions *************** } function IPToCardinal(P: PUTF8Char; out aValue: cardinal): boolean; var i,c: cardinal; b: array[0..3] of byte; ................................................................................ 
EMOJI_AFTERDOTS['D'] := eLaughing; EMOJI_AFTERDOTS['o'] := eOpen_mouth; EMOJI_AFTERDOTS['O'] := eOpen_mouth; EMOJI_AFTERDOTS['p'] := eYum; EMOJI_AFTERDOTS['P'] := eYum; EMOJI_AFTERDOTS['s'] := eScream; EMOJI_AFTERDOTS['S'] := eScream; DoIsValidUTF8 := IsValidUTF8Pas; DoIsValidUTF8Len := IsValidUTF8LenPas; {$ifdef ASMX64AVX} if CpuFeatures * [cfAVX2, cfSSE42, cfBMI1, cfBMI2, cfCLMUL] = [cfAVX2, cfSSE42, cfBMI1, cfBMI2, cfCLMUL] then begin // Haswell CPUs can use simdjson AVX2 asm for IsValidUtf8() DoIsValidUTF8 := IsValidUTF8Avx2; DoIsValidUTF8Len := IsValidUTF8LenAvx2; end; {$endif ASMX64AVX} end; initialization Assert(SizeOf(TSynTableFieldType)=1); // as expected by TSynTableFieldProperties Assert(SizeOf(TSynTableFieldOptions)=1); {$ifndef NOVARIANTS} |
Changes to Synopse.inc.
325 326 327 328 329 330 331 332 333 334 335 336 337 338 |
{$define FPC_64} {$define PUREPASCAL} // e.g. x64, AARCH64 {$ifdef CPUX64} {$define CPUINTEL} {$define FPC_CPUINTEL} {$ifndef BSD} {$define CPUX64ASM} // Delphi XE4 or Darwin asm are buggy :( {$define HASAESNI} // SynCrypto rejected by Darwin asm {$endif BSD} {$define FPC_X64} // supports AVX/AVX2/AVX512 - which Delphi doesn't {$ASMMODE INTEL} // to share asm code with Delphi {$endif CPUX64} {$ifdef CPUAARCH64} {$define CPUARM3264} |
> |
325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 |
{$define FPC_64}
{$define PUREPASCAL} // e.g. x64, AARCH64
// settings applied when the CPUX64 conditional is defined
{$ifdef CPUX64}
{$define CPUINTEL}
{$define FPC_CPUINTEL}
// the three asm-related defines below are skipped when BSD is defined; the
// remarks on them attribute this to Darwin asm issues
{$ifndef BSD}
{$define CPUX64ASM} // Delphi XE4 or Darwin asm are buggy :(
// ASMX64AVX guards the AVX2 asm of SynTable.pas (IsValidUtf8LenAvx2 and its
// IsValidUTF8Avx2 wrapper), and the code which installs them as DoIsValidUTF8 /
// DoIsValidUTF8Len when CpuFeatures reports AVX2, SSE4.2, BMI1, BMI2 and CLMUL
// NOTE(review): that asm writes rsi, rdi and ymm6..ymm15 but only saves rbp;
// those registers are callee-saved in the Microsoft x64 calling convention -
// confirm this define is never reached on Win64, or save them in the asm
{$define ASMX64AVX} // only FPC supports AVX/AVX2/AVX512
{$define HASAESNI} // SynCrypto rejected by Darwin asm
{$endif BSD}
{$define FPC_X64} // supports AVX/AVX2/AVX512 - which Delphi doesn't
{$ASMMODE INTEL} // to share asm code with Delphi
{$endif CPUX64}
{$ifdef CPUAARCH64}
{$define CPUARM3264}
|
Changes to SynopseCommit.inc.
1 |
'1.18.6307'
|
| |
1 |
'1.18.6308'
|