@@ -327,175 +327,52 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
     }
 }
 
-/// ASCII test *without* the chunk-at-a-time optimizations.
-///
-/// This is carefully structured to produce nice small code -- it's smaller in
-/// `-O` than what the "obvious" ways produces under `-C opt-level=s`. If you
-/// touch it, be sure to run (and update if needed) the assembly test.
-#[unstable(feature = "str_internals", issue = "none")]
-#[doc(hidden)]
 #[inline]
-pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
-    while let [rest @ .., last] = bytes {
-        if !last.is_ascii() {
+const fn is_ascii_const(mut bytes: &[u8]) -> bool {
+    while let [first, rest @ ..] = bytes {
+        if !first.is_ascii() {
             break;
         }
         bytes = rest;
     }
     bytes.is_empty()
 }
 
+/// The implementation using iterators produces a tighter loop than the
+/// implementation using pattern-matching when inlined into `is_ascii_chunked`,
+/// so we have duplicate implementations of the scalar case until iterators are
+/// usable in const contexts.
+#[inline(always)]
+fn is_ascii_scalar(bytes: &[u8]) -> bool {
+    bytes.iter().all(u8::is_ascii)
+}
+
 /// Optimized ASCII test that will use usize-at-a-time operations instead of
 /// byte-at-a-time operations (when possible).
-///
-/// The algorithm we use here is pretty simple. If `s` is too short, we just
-/// check each byte and be done with it. Otherwise:
-///
-/// - Read the first word with an unaligned load.
-/// - Align the pointer, read subsequent words until end with aligned loads.
-/// - Read the last `usize` from `s` with an unaligned load.
-///
-/// If any of these loads produces something for which `contains_nonascii`
-/// (above) returns true, then we know the answer is false.
-#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
 #[inline]
 #[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
-const fn is_ascii(s: &[u8]) -> bool {
+const fn is_ascii(bytes: &[u8]) -> bool {
     // The runtime version behaves the same as the compiletime version, it's
     // just more optimized.
     const_eval_select!(
-        @capture { s: &[u8] } -> bool:
+        @capture { bytes: &[u8] } -> bool:
         if const {
-            is_ascii_simple(s)
+            is_ascii_const(bytes)
         } else {
-            /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
-            /// from `../str/mod.rs`, which does something similar for utf8 validation.
-            const fn contains_nonascii(v: usize) -> bool {
-                const NONASCII_MASK: usize = usize::repeat_u8(0x80);
-                (NONASCII_MASK & v) != 0
-            }
-
-            const USIZE_SIZE: usize = size_of::<usize>();
-
-            let len = s.len();
-            let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
-
-            // If we wouldn't gain anything from the word-at-a-time implementation, fall
-            // back to a scalar loop.
-            //
-            // We also do this for architectures where `size_of::<usize>()` isn't
-            // sufficient alignment for `usize`, because it's a weird edge case.
-            if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
-                return is_ascii_simple(s);
-            }
-
-            // We always read the first word unaligned, which means `align_offset` is
-            // 0, we'd read the same value again for the aligned read.
-            let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset };
-
-            let start = s.as_ptr();
-            // SAFETY: We verify `len < USIZE_SIZE` above.
-            let first_word = unsafe { (start as *const usize).read_unaligned() };
-
-            if contains_nonascii(first_word) {
-                return false;
-            }
-
-            // We checked this above, somewhat implicitly. Note that `offset_to_aligned`
-            // is either `align_offset` or `USIZE_SIZE`, both of which are explicitly checked
-            // above.
-            debug_assert!(offset_to_aligned <= len);
-
-            // SAFETY: word_ptr is the (properly aligned) usize ptr we use to read the
-            // middle chunk of the slice.
-            let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize };
-
-            // `byte_pos` is the byte index of `word_ptr`, used for loop end checks.
-            let mut byte_pos = offset_to_aligned;
-
-            // Paranoia check about alignment, since we're about to do a bunch of
-            // unaligned loads. In practice this should be impossible barring a bug in
-            // `align_offset` though.
-            // While this method is allowed to spuriously fail in CTFE, if it doesn't
-            // have alignment information it should have given a `usize::MAX` for
-            // `align_offset` earlier, sending things through the scalar path instead of
-            // this one, so this check should pass if it's reachable.
-            debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
-
-            // Read subsequent words until the last aligned word, excluding the last
-            // aligned word by itself to be done in tail check later, to ensure that
-            // tail is always one `usize` at most to extra branch `byte_pos == len`.
-            while byte_pos < len - USIZE_SIZE {
-                // Sanity check that the read is in bounds
-                debug_assert!(byte_pos + USIZE_SIZE <= len);
-                // And that our assumptions about `byte_pos` hold.
-                debug_assert!(word_ptr.cast::<u8>() == start.wrapping_add(byte_pos));
-
-                // SAFETY: We know `word_ptr` is properly aligned (because of
-                // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
-                let word = unsafe { word_ptr.read() };
-                if contains_nonascii(word) {
-                    return false;
-                }
-
-                byte_pos += USIZE_SIZE;
-                // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that
-                // after this `add`, `word_ptr` will be at most one-past-the-end.
-                word_ptr = unsafe { word_ptr.add(1) };
-            }
-
-            // Sanity check to ensure there really is only one `usize` left. This should
-            // be guaranteed by our loop condition.
-            debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);
-
-            // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
-            let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };
-
-            !contains_nonascii(last_word)
+            const CHUNK_SIZE: usize = if cfg!(all(target_arch = "x86_64", target_feature = "sse2")) {
+                4 * size_of::<usize>()
+            } else {
+                2 * size_of::<usize>()
+            };
+            is_ascii_chunked::<CHUNK_SIZE>(bytes)
         }
     )
 }
 
-/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
-/// platforms.
-///
-/// Other platforms are not likely to benefit from this code structure, so they
-/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+/// Test for ASCII-ness `CHUNK_SIZE` bytes at a time.
+/// This loop should be simple enough that LLVM can auto-vectorise it.
 #[inline]
-const fn is_ascii(bytes: &[u8]) -> bool {
-    // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
-    // can be OR'd together and then the resulting vector can be tested for
-    // non-ASCII bytes.
-    const CHUNK_SIZE: usize = 32;
-
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        let chunk_end = i + CHUNK_SIZE;
-
-        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
-        // creates a mask from the most significant bit of each byte.
-        // ASCII bytes are less than 128 (0x80), so their most significant
-        // bit is unset.
-        let mut count = 0;
-        while i < chunk_end {
-            count += bytes[i].is_ascii() as u8;
-            i += 1;
-        }
-
-        // All bytes should be <= 127 so count is equal to chunk size.
-        if count != CHUNK_SIZE as u8 {
-            return false;
-        }
-    }
-
-    // Process the remaining `bytes.len() % N` bytes.
-    let mut is_ascii = true;
-    while i < bytes.len() {
-        is_ascii &= bytes[i].is_ascii();
-        i += 1;
-    }
-
-    is_ascii
+fn is_ascii_chunked<const CHUNK_SIZE: usize>(bytes: &[u8]) -> bool {
+    let (chunks, remainder) = bytes.as_chunks::<CHUNK_SIZE>();
+    chunks.iter().all(|chunk| is_ascii_scalar(chunk)) && is_ascii_scalar(remainder)
 }
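
For context on what the removed fallback was doing: it is a classic SWAR ("SIMD within a register") test, reading the slice one `usize` at a time and checking every byte's high bit at once, as the deleted doc comment describes. Below is a minimal safe sketch of the same idea on stable Rust. The names are illustrative: `contains_nonascii` mirrors the deleted helper (with the mask spelled out, since `usize::repeat_u8` is core-internal), and `is_ascii_swar` is a hypothetical stand-in that uses `chunks_exact` where the deleted code used an unaligned head load, aligned middle loads, and an unaligned tail load.

```rust
use std::mem::size_of;

/// True if any byte of `word` has its high bit (0x80) set, i.e. the word
/// contains at least one non-ASCII byte. Mirrors the deleted `contains_nonascii`.
fn contains_nonascii(word: usize) -> bool {
    // 0x80 repeated into every byte lane of a usize; endianness is irrelevant
    // because the mask is identical in every byte.
    const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; size_of::<usize>()]);
    word & NONASCII_MASK != 0
}

/// Hypothetical safe stand-in for the deleted word-at-a-time body. The real
/// code avoided the per-chunk copy via unaligned head/tail reads instead.
fn is_ascii_swar(bytes: &[u8]) -> bool {
    let mut chunks = bytes.chunks_exact(size_of::<usize>());
    for chunk in &mut chunks {
        let word = usize::from_ne_bytes(chunk.try_into().unwrap());
        if contains_nonascii(word) {
            return false;
        }
    }
    // Fewer than `size_of::<usize>()` bytes remain; check them one by one.
    chunks.remainder().iter().all(u8::is_ascii)
}

fn main() {
    assert!(is_ascii_swar(b"an all-ASCII input spanning several words"));
    assert!(!is_ascii_swar("caf\u{e9}".as_bytes()));
}
```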
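And the replacement, as a standalone sketch you can feed to the Compiler Explorer to watch the inner loop vectorise: on x86-64 with SSE2 the 32-byte chunks should become vector compares plus a mask test, which is what the deleted `pmovmskb` comment was chasing by hand. This assumes a toolchain where `slice::as_chunks` is stable; on older ones it needs nightly's `slice_as_chunks` feature.

```rust
/// Scalar check, same as the diff's `is_ascii_scalar`.
#[inline(always)]
fn is_ascii_scalar(bytes: &[u8]) -> bool {
    bytes.iter().all(u8::is_ascii)
}

/// Same shape as the new `is_ascii_chunked`: `as_chunks` splits the slice
/// into `&[[u8; N]]` plus a remainder, so the per-chunk loop has a fixed
/// trip count that LLVM can unroll and auto-vectorise.
fn is_ascii_chunked<const N: usize>(bytes: &[u8]) -> bool {
    let (chunks, remainder) = bytes.as_chunks::<N>();
    chunks.iter().all(|chunk| is_ascii_scalar(chunk)) && is_ascii_scalar(remainder)
}

fn main() {
    // 32 = 4 * size_of::<usize>() on x86-64, matching the diff's CHUNK_SIZE there.
    assert!(is_ascii_chunked::<32>(b"a comfortably-longer-than-thirty-two-byte line"));
    assert!(!is_ascii_chunked::<32>("padding so the non-ASCII byte lands past one chunk: \u{e9}".as_bytes()));
    // Inputs shorter than one chunk exercise only the remainder path.
    assert!(is_ascii_chunked::<32>(b"short"));
}
```

The appeal over the SWAR version is that the same source adapts to whatever vector width the target offers instead of committing to `usize`-sized loads, which is presumably why one generic function could replace both cfg'd implementations.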