@@ -3,7 +3,6 @@
 use core::ascii::EscapeDefault;
 
 use crate::fmt::{self, Write};
-#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
 use crate::intrinsics::const_eval_select;
 use crate::{ascii, iter, ops};
 
@@ -327,175 +326,52 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
     }
 }
 
-/// ASCII test *without* the chunk-at-a-time optimizations.
-///
-/// This is carefully structured to produce nice small code -- it's smaller in
-/// `-O` than what the "obvious" ways produces under `-C opt-level=s`. If you
-/// touch it, be sure to run (and update if needed) the assembly test.
-#[unstable(feature = "str_internals", issue = "none")]
-#[doc(hidden)]
 #[inline]
-pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
-    while let [rest @ .., last] = bytes {
-        if !last.is_ascii() {
+const fn is_ascii_const(mut bytes: &[u8]) -> bool {
+    while let [first, rest @ ..] = bytes {
+        if !first.is_ascii() {
             break;
         }
         bytes = rest;
     }
     bytes.is_empty()
 }
 
+/// The implementation using iterators produces a tighter loop than the
+/// implementation using pattern-matching when inlined into `is_ascii_chunked`.
+/// So we have duplicate implementations of the scalar case until iterators are
+/// usable in const contexts.
+#[inline(always)]
+fn is_ascii_scalar(bytes: &[u8]) -> bool {
+    bytes.iter().all(u8::is_ascii)
+}
+
 /// Optimized ASCII test that will use usize-at-a-time operations instead of
 /// byte-at-a-time operations (when possible).
-///
-/// The algorithm we use here is pretty simple. If `s` is too short, we just
-/// check each byte and be done with it. Otherwise:
-///
-/// - Read the first word with an unaligned load.
-/// - Align the pointer, read subsequent words until end with aligned loads.
-/// - Read the last `usize` from `s` with an unaligned load.
-///
-/// If any of these loads produces something for which `contains_nonascii`
-/// (above) returns true, then we know the answer is false.
-#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
 #[inline]
 #[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
-const fn is_ascii(s: &[u8]) -> bool {
+const fn is_ascii(bytes: &[u8]) -> bool {
     // The runtime version behaves the same as the compiletime version, it's
     // just more optimized.
     const_eval_select!(
-        @capture { s: &[u8] } -> bool:
+        @capture { bytes: &[u8] } -> bool:
         if const {
-            is_ascii_simple(s)
+            is_ascii_const(bytes)
         } else {
-            /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
-            /// from `../str/mod.rs`, which does something similar for utf8 validation.
-            const fn contains_nonascii(v: usize) -> bool {
-                const NONASCII_MASK: usize = usize::repeat_u8(0x80);
-                (NONASCII_MASK & v) != 0
-            }
-
-            const USIZE_SIZE: usize = size_of::<usize>();
-
-            let len = s.len();
-            let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
-
-            // If we wouldn't gain anything from the word-at-a-time implementation, fall
-            // back to a scalar loop.
-            //
-            // We also do this for architectures where `size_of::<usize>()` isn't
-            // sufficient alignment for `usize`, because it's a weird edge case.
-            if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
-                return is_ascii_simple(s);
-            }
-
-            // We always read the first word unaligned, which means `align_offset` is
-            // 0, we'd read the same value again for the aligned read.
-            let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset };
-
-            let start = s.as_ptr();
-            // SAFETY: We verify `len < USIZE_SIZE` above.
-            let first_word = unsafe { (start as *const usize).read_unaligned() };
-
-            if contains_nonascii(first_word) {
-                return false;
-            }
-            // We checked this above, somewhat implicitly. Note that `offset_to_aligned`
-            // is either `align_offset` or `USIZE_SIZE`, both of which are explicitly checked
-            // above.
-            debug_assert!(offset_to_aligned <= len);
-
-            // SAFETY: word_ptr is the (properly aligned) usize ptr we use to read the
-            // middle chunk of the slice.
-            let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize };
-
-            // `byte_pos` is the byte index of `word_ptr`, used for loop end checks.
-            let mut byte_pos = offset_to_aligned;
-
-            // Paranoia check about alignment, since we're about to do a bunch of
-            // unaligned loads. In practice this should be impossible barring a bug in
-            // `align_offset` though.
-            // While this method is allowed to spuriously fail in CTFE, if it doesn't
-            // have alignment information it should have given a `usize::MAX` for
-            // `align_offset` earlier, sending things through the scalar path instead of
-            // this one, so this check should pass if it's reachable.
-            debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
-
-            // Read subsequent words until the last aligned word, excluding the last
-            // aligned word by itself to be done in tail check later, to ensure that
-            // tail is always one `usize` at most to extra branch `byte_pos == len`.
-            while byte_pos < len - USIZE_SIZE {
-                // Sanity check that the read is in bounds
-                debug_assert!(byte_pos + USIZE_SIZE <= len);
-                // And that our assumptions about `byte_pos` hold.
-                debug_assert!(word_ptr.cast::<u8>() == start.wrapping_add(byte_pos));
-
-                // SAFETY: We know `word_ptr` is properly aligned (because of
-                // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
-                let word = unsafe { word_ptr.read() };
-                if contains_nonascii(word) {
-                    return false;
-                }
-
-                byte_pos += USIZE_SIZE;
-                // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that
-                // after this `add`, `word_ptr` will be at most one-past-the-end.
-                word_ptr = unsafe { word_ptr.add(1) };
-            }
-
-            // Sanity check to ensure there really is only one `usize` left. This should
-            // be guaranteed by our loop condition.
-            debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);
-
-            // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
-            let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };
-
-            !contains_nonascii(last_word)
+            const CHUNK_SIZE: usize = if cfg!(all(target_arch = "x86_64", target_feature = "sse2")) {
+                4 * size_of::<usize>()
+            } else {
+                2 * size_of::<usize>()
+            };
+            is_ascii_chunked::<CHUNK_SIZE>(bytes)
        }
    )
 }
 
-/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
-/// platforms.
-///
-/// Other platforms are not likely to benefit from this code structure, so they
-/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+/// Test for ASCII-ness `CHUNK_SIZE` bytes at a time.
+/// This loop should be simple enough that LLVM can auto-vectorise it.
 #[inline]
-const fn is_ascii(bytes: &[u8]) -> bool {
-    // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
-    // can be OR'd together and then the resulting vector can be tested for
-    // non-ASCII bytes.
-    const CHUNK_SIZE: usize = 32;
-
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        let chunk_end = i + CHUNK_SIZE;
-
-        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
-        // creates a mask from the most significant bit of each byte.
-        // ASCII bytes are less than 128 (0x80), so their most significant
-        // bit is unset.
-        let mut count = 0;
-        while i < chunk_end {
-            count += bytes[i].is_ascii() as u8;
-            i += 1;
-        }
-
-        // All bytes should be <= 127 so count is equal to chunk size.
-        if count != CHUNK_SIZE as u8 {
-            return false;
-        }
-    }
-
-    // Process the remaining `bytes.len() % N` bytes.
-    let mut is_ascii = true;
-    while i < bytes.len() {
-        is_ascii &= bytes[i].is_ascii();
-        i += 1;
-    }
-
-    is_ascii
+fn is_ascii_chunked<const CHUNK_SIZE: usize>(bytes: &[u8]) -> bool {
+    let (chunks, remainder) = bytes.as_chunks::<CHUNK_SIZE>();
+    chunks.iter().all(|chunk| is_ascii_scalar(chunk)) && is_ascii_scalar(remainder)
 }
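
A note on the duplicated scalar test: iterator adapters such as `Iterator::all` cannot yet be called from a `const fn`, while slice patterns can, which is why the diff keeps both `is_ascii_const` and `is_ascii_scalar`. As a standalone illustration (a minimal sketch, not core's actual module), the `while let` form from the diff compiles and runs at compile time:

```rust
/// Const-compatible scalar test, mirroring `is_ascii_const` in the diff:
/// slice patterns and `u8::is_ascii` work in `const fn`, iterator adapters
/// currently do not.
const fn is_ascii_const(mut bytes: &[u8]) -> bool {
    // Peel one byte off the front until a non-ASCII byte or an empty slice.
    while let [first, rest @ ..] = bytes {
        if !first.is_ascii() {
            break;
        }
        bytes = rest;
    }
    // Only an all-ASCII input consumes the whole slice.
    bytes.is_empty()
}

// The whole check can run during compilation.
const _: () = assert!(is_ascii_const(b"evaluated at compile time"));

fn main() {
    assert!(!is_ascii_const(&[0x80]));
}
```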
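The new fast path is built on `slice::as_chunks::<N>()`, which splits a slice into a head of complete `[u8; N]` arrays and a remainder shorter than `N`. Because each chunk's length is known at compile time, LLVM can unroll and auto-vectorise the inner `all` loop, which is the point of the rewrite. A rough standalone sketch of the same shape (the chunk size 16 here is arbitrary, and stable `as_chunks` needs a recent toolchain):

```rust
/// Split-and-test structure of the new fast path; `N` here is illustrative.
fn is_ascii_chunked<const N: usize>(bytes: &[u8]) -> bool {
    // Head: complete `[u8; N]` chunks. Tail: at most N - 1 leftover bytes.
    let (chunks, remainder) = bytes.as_chunks::<N>();
    chunks.iter().all(|chunk| chunk.iter().all(u8::is_ascii))
        && remainder.iter().all(u8::is_ascii)
}

fn main() {
    assert!(is_ascii_chunked::<16>(b"plain ASCII, longer than one chunk"));
    assert!(!is_ascii_chunked::<16>("caf\u{e9}".as_bytes()));
    // Agrees with the naive byte-at-a-time test on any input.
    let input = b"edge\x7f case";
    assert_eq!(is_ascii_chunked::<16>(input), input.iter().all(u8::is_ascii));
}
```

The diff picks `CHUNK_SIZE` as four words on x86-64 with SSE2 (32 bytes, matching the two 128-bit registers the old fast path relied on) and two words elsewhere, so one generic function replaces both previously `cfg`-gated implementations.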
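For reference, the deleted SWAR fallback hinged on one trick: a word contains a non-ASCII byte exactly when some byte has its top bit (0x80) set, so a single AND against a repeated-0x80 mask tests `size_of::<usize>()` bytes at once. A minimal sketch of that mask test, using `usize::from_ne_bytes` in place of core's internal `usize::repeat_u8`:

```rust
use std::mem::size_of;

/// Returns `true` if any byte in `word` is non-ASCII (>= 0x80).
fn contains_nonascii(word: usize) -> bool {
    // 0x80 repeated across every byte; the AND keeps only the top bit of
    // each byte, and ASCII bytes have that bit clear.
    const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; size_of::<usize>()]);
    (word & NONASCII_MASK) != 0
}

fn main() {
    // Every byte below 0x80: nothing survives the mask.
    assert!(!contains_nonascii(usize::from_ne_bytes([b'A'; size_of::<usize>()])));
    // A single 0xC3 byte (the start of a UTF-8 'é') flips the result.
    let mut bytes = [b'A'; size_of::<usize>()];
    bytes[0] = 0xC3;
    assert!(contains_nonascii(usize::from_ne_bytes(bytes)));
}
```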