From 3b50253b57b130fdcef167fc0c03a9a19210fae2 Mon Sep 17 00:00:00 2001
From: Paul Murphy
Date: Mon, 4 Aug 2025 13:00:06 -0500
Subject: [PATCH 1/2] compiler-builtins: plumb LSE support for aarch64 on linux

Add dynamic support for aarch64 LSE atomic ops on linux targets when
optimized-compiler-builtins is not enabled. A hook, __rust_enable_lse, is
provided for the runtime to enable them once it has verified they are
available; a future patch will call this hook at startup.

The resulting asm should exactly match that of LLVM's compiler-rt
builtins, though the names of the support function and global differ.
---
 .../compiler-builtins/src/aarch64_linux.rs | 76 ++++++++++++++++---
 1 file changed, 66 insertions(+), 10 deletions(-)

diff --git a/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs b/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs
index 38fcab152aed2..01d7fb4732918 100644
--- a/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs
+++ b/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs
@@ -6,9 +6,6 @@
 //! which is supported on the current CPU.
 //! See for more discussion.
 //!
-//! Currently we only support LL/SC, because LSE requires `getauxval` from libc in order to do runtime detection.
-//! Use the `compiler-rt` intrinsics if you want LSE support.
-//!
 //! Ported from `aarch64/lse.S` in LLVM's compiler-rt.
 //!
 //! Generate functions for each of the following symbols:
@@ -24,7 +21,18 @@
 //! We do something similar, but with macro arguments.
 #![cfg_attr(feature = "c", allow(unused_macros))] // avoid putting the macros into a submodule
 
-// We don't do runtime dispatch so we don't have to worry about the `__aarch64_have_lse_atomics` global ctor.
+use core::sync::atomic::{AtomicU8, Ordering};
+
+/// Non-zero if the host supports LSE atomics.
+static HAVE_LSE_ATOMICS: AtomicU8 = AtomicU8::new(0);
+
+intrinsics! {
+    /// Call to enable LSE in outline atomic operations. The caller must verify
+    /// LSE operations are supported.
+    pub extern "C" fn __rust_enable_lse() {
+        HAVE_LSE_ATOMICS.store(1, Ordering::Relaxed);
+    }
+}
 
 /// Translate a byte size to a Rust type.
 #[rustfmt::skip]
@@ -45,6 +53,7 @@ macro_rules! reg {
     (2, $num:literal) => { concat!("w", $num) };
     (4, $num:literal) => { concat!("w", $num) };
     (8, $num:literal) => { concat!("x", $num) };
+    (16, $num:literal) => { concat!("x", $num) };
 }
 
 /// Given an atomic ordering, translate it to the acquire suffix for the lxdr aarch64 ASM instruction.
@@ -126,6 +135,41 @@ macro_rules! stxp {
     };
 }
 
+// If supported, perform the requested LSE op and return, or fall through.
+macro_rules! try_lse_op {
+    ($op: literal, $ordering:ident, $bytes:tt, $($reg:literal,)* [ $mem:ident ] ) => {
+        concat!(
+            ".arch_extension lse; ",
+            "adrp x16, {have_lse}; ",
+            "ldrb w16, [x16, :lo12:{have_lse}]; ",
+            "cbz w16, 8f; ",
+            // LSE_OP s(reg),* [$mem]
+            concat!(lse!($op, $ordering, $bytes), $( " ", reg!($bytes, $reg), ", " ,)* "[", stringify!($mem), "]; ",),
+            "ret; ",
+            "8:"
+        )
+    };
+}
+
+// Translate a memory ordering to the LSE suffix.
+#[rustfmt::skip]
+macro_rules! lse_mem_sfx {
+    (Relaxed) => { "" };
+    (Acquire) => { "a" };
+    (Release) => { "l" };
+    (AcqRel) => { "al" };
+}
+
+// Generate the aarch64 LSE operation for the given memory ordering and width.
+macro_rules! lse {
+    ($op:literal, $order:ident, 16) => {
+        concat!($op, "p", lse_mem_sfx!($order))
+    };
+    ($op:literal, $order:ident, $bytes:tt) => {
+        concat!($op, lse_mem_sfx!($order), size!($bytes))
+    };
+}
+
 /// See .
 macro_rules! compare_and_swap {
     ($ordering:ident, $bytes:tt, $name:ident) => {
@@ -137,7 +181,9 @@ macro_rules! compare_and_swap {
         ) -> int_ty!($bytes) {
             // We can't use `AtomicI8::compare_and_swap`; we *are* compare_and_swap.
             core::arch::naked_asm! {
-                // UXT s(tmp0), s(0)
+                // CAS s(0), s(1), [x2]; if LSE supported.
+                try_lse_op!("cas", $ordering, $bytes, 0, 1, [x2]),
+                // UXT s(tmp0), s(0)
                 concat!(uxt!($bytes), " ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                 "0:",
                 // LDXR s(0), [x2]
@@ -150,6 +196,7 @@ macro_rules! compare_and_swap {
                 "cbnz w17, 0b",
                 "1:",
                 "ret",
+                have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
             }
         }
     }
@@ -166,6 +213,8 @@ macro_rules! compare_and_swap_i128 {
             expected: i128, desired: i128, ptr: *mut i128
         ) -> i128 {
             core::arch::naked_asm! {
+                // CASP x0, x1, x2, x3, [x4]; if LSE supported.
+                try_lse_op!("cas", $ordering, 16, 0, 1, 2, 3, [x4]),
                 "mov x16, x0",
                 "mov x17, x1",
                 "0:",
@@ -179,6 +228,7 @@ macro_rules! compare_and_swap_i128 {
                 "cbnz w15, 0b",
                 "1:",
                 "ret",
+                have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
             }
         }
     }
@@ -195,6 +245,8 @@ macro_rules! swap {
             left: int_ty!($bytes), right_ptr: *mut int_ty!($bytes)
         ) -> int_ty!($bytes) {
             core::arch::naked_asm! {
+                // SWP s(0), s(0), [x1]; if LSE supported.
+                try_lse_op!("swp", $ordering, $bytes, 0, 0, [x1]),
                 // mov s(tmp0), s(0)
                 concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                 "0:",
@@ -204,6 +256,7 @@ macro_rules! swap {
                 concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
                 "cbnz w17, 0b",
                 "ret",
+                have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
             }
         }
     }
@@ -212,7 +265,7 @@ macro_rules! swap {
 
 /// See (e.g.) .
 macro_rules! fetch_op {
-    ($ordering:ident, $bytes:tt, $name:ident, $op:literal) => {
+    ($ordering:ident, $bytes:tt, $name:ident, $op:literal, $lse_op:literal) => {
         intrinsics! {
             #[maybe_use_optimized_c_shim]
             #[unsafe(naked)]
@@ -220,6 +273,8 @@ macro_rules! fetch_op {
                 val: int_ty!($bytes), ptr: *mut int_ty!($bytes)
             ) -> int_ty!($bytes) {
                 core::arch::naked_asm! {
+                    // LSEOP s(0), s(0), [x1]; if LSE supported.
+                    try_lse_op!($lse_op, $ordering, $bytes, 0, 0, [x1]),
                     // mov s(tmp0), s(0)
                     concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                     "0:",
@@ -231,6 +286,7 @@ macro_rules! fetch_op {
                     concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
                     "cbnz w15, 0b",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -240,25 +296,25 @@ macro_rules! fetch_op {
 // We need a single macro to pass to `foreach_ldadd`.
 macro_rules! add {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "add" }
+        fetch_op! { $ordering, $bytes, $name, "add", "ldadd" }
     };
 }
 
 macro_rules! and {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "bic" }
+        fetch_op! { $ordering, $bytes, $name, "bic", "ldclr" }
     };
 }
 
 macro_rules! xor {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "eor" }
+        fetch_op! { $ordering, $bytes, $name, "eor", "ldeor" }
     };
 }
 
 macro_rules! or {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "orr" }
+        fetch_op! { $ordering, $bytes, $name, "orr", "ldset" }
     };
 }
 

From 6936bb975a50bf3c575d5642018ce19c58dcacf1 Mon Sep 17 00:00:00 2001
From: Paul Murphy
Date: Thu, 31 Jul 2025 17:25:00 -0400
Subject: [PATCH 2/2] Dynamically enable LSE for aarch64 rust-provided intrinsics

Create a private module to hold the bootstrap code needed to enable LSE
at startup on aarch64-*-linux-* targets when Rust implements the
intrinsics.

This is a bit more heavyweight than compiler-rt's LSE initialization, but
it has the benefit of initializing the aarch64 CPU feature detection for
other uses. The Rust initialization code itself performs some atomic
operations, which is fine: mixing LSE and non-LSE operations should work
while the flag update propagates.
---
 library/std/src/sys/configure_builtins.rs | 22 ++++++++++++++++++++++
 library/std/src/sys/mod.rs                |  5 +++++
 2 files changed, 27 insertions(+)
 create mode 100644 library/std/src/sys/configure_builtins.rs

diff --git a/library/std/src/sys/configure_builtins.rs b/library/std/src/sys/configure_builtins.rs
new file mode 100644
index 0000000000000..9d776b778dcbe
--- /dev/null
+++ b/library/std/src/sys/configure_builtins.rs
@@ -0,0 +1,22 @@
+/// Hook into .init_array to enable LSE atomic operations at startup, if
+/// supported.
+#[cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "compiler-builtins-c")))]
+#[used]
+#[unsafe(link_section = ".init_array.90")]
+static RUST_LSE_INIT: extern "C" fn() = {
+    extern "C" fn init_lse() {
+        use crate::arch;
+
+        // This is provided by compiler-builtins::aarch64_linux.
+        unsafe extern "C" {
+            fn __rust_enable_lse();
+        }
+
+        if arch::is_aarch64_feature_detected!("lse") {
+            unsafe {
+                __rust_enable_lse();
+            }
+        }
+    }
+    init_lse
+};
diff --git a/library/std/src/sys/mod.rs b/library/std/src/sys/mod.rs
index f9a02b522e5e1..8ec0a0e33023f 100644
--- a/library/std/src/sys/mod.rs
+++ b/library/std/src/sys/mod.rs
@@ -1,5 +1,10 @@
 #![allow(unsafe_op_in_unsafe_fn)]
 
+/// The configure_builtins module provides runtime support for
+/// compiler-builtins features which require dynamic initialization to work
+/// as expected, e.g. aarch64 outline-atomics.
+mod configure_builtins;
+
 /// The PAL (platform abstraction layer) contains platform-specific abstractions
 /// for implementing the features in the other submodules, e.g. UNIX file
 /// descriptors.
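
Note on the mechanism (illustration only, not part of either patch): together the two patches form a small runtime-dispatch scheme. compiler-builtins keeps a one-byte flag that every outlined atomic helper tests before choosing between a single LSE instruction and the LDXR/STXR fallback loop, and std flips that flag from an .init_array hook once CPU feature detection confirms LSE support. The sketch below models that pattern in portable Rust so it can be compiled and run on any host; only HAVE_LSE_ATOMICS and __rust_enable_lse mirror names from the patches, while outlined_atomic_helper and its string return values are hypothetical stand-ins for the real assembly helpers such as __aarch64_cas4_relax.

    use core::sync::atomic::{AtomicU8, Ordering};

    // Non-zero once the runtime has confirmed LSE support (mirrors the patch).
    static HAVE_LSE_ATOMICS: AtomicU8 = AtomicU8::new(0);

    // Hook the runtime calls at startup after feature detection succeeds
    // (mirrors __rust_enable_lse in the patch).
    pub extern "C" fn __rust_enable_lse() {
        HAVE_LSE_ATOMICS.store(1, Ordering::Relaxed);
    }

    // Hypothetical stand-in for an outlined atomic helper: test the flag,
    // then take either the LSE path or the LL/SC fallback.
    fn outlined_atomic_helper() -> &'static str {
        if HAVE_LSE_ATOMICS.load(Ordering::Relaxed) != 0 {
            "lse" // real code: a single CAS/SWP/LD<op> instruction, then ret
        } else {
            "ll/sc" // real code: an LDXR/STXR retry loop
        }
    }

    fn main() {
        // Before the init hook runs, the fallback path is taken.
        assert_eq!(outlined_atomic_helper(), "ll/sc");
        // What std's .init_array hook does once feature detection reports LSE:
        __rust_enable_lse();
        assert_eq!(outlined_atomic_helper(), "lse");
    }

This also illustrates why mixing is harmless while the flag update propagates: a helper that still reads 0 simply takes the LL/SC path, which stays correct alongside LSE instructions issued by other threads.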