diff --git a/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs b/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs
index 38fcab152aed2..01d7fb4732918 100644
--- a/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs
+++ b/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs
@@ -6,9 +6,6 @@
 //! which is supported on the current CPU.
 //! See for more discussion.
 //!
-//! Currently we only support LL/SC, because LSE requires `getauxval` from libc in order to do runtime detection.
-//! Use the `compiler-rt` intrinsics if you want LSE support.
-//!
 //! Ported from `aarch64/lse.S` in LLVM's compiler-rt.
 //!
 //! Generate functions for each of the following symbols:
@@ -24,7 +21,18 @@
 //! We do something similar, but with macro arguments.
 
 #![cfg_attr(feature = "c", allow(unused_macros))] // avoid putting the macros into a submodule
-// We don't do runtime dispatch so we don't have to worry about the `__aarch64_have_lse_atomics` global ctor.
+use core::sync::atomic::{AtomicU8, Ordering};
+
+/// Non-zero if the host supports LSE atomics.
+static HAVE_LSE_ATOMICS: AtomicU8 = AtomicU8::new(0);
+
+intrinsics! {
+    /// Call to enable LSE in outline atomic operations. The caller must verify
+    /// LSE operations are supported.
+    pub extern "C" fn __rust_enable_lse() {
+        HAVE_LSE_ATOMICS.store(1, Ordering::Relaxed);
+    }
+}
 
 /// Translate a byte size to a Rust type.
 #[rustfmt::skip]
@@ -45,6 +53,7 @@ macro_rules! reg {
     (2, $num:literal) => { concat!("w", $num) };
     (4, $num:literal) => { concat!("w", $num) };
     (8, $num:literal) => { concat!("x", $num) };
+    (16, $num:literal) => { concat!("x", $num) };
 }
 
 /// Given an atomic ordering, translate it to the acquire suffix for the lxdr aarch64 ASM instruction.
@@ -126,6 +135,41 @@ macro_rules! stxp {
     };
 }
 
+// If supported, perform the requested LSE op and return, or fall through.
+macro_rules! try_lse_op {
+    ($op:literal, $ordering:ident, $bytes:tt, $($reg:literal,)* [ $mem:ident ]) => {
+        concat!(
+            ".arch_extension lse; ",
+            "adrp x16, {have_lse}; ",
+            "ldrb w16, [x16, :lo12:{have_lse}]; ",
+            "cbz w16, 8f; ",
+            // LSE_OP s(reg),* [$mem]
+            concat!(lse!($op, $ordering, $bytes), $(" ", reg!($bytes, $reg), ", ",)* "[", stringify!($mem), "]; "),
+            "ret; ",
+            "8:"
+        )
+    };
+}
+
+// Translate a memory ordering to the LSE suffix.
+#[rustfmt::skip]
+macro_rules! lse_mem_sfx {
+    (Relaxed) => { "" };
+    (Acquire) => { "a" };
+    (Release) => { "l" };
+    (AcqRel) => { "al" };
+}
+
+// Generate the aarch64 LSE operation for a memory ordering and width.
+macro_rules! lse {
+    ($op:literal, $order:ident, 16) => {
+        concat!($op, "p", lse_mem_sfx!($order))
+    };
+    ($op:literal, $order:ident, $bytes:tt) => {
+        concat!($op, lse_mem_sfx!($order), size!($bytes))
+    };
+}
+
 /// See .
 macro_rules! compare_and_swap {
     ($ordering:ident, $bytes:tt, $name:ident) => {
@@ -137,7 +181,9 @@ macro_rules! compare_and_swap {
             ) -> int_ty!($bytes) {
                 // We can't use `AtomicI8::compare_and_swap`; we *are* compare_and_swap.
                 core::arch::naked_asm! {
-                    // UXT s(tmp0), s(0)
+                    // CAS s(0), s(1), [x2]; if LSE supported.
+                    try_lse_op!("cas", $ordering, $bytes, 0, 1, [x2]),
+                    // UXT s(tmp0), s(0)
                     concat!(uxt!($bytes), " ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                     "0:",
                     // LDXR s(0), [x2]
                     concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x2]"),
@@ -150,6 +196,7 @@ macro_rules! compare_and_swap {
                     "cbnz w17, 0b",
                     "1:",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -166,6 +213,8 @@ macro_rules! compare_and_swap_i128 {
                 expected: i128, desired: i128, ptr: *mut i128
             ) -> i128 {
                 core::arch::naked_asm! {
+                    // CASP x0, x1, x2, x3, [x4]; if LSE supported.
+                    try_lse_op!("cas", $ordering, 16, 0, 1, 2, 3, [x4]),
                     "mov x16, x0",
                     "mov x17, x1",
                     "0:",
@@ -179,6 +228,7 @@ macro_rules! compare_and_swap_i128 {
                     "cbnz w15, 0b",
                     "1:",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -195,6 +245,8 @@ macro_rules! swap {
                 left: int_ty!($bytes), right_ptr: *mut int_ty!($bytes)
             ) -> int_ty!($bytes) {
                 core::arch::naked_asm! {
+                    // SWP s(0), s(0), [x1]; if LSE supported.
+                    try_lse_op!("swp", $ordering, $bytes, 0, 0, [x1]),
                     // mov s(tmp0), s(0)
                     concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                     "0:",
@@ -204,6 +256,7 @@ macro_rules! swap {
                     concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
                     "cbnz w17, 0b",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -212,7 +265,7 @@ macro_rules! swap {
 
 /// See (e.g.) .
 macro_rules! fetch_op {
-    ($ordering:ident, $bytes:tt, $name:ident, $op:literal) => {
+    ($ordering:ident, $bytes:tt, $name:ident, $op:literal, $lse_op:literal) => {
         intrinsics! {
             #[maybe_use_optimized_c_shim]
             #[unsafe(naked)]
@@ -220,6 +273,8 @@ macro_rules! fetch_op {
                 val: int_ty!($bytes), ptr: *mut int_ty!($bytes)
             ) -> int_ty!($bytes) {
                 core::arch::naked_asm! {
+                    // LSEOP s(0), s(0), [x1]; if LSE supported.
+                    try_lse_op!($lse_op, $ordering, $bytes, 0, 0, [x1]),
                     // mov s(tmp0), s(0)
                     concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                     "0:",
@@ -231,6 +286,7 @@ macro_rules! fetch_op {
                     concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
                     "cbnz w15, 0b",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -240,25 +296,25 @@ macro_rules! fetch_op {
 // We need a single macro to pass to `foreach_ldadd`.
 macro_rules! add {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "add" }
+        fetch_op! { $ordering, $bytes, $name, "add", "ldadd" }
     };
 }
 
 macro_rules! and {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "bic" }
+        fetch_op! { $ordering, $bytes, $name, "bic", "ldclr" }
     };
 }
 
 macro_rules! xor {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "eor" }
+        fetch_op! { $ordering, $bytes, $name, "eor", "ldeor" }
     };
 }
 
 macro_rules! or {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "orr" }
+        fetch_op! { $ordering, $bytes, $name, "orr", "ldset" }
     };
 }
diff --git a/library/std/src/sys/configure_builtins.rs b/library/std/src/sys/configure_builtins.rs
new file mode 100644
index 0000000000000..9d776b778dcbe
--- /dev/null
+++ b/library/std/src/sys/configure_builtins.rs
@@ -0,0 +1,22 @@
+/// Hook into .init_array to enable LSE atomic operations at startup, if
+/// supported.
+#[cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "compiler-builtins-c")))]
+#[used]
+#[unsafe(link_section = ".init_array.90")]
+static RUST_LSE_INIT: extern "C" fn() = {
+    extern "C" fn init_lse() {
+        use crate::arch;
+
+        // This is provided by compiler-builtins::aarch64_linux.
+        unsafe extern "C" {
+            fn __rust_enable_lse();
+        }
+
+        if arch::is_aarch64_feature_detected!("lse") {
+            unsafe {
+                __rust_enable_lse();
+            }
+        }
+    }
+    init_lse
+};
diff --git a/library/std/src/sys/mod.rs b/library/std/src/sys/mod.rs
index f9a02b522e5e1..8ec0a0e33023f 100644
--- a/library/std/src/sys/mod.rs
+++ b/library/std/src/sys/mod.rs
@@ -1,5 +1,10 @@
 #![allow(unsafe_op_in_unsafe_fn)]
 
+/// The `configure_builtins` module provides runtime support for
+/// compiler-builtins features which require dynamic initialization to work
+/// as expected, e.g. aarch64 outline-atomics.
+mod configure_builtins;
+
 /// The PAL (platform abstraction layer) contains platform-specific abstractions
 /// for implementing the features in the other submodules, e.g. UNIX file
 /// descriptors.
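To make the dispatch concrete: below is a hand-expanded sketch of what `compare_and_swap!` produces for the 4-byte acquire CAS once `try_lse_op!("cas", Acquire, 4, 0, 1, [x2])` is spliced in front of the LL/SC loop. The function and static names here are illustrative stand-ins, not the symbols the patch emits; the instruction sequence follows the macros above (aarch64-linux only). When the flag is set, the function executes a single `casa` and returns without ever touching the exclusive monitor.

```rust
use core::sync::atomic::AtomicU8;

// Illustrative stand-in for compiler-builtins' HAVE_LSE_ATOMICS flag.
static HAVE_LSE: AtomicU8 = AtomicU8::new(0);

// Sketch of the expanded __aarch64_cas4_acq (symbol name illustrative).
#[unsafe(naked)]
pub unsafe extern "C" fn cas4_acq_sketch(expected: u32, desired: u32, ptr: *mut u32) -> u32 {
    core::arch::naked_asm! {
        // try_lse_op! prologue: test the flag, take the LSE path if set.
        ".arch_extension lse",
        "adrp x16, {have_lse}",              // page address of the flag
        "ldrb w16, [x16, :lo12:{have_lse}]", // load the flag byte
        "cbz w16, 8f",                       // flag clear -> LL/SC fallback
        "casa w0, w1, [x2]",                 // lse!("cas", Acquire, 4) = "casa"
        "ret",
        "8:",
        // Original LL/SC loop, unchanged from before the patch.
        "mov w16, w0",                       // uxt! is a plain mov for 4 bytes
        "0:",
        "ldaxr w0, [x2]",                    // load-exclusive, acquire
        "cmp w0, w16",
        "bne 1f",
        "stxr w17, w1, [x2]",                // store-exclusive
        "cbnz w17, 0b",                      // retry if the monitor was lost
        "1:",
        "ret",
        have_lse = sym HAVE_LSE,
    }
}
```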
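The mnemonic scheme that `lse!` and `lse_mem_sfx!` implement composes three pieces: the base op, the ordering suffix, and a width suffix, with 16-byte operations taking a `p` (pair) infix instead of a width suffix. A standalone copy of the suffix macro (duplicated here for illustration, runnable on any host) sanity-checks the composition:

```rust
// Standalone copy of lse_mem_sfx! from the patch, for illustration.
macro_rules! lse_mem_sfx {
    (Relaxed) => { "" };
    (Acquire) => { "a" };
    (Release) => { "l" };
    (AcqRel) => { "al" };
}

fn main() {
    // base op + ordering suffix + width suffix ("b", "h", or "" per size!):
    assert_eq!(concat!("cas", lse_mem_sfx!(AcqRel), "b"), "casalb"); // 1-byte CAS, acq_rel
    assert_eq!(concat!("swp", lse_mem_sfx!(Acquire), "h"), "swpah"); // 2-byte swap, acquire
    assert_eq!(concat!("ldadd", lse_mem_sfx!(Release)), "ldaddl"); // 4/8-byte fetch-add, release
    // 16-byte ops splice in "p" before the ordering suffix instead:
    assert_eq!(concat!("cas", "p", lse_mem_sfx!(AcqRel)), "caspal"); // CASP, acq_rel
    println!("all LSE mnemonics check out");
}
```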
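std wires the hook up through the new `.init_array.90` entry: numbered `.init_array.N` sections run before the unnumbered default-priority constructors, so the flag is already set by the time user constructors can touch atomics. A runtime that does not go through std's hook could, in principle, drive the intrinsic itself. The sketch below is an assumption-laden illustration: it declares its own `getauxval` and `__rust_enable_lse` bindings (whether the latter is reachable depends on how compiler-builtins is linked), and takes the `AT_HWCAP`/`HWCAP_ATOMICS` values from the Linux uAPI headers.

```rust
use core::ffi::c_ulong;

unsafe extern "C" {
    fn getauxval(type_: c_ulong) -> c_ulong; // from libc
    fn __rust_enable_lse(); // from compiler-builtins::aarch64_linux
}

const AT_HWCAP: c_ulong = 16;
const HWCAP_ATOMICS: c_ulong = 1 << 8; // FEAT_LSE bit on aarch64

/// Enable the LSE fast path if (and only if) the CPU supports it.
fn enable_lse_if_supported() {
    // SAFETY: getauxval has no preconditions; AT_HWCAP is always valid.
    if unsafe { getauxval(AT_HWCAP) } & HWCAP_ATOMICS != 0 {
        // SAFETY: __rust_enable_lse requires the caller to have verified
        // LSE support, which the HWCAP bit just established.
        unsafe { __rust_enable_lse() };
    }
}
```

This mirrors what `configure_builtins.rs` does with `is_aarch64_feature_detected!("lse")`. It also suggests why the hook is gated on `not(feature = "compiler-builtins-c")`: with the C shims enabled, dispatch presumably goes through compiler-rt's own `__aarch64_have_lse_atomics` constructor, mentioned in the doc comment this patch removes.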