diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index d13dd6b0f..3afadbfe8 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -5,7 +5,7 @@ on:
 
 concurrency:
   # Make sure that new pushes cancel running jobs
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
   cancel-in-progress: true
 
 env:
@@ -13,7 +13,7 @@ env:
   RUSTDOCFLAGS: -Dwarnings
   RUSTFLAGS: -Dwarnings
   RUST_BACKTRACE: full
-  BENCHMARK_RUSTC: nightly-2025-01-16 # Pin the toolchain for reproducable results
+  BENCHMARK_RUSTC: nightly-2025-05-28 # Pin the toolchain for reproducible results
 
 jobs:
   # Determine which tests should be run based on changed files.
@@ -34,7 +34,9 @@ jobs:
       - name: Fetch pull request ref
         run: git fetch origin "$GITHUB_REF:$GITHUB_REF"
         if: github.event_name == 'pull_request'
-      - run: python3 ci/ci-util.py generate-matrix >> "$GITHUB_OUTPUT"
+      - run: |  # Echo ci-util's output to the log and append it to this step's outputs
+          set -eo pipefail # Needed to actually fail the job if ci-util fails
+          python3 ci/ci-util.py generate-matrix | tee -a "$GITHUB_OUTPUT"
         id: script
 
   test:
@@ -49,9 +51,7 @@ jobs:
         - target: aarch64-unknown-linux-gnu
           os: ubuntu-24.04-arm
         - target: aarch64-pc-windows-msvc
-          os: windows-2025
-          test_verbatim: 1
-          build_only: 1
+          os: windows-11-arm
         - target: arm-unknown-linux-gnueabi
           os: ubuntu-24.04
         - target: arm-unknown-linux-gnueabihf
@@ -70,8 +70,12 @@ jobs:
           os: ubuntu-24.04
         - target: powerpc64le-unknown-linux-gnu
           os: ubuntu-24.04
+        - target: powerpc64le-unknown-linux-gnu
+          os: ubuntu-24.04-ppc64le
         - target: riscv64gc-unknown-linux-gnu
           os: ubuntu-24.04
+        - target: s390x-unknown-linux-gnu
+          os: ubuntu-24.04-s390x
         - target: thumbv6m-none-eabi
           os: ubuntu-24.04
         - target: thumbv7em-none-eabi
@@ -88,10 +92,8 @@ jobs:
           os: macos-13
         - target: i686-pc-windows-msvc
           os: windows-2025
-          test_verbatim: 1
         - target: x86_64-pc-windows-msvc
           os: windows-2025
-          test_verbatim: 1
         - target: i686-pc-windows-gnu
           os: windows-2025
           channel: nightly-i686-gnu
@@ -102,14 +104,24 @@ jobs:
     needs: [calculate_vars]
     env:
       BUILD_ONLY: ${{ matrix.build_only }}
-      TEST_VERBATIM: ${{ matrix.test_verbatim }}
       MAY_SKIP_LIBM_CI: ${{ needs.calculate_vars.outputs.may_skip_libm_ci }}
     steps:
+    - name: Print $HOME
+      shell: bash
+      run: |
+        set -x
+        echo "${HOME:-not found}"
+        pwd
+        printenv
     - name: Print runner information
       run: uname -a
+
+    # Native ppc and s390x runners don't have rustup by default
+    - name: Install rustup
+      if: matrix.os == 'ubuntu-24.04-ppc64le' || matrix.os == 'ubuntu-24.04-s390x'
+      run: sudo apt-get update && sudo apt-get install -y rustup
+
     - uses: actions/checkout@v4
-      with:
-        submodules: true
     - name: Install Rust (rustup)
       shell: bash
       run: |
@@ -119,8 +131,12 @@ jobs:
         rustup update "$channel" --no-self-update
         rustup default "$channel"
         rustup target add "${{ matrix.target }}"
-        rustup component add llvm-tools-preview
+
+    # Our scripts use nextest if possible. This is skipped on the native ppc
+    # and s390x runners since install-action doesn't support them.
     - uses: taiki-e/install-action@nextest
+      if: "!(matrix.os == 'ubuntu-24.04-ppc64le' || matrix.os == 'ubuntu-24.04-s390x')"
+
     - uses: Swatinem/rust-cache@v2
       with:
         key: ${{ matrix.target }}
@@ -148,6 +164,10 @@ jobs:
     - run: echo "RUST_COMPILER_RT_ROOT=$(realpath ./compiler-rt)" >> "$GITHUB_ENV"
       shell: bash
 
+    - name: Download musl source
+      run: ./ci/update-musl.sh
+      shell: bash
+
     - name: Verify API list
       if: matrix.os == 'ubuntu-24.04'
       run: python3 etc/update-api-list.py --check
@@ -183,8 +203,6 @@ jobs:
     timeout-minutes: 10
     steps:
     - uses: actions/checkout@v4
-      with:
-        submodules: true
     # Unlike rustfmt, stable clippy does not work on code with nightly features.
     - name: Install nightly `clippy`
       run: |
@@ -192,16 +210,41 @@ jobs:
         rustup default nightly
         rustup component add clippy
     - uses: Swatinem/rust-cache@v2
+    - name: Download musl source
+      run: ./ci/update-musl.sh
     - run: cargo clippy --workspace --all-targets
 
+  build-custom:  # Builds compiler_builtins and libm for a custom JSON target spec
+    name: Build custom target
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install Rust  # nightly for `-Zbuild-std`; `rust-src` supplies the `core` sources it builds
+      run: |
+        rustup update nightly --no-self-update
+        rustup default nightly
+        rustup component add rust-src
+    - uses: Swatinem/rust-cache@v2
+    - run: |
+        # Ensure we can build with custom target.json files (these can interact
+        # poorly with build scripts)
+        cargo build -p compiler_builtins -p libm \
+          --target etc/thumbv7em-none-eabi-renamed.json \
+          -Zbuild-std=core
+
   benchmarks:
     name: Benchmarks
-    runs-on: ubuntu-24.04
     timeout-minutes: 20
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        - target: x86_64-unknown-linux-gnu
+          os: ubuntu-24.04
+    runs-on: ${{ matrix.os }}
     steps:
     - uses: actions/checkout@master
-      with:
-        submodules: true
     - uses: taiki-e/install-action@cargo-binstall
 
     - name: Set up dependencies
@@ -216,19 +259,23 @@ jobs:
         cargo binstall -y iai-callgrind-runner --version "$iai_version"
         sudo apt-get install valgrind
     - uses: Swatinem/rust-cache@v2
+      with:
+        key: ${{ matrix.target }}
+    - name: Download musl source
+      run: ./ci/update-musl.sh
 
     - name: Run icount benchmarks
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         PR_NUMBER: ${{ github.event.pull_request.number }}
-      run: ./ci/bench-icount.sh
+      run: ./ci/bench-icount.sh ${{ matrix.target }}
 
     - name: Upload the benchmark baseline
       uses: actions/upload-artifact@v4
       with:
         name: ${{ env.BASELINE_NAME }}
         path: ${{ env.BASELINE_NAME }}.tar.xz
-    
+
     - name: Run wall time benchmarks
       run: |
         # Always use the same seed for benchmarks. Ideally we should switch to a
@@ -249,8 +296,6 @@ jobs:
     timeout-minutes: 10
     steps:
     - uses: actions/checkout@v4
-      with:
-        submodules: true
     - name: Install Rust (rustup)
       run: rustup update nightly --no-self-update && rustup default nightly
       shell: bash
@@ -285,10 +330,8 @@ jobs:
     timeout-minutes: 10
     steps:
     - uses: actions/checkout@v4
-      with:
-        submodules: true
-    - name: Install stable `rustfmt`
-      run: rustup set profile minimal && rustup default stable && rustup component add rustfmt
+    - name: Install nightly `rustfmt`
+      run: rustup set profile minimal && rustup default nightly && rustup component add rustfmt
     - run: cargo fmt -- --check
 
   extensive:
@@ -310,13 +353,13 @@ jobs:
       TO_TEST: ${{ matrix.to_test }}
     steps:
       - uses: actions/checkout@v4
-        with:
-          submodules: true
       - name: Install Rust
         run: |
           rustup update nightly --no-self-update
           rustup default nightly
       - uses: Swatinem/rust-cache@v2
+      - name: Download musl source  # musl is no longer a git submodule, so fetch it explicitly
+        run: ./ci/update-musl.sh
       - name: Run extensive tests
         run: ./ci/run-extensive.sh
       - name: Print test logs if available
@@ -326,6 +369,7 @@ jobs:
   success:
     needs:
       - benchmarks
+      - build-custom
       - clippy
       - extensive
       - miri
diff --git a/.github/workflows/rustc-pull.yml b/.github/workflows/rustc-pull.yml
new file mode 100644
index 000000000..ad7693e17
--- /dev/null
+++ b/.github/workflows/rustc-pull.yml
@@ -0,0 +1,24 @@
+# Perform a subtree sync (pull) using the josh-sync tool, on the cron schedule below or on demand.
+name: rustc-pull
+
+on:
+  workflow_dispatch:  # Allows the sync to be started manually ("on demand")
+  schedule:
+    # Run at 04:00 UTC every Monday and Thursday
+    - cron: '0 4 * * 1,4'
+
+jobs:
+  pull:
+    if: github.repository == 'rust-lang/compiler-builtins'  # Do not run in other repositories (e.g. forks)
+    uses: rust-lang/josh-sync/.github/workflows/rustc-pull.yml@main  # Reusable workflow, referenced at `main`
+    with:
+      github-app-id: ${{ vars.APP_CLIENT_ID }}
+      # https://rust-lang.zulipchat.com/#narrow/channel/219381-t-libs/topic/compiler-builtins.20subtree.20sync.20automation/with/528482375
+      zulip-stream-id: 219381
+      zulip-topic: 'compiler-builtins subtree sync automation'
+      zulip-bot-email: "compiler-builtins-ci-bot@rust-lang.zulipchat.com"
+      pr-base-branch: master
+      branch-name: rustc-pull
+    secrets:  # Passed through to the called workflow
+      zulip-api-token: ${{ secrets.ZULIP_API_TOKEN }}
+      github-app-secret: ${{ secrets.APP_PRIVATE_KEY }}
diff --git a/.gitignore b/.gitignore
index 5287a6c72..f12b871c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,6 @@ iai-home
 *.bk
 *.rs.bk
 .#*
+
+# Manually managed
+crates/musl-math-sys/musl
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 792ed9ab2..000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,4 +0,0 @@
-[submodule "crates/musl-math-sys/musl"]
-	path = crates/musl-math-sys/musl
-	url = https://git.musl-libc.org/git/musl
-	shallow = true
diff --git a/.release-plz.toml b/.release-plz.toml
deleted file mode 100644
index 8023ade9b..000000000
--- a/.release-plz.toml
+++ /dev/null
@@ -1,13 +0,0 @@
-[workspace]
-# As part of the release process, we delete `libm/Cargo.toml`. Since
-# this is only run in CI, we shouldn't need to worry about it.
-allow_dirty = true
-publish_allow_dirty = true
-
-[[package]]
-name = "compiler_builtins"
-semver_check = false
-changelog_include = ["libm"] # libm is included as part of builtins
-
-[[package]]
-name = "libm"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9f67cfc31..9ae4f893c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -165,3 +165,12 @@ cargo bench --no-default-features \
 
 [`iai-callgrind-runner`]: https://crates.io/crates/iai-callgrind-runner
 [Valgrind]: https://valgrind.org/
+
+## Subtree synchronization
+
+`compiler-builtins` is included as a [Josh subtree] in the main compiler
+repository (`rust-lang/rust`). You can find a guide on how to create synchronization
+(pull and push) PRs at the [`rustc-dev-guide` page].
+
+[Josh subtree]: https://rustc-dev-guide.rust-lang.org/external-repos.html#josh-subtrees
+[`rustc-dev-guide` page]: https://rustc-dev-guide.rust-lang.org/external-repos.html#synchronizing-a-josh-subtree
diff --git a/Cargo.toml b/Cargo.toml
index b39ec8a25..956d738f3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,19 +1,20 @@
 [workspace]
 resolver = "2"
 members = [
+    "builtins-shim",
     "builtins-test",
-    "compiler-builtins",
     "crates/libm-macros",
     "crates/musl-math-sys",
     "crates/panic-handler",
+    "crates/symbol-check",
     "crates/util",
     "libm",
     "libm-test",
 ]
 
 default-members = [
+    "builtins-shim",
     "builtins-test",
-    "compiler-builtins",
     "crates/libm-macros",
     "libm",
     "libm-test",
@@ -24,6 +25,10 @@ exclude = [
     # and `mangled-names` disabled, which is the opposite of what is needed for
     # other tests, so it makes sense to keep it out of the workspace.
     "builtins-test-intrinsics",
+    # We test via the `builtins-shim` crate, so exclude the real
+    # `compiler-builtins` manifest, which depends on `core` by path. See
+    # `builtins-shim/Cargo.toml` for more details.
+    "compiler-builtins",
 ]
 
 [profile.release]
diff --git a/README.md b/README.md
index 3130ff7b7..177bce624 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ This repository contains two main crates:
 * `compiler-builtins`: symbols that the compiler expects to be available at
   link time
 * `libm`: a Rust implementation of C math libraries, used to provide
-  implementations in `ocre`.
+  implementations in `core`.
 
 More details are at [compiler-builtins/README.md](compiler-builtins/README.md)
 and [libm/README.md](libm/README.md).
diff --git a/builtins-shim/Cargo.toml b/builtins-shim/Cargo.toml
new file mode 100644
index 000000000..707ebdbc7
--- /dev/null
+++ b/builtins-shim/Cargo.toml
@@ -0,0 +1,64 @@
+# NOTE: Must be kept in sync with `../compiler-builtins/Cargo.toml`.
+#
+# The manifest at `../compiler-builtins` is what actually gets used in the
+# rust-lang/rust tree; however, we can't build it out of tree because it
+# depends on `core` by path, and even optional Cargo dependencies need to be
+# available at build time. So, we work around this by having this "shim"
+# manifest that is identical except for the `core` dependency and forwards
+# to the same sources, which acts as the `compiler-builtins` Cargo entrypoint
+# for out-of-tree testing.
+
+[package]
+name = "compiler_builtins"
+version = "0.1.160"
+authors = ["Jorge Aparicio <japaricious@gmail.com>"]
+description = "Compiler intrinsics used by the Rust compiler."
+repository = "https://github.com/rust-lang/compiler-builtins"
+license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
+edition = "2024"
+publish = false
+links = "compiler-rt"
+
+build = "../compiler-builtins/build.rs"
+
+[lib]
+path = "../compiler-builtins/src/lib.rs"
+bench = false
+doctest = false
+test = false
+
+[build-dependencies]
+cc = { optional = true, version = "1.2" }
+
+[features]
+default = ["compiler-builtins"]
+
+# Enable compilation of C code in compiler-rt, filling in some more optimized
+# implementations and also filling in unimplemented intrinsics
+c = ["dep:cc"]
+
+# For implementations where there is both a generic version and a platform-
+# specific version, use the generic version. This is meant to enable testing
+# the generic versions on all platforms.
+no-asm = []
+
+# Workaround for codegen backends which haven't yet implemented `f16` and
+# `f128` support. Disables any intrinsics which use those types.
+no-f16-f128 = []
+
+# Flag this library as the unstable compiler-builtins lib
+compiler-builtins = []
+
+# Generate memory-related intrinsics like memcpy
+mem = []
+
+# Mangle all names so this can be linked in with other versions or other
+# compiler-rt implementations. Also used for testing
+mangled-names = []
+
+# Only used in the compiler's build system
+rustc-dep-of-std = ["compiler-builtins"]
+
+# This makes certain traits and function specializations public that
+# are not normally public but are required by the `builtins-test`
+unstable-public-internals = []
diff --git a/builtins-test-intrinsics/Cargo.toml b/builtins-test-intrinsics/Cargo.toml
index 6e10628a4..e73a1f7b1 100644
--- a/builtins-test-intrinsics/Cargo.toml
+++ b/builtins-test-intrinsics/Cargo.toml
@@ -1,12 +1,12 @@
 [package]
 name = "builtins-test-intrinsics"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 publish = false
 license = "MIT OR Apache-2.0"
 
 [dependencies]
-compiler_builtins = { path = "../compiler-builtins", features = ["compiler-builtins"]}
+compiler_builtins = { path = "../builtins-shim", features = ["compiler-builtins"] }
 panic-handler = { path = "../crates/panic-handler" }
 
 [features]
diff --git a/builtins-test-intrinsics/build.rs b/builtins-test-intrinsics/build.rs
index 89b126ff2..b82581262 100644
--- a/builtins-test-intrinsics/build.rs
+++ b/builtins-test-intrinsics/build.rs
@@ -6,6 +6,5 @@ fn main() {
     println!("cargo::rerun-if-changed=../configure.rs");
 
     let target = builtins_configure::Target::from_env();
-    builtins_configure::configure_f16_f128(&target);
     builtins_configure::configure_aliases(&target);
 }
diff --git a/builtins-test-intrinsics/src/main.rs b/builtins-test-intrinsics/src/main.rs
index 1fa7b0091..b9d19ea77 100644
--- a/builtins-test-intrinsics/src/main.rs
+++ b/builtins-test-intrinsics/src/main.rs
@@ -13,11 +13,14 @@
 #![no_std]
 #![no_main]
 
+// Ensure this `compiler_builtins` gets used, rather than the version injected from the sysroot.
+extern crate compiler_builtins;
 extern crate panic_handler;
 
+// SAFETY: no definitions, only used for linking
 #[cfg(all(not(thumb), not(windows), not(target_arch = "wasm32")))]
 #[link(name = "c")]
-extern "C" {}
+unsafe extern "C" {}
 
 // Every function in this module maps will be lowered to an intrinsic by LLVM, if the platform
 // doesn't have native support for the operation used in the function. ARM has a naming convention
@@ -37,11 +40,7 @@ mod intrinsics {
         x as f64
     }
 
-    #[cfg(all(
-        f16_enabled,
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(all(f16_enabled, f128_enabled))]
     pub fn extendhftf(x: f16) -> f128 {
         x as f128
     }
@@ -198,11 +197,7 @@ mod intrinsics {
 
     /* f128 operations */
 
-    #[cfg(all(
-        f16_enabled,
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(all(f16_enabled, f128_enabled))]
     pub fn trunctfhf(x: f128) -> f16 {
         x as f16
     }
@@ -217,50 +212,32 @@ mod intrinsics {
         x as f64
     }
 
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     pub fn fixtfsi(x: f128) -> i32 {
         x as i32
     }
 
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     pub fn fixtfdi(x: f128) -> i64 {
         x as i64
     }
 
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     pub fn fixtfti(x: f128) -> i128 {
         x as i128
     }
 
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     pub fn fixunstfsi(x: f128) -> u32 {
         x as u32
     }
 
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     pub fn fixunstfdi(x: f128) -> u64 {
         x as u64
     }
 
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     pub fn fixunstfti(x: f128) -> u128 {
         x as u128
     }
@@ -537,47 +514,25 @@ fn run() {
     bb(extendhfdf(bb(2.)));
     #[cfg(f16_enabled)]
     bb(extendhfsf(bb(2.)));
-    #[cfg(all(
-        f16_enabled,
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(all(f16_enabled, f128_enabled))]
     bb(extendhftf(bb(2.)));
     #[cfg(f128_enabled)]
     bb(extendsftf(bb(2.)));
     bb(fixdfti(bb(2.)));
     bb(fixsfti(bb(2.)));
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     bb(fixtfdi(bb(2.)));
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     bb(fixtfsi(bb(2.)));
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     bb(fixtfti(bb(2.)));
     bb(fixunsdfti(bb(2.)));
     bb(fixunssfti(bb(2.)));
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     bb(fixunstfdi(bb(2.)));
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     bb(fixunstfsi(bb(2.)));
-    #[cfg(all(
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(f128_enabled)]
     bb(fixunstfti(bb(2.)));
     #[cfg(f128_enabled)]
     bb(floatditf(bb(2)));
@@ -613,11 +568,7 @@ fn run() {
     bb(truncsfhf(bb(2.)));
     #[cfg(f128_enabled)]
     bb(trunctfdf(bb(2.)));
-    #[cfg(all(
-        f16_enabled,
-        f128_enabled,
-        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
-    ))]
+    #[cfg(all(f16_enabled, f128_enabled))]
     bb(trunctfhf(bb(2.)));
     #[cfg(f128_enabled)]
     bb(trunctfsf(bb(2.)));
@@ -651,22 +602,23 @@ fn something_with_a_dtor(f: &dyn Fn()) {
 
 #[unsafe(no_mangle)]
 #[cfg(not(thumb))]
-fn main(_argc: core::ffi::c_int, _argv: *const *const u8) -> core::ffi::c_int {
+extern "C" fn main(_argc: core::ffi::c_int, _argv: *const *const u8) -> core::ffi::c_int {
     run();
     0
 }
 
 #[unsafe(no_mangle)]
 #[cfg(thumb)]
-pub fn _start() -> ! {
+extern "C" fn _start() -> ! {
     run();
     loop {}
 }
 
+// SAFETY: no definitions, only used for linking
 #[cfg(windows)]
 #[link(name = "kernel32")]
 #[link(name = "msvcrt")]
-extern "C" {}
+unsafe extern "C" {}
 
 // ARM targets need these symbols
 #[unsafe(no_mangle)]
diff --git a/builtins-test/Cargo.toml b/builtins-test/Cargo.toml
index 10978c0bb..00a9d8579 100644
--- a/builtins-test/Cargo.toml
+++ b/builtins-test/Cargo.toml
@@ -10,19 +10,19 @@ license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
 # For fuzzing tests we want a deterministic seedable RNG. We also eliminate potential
 # problems with system RNGs on the variety of platforms this crate is tested on.
 # `xoshiro128**` is used for its quality, size, and speed at generating `u32` shift amounts.
-rand_xoshiro = "0.6"
+rand_xoshiro = "0.7"
 # To compare float builtins against
-rustc_apfloat = "0.2.1"
+rustc_apfloat = "0.2.3"
 # Really a dev dependency, but dev dependencies can't be optional
-iai-callgrind = { version = "0.14.0", optional = true }
+iai-callgrind = { version = "0.15.2", optional = true }
 
 [dependencies.compiler_builtins]
-path = "../compiler-builtins"
+path = "../builtins-shim"
 default-features = false
 features = ["unstable-public-internals"]
 
 [dev-dependencies]
-criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] }
+criterion = { version = "0.6.0", default-features = false, features = ["cargo_bench_support"] }
 paste = "1.0.15"
 
 [target.'cfg(all(target_arch = "arm", not(any(target_env = "gnu", target_env = "musl")), target_os = "linux"))'.dev-dependencies]
diff --git a/builtins-test/benches/float_cmp.rs b/builtins-test/benches/float_cmp.rs
index 42d665239..da29b5d31 100644
--- a/builtins-test/benches/float_cmp.rs
+++ b/builtins-test/benches/float_cmp.rs
@@ -1,12 +1,23 @@
 #![cfg_attr(f128_enabled, feature(f128))]
 
 use builtins_test::float_bench;
-use compiler_builtins::float::cmp;
+use compiler_builtins::float::cmp::{self, CmpResult};
 use criterion::{Criterion, criterion_main};
 
 /// `gt` symbols are allowed to return differing results, they just get compared
 /// to 0.
-fn gt_res_eq(a: i32, b: i32) -> bool {
+fn gt_res_eq(mut a: CmpResult, mut b: CmpResult) -> bool {
+    // FIXME: Our CmpResult used to be `i32`, but GCC/LLVM expect `isize`. On 64-bit platforms,
+    // this means the top half of the word may be garbage if built with an old version of
+    // `compiler-builtins`, so add a hack around this.
+    //
+    // This can be removed once a version of `compiler-builtins` with the return type fix makes
+    // it upstream.
+    if size_of::<CmpResult>() == 8 {
+        a = a as i32 as CmpResult;
+        b = b as i32 as CmpResult;
+    }
+
     let a_lt_0 = a <= 0;
     let b_lt_0 = b <= 0;
     (a_lt_0 && b_lt_0) || (!a_lt_0 && !b_lt_0)
@@ -14,14 +25,14 @@ fn gt_res_eq(a: i32, b: i32) -> bool {
 
 float_bench! {
     name: cmp_f32_gt,
-    sig: (a: f32, b: f32) -> i32,
+    sig: (a: f32, b: f32) -> CmpResult,
     crate_fn: cmp::__gtsf2,
     sys_fn: __gtsf2,
     sys_available: all(),
     output_eq: gt_res_eq,
     asm: [
         #[cfg(target_arch = "x86_64")] {
-            let ret: i32;
+            let ret: CmpResult;
             asm!(
                 "xor     {ret:e}, {ret:e}",
                 "ucomiss {a}, {b}",
@@ -36,7 +47,7 @@ float_bench! {
         };
 
         #[cfg(target_arch = "aarch64")] {
-            let ret: i32;
+            let ret: CmpResult;
             asm!(
                 "fcmp    {a:s}, {b:s}",
                 "cset    {ret:w}, gt",
@@ -53,13 +64,13 @@ float_bench! {
 
 float_bench! {
     name: cmp_f32_unord,
-    sig: (a: f32, b: f32) -> i32,
+    sig: (a: f32, b: f32) -> CmpResult,
     crate_fn: cmp::__unordsf2,
     sys_fn: __unordsf2,
     sys_available: all(),
     asm: [
         #[cfg(target_arch = "x86_64")] {
-            let ret: i32;
+            let ret: CmpResult;
             asm!(
                 "xor     {ret:e}, {ret:e}",
                 "ucomiss {a}, {b}",
@@ -74,7 +85,7 @@ float_bench! {
         };
 
         #[cfg(target_arch = "aarch64")] {
-            let ret: i32;
+            let ret: CmpResult;
             asm!(
                 "fcmp    {a:s}, {b:s}",
                 "cset    {ret:w}, vs",
@@ -91,14 +102,14 @@ float_bench! {
 
 float_bench! {
     name: cmp_f64_gt,
-    sig: (a: f64, b: f64) -> i32,
+    sig: (a: f64, b: f64) -> CmpResult,
     crate_fn: cmp::__gtdf2,
     sys_fn: __gtdf2,
     sys_available: all(),
     output_eq: gt_res_eq,
     asm: [
         #[cfg(target_arch = "x86_64")] {
-            let ret: i32;
+            let ret: CmpResult;
             asm!(
                 "xor     {ret:e}, {ret:e}",
                 "ucomisd {a}, {b}",
@@ -113,7 +124,7 @@ float_bench! {
         };
 
         #[cfg(target_arch = "aarch64")] {
-            let ret: i32;
+            let ret: CmpResult;
             asm!(
                 "fcmp    {a:d}, {b:d}",
                 "cset {ret:w}, gt",
@@ -130,13 +141,13 @@ float_bench! {
 
 float_bench! {
     name: cmp_f64_unord,
-    sig: (a: f64, b: f64) -> i32,
+    sig: (a: f64, b: f64) -> CmpResult,
     crate_fn: cmp::__unorddf2,
     sys_fn: __unorddf2,
     sys_available: all(),
     asm: [
         #[cfg(target_arch = "x86_64")] {
-            let ret: i32;
+            let ret: CmpResult;
             asm!(
                 "xor     {ret:e}, {ret:e}",
                 "ucomisd {a}, {b}",
@@ -151,7 +162,7 @@ float_bench! {
         };
 
         #[cfg(target_arch = "aarch64")] {
-            let ret: i32;
+            let ret: CmpResult;
             asm!(
                 "fcmp    {a:d}, {b:d}",
                 "cset    {ret:w}, vs",
@@ -166,9 +177,10 @@ float_bench! {
     ],
 }
 
+#[cfg(f128_enabled)]
 float_bench! {
     name: cmp_f128_gt,
-    sig: (a: f128, b: f128) -> i32,
+    sig: (a: f128, b: f128) -> CmpResult,
     crate_fn: cmp::__gttf2,
     crate_fn_ppc: cmp::__gtkf2,
     sys_fn: __gttf2,
@@ -178,9 +190,10 @@ float_bench! {
     asm: []
 }
 
+#[cfg(f128_enabled)]
 float_bench! {
     name: cmp_f128_unord,
-    sig: (a: f128, b: f128) -> i32,
+    sig: (a: f128, b: f128) -> CmpResult,
     crate_fn: cmp::__unordtf2,
     crate_fn_ppc: cmp::__unordkf2,
     sys_fn: __unordtf2,
diff --git a/builtins-test/benches/float_conv.rs b/builtins-test/benches/float_conv.rs
index d4a7346d1..40c13d270 100644
--- a/builtins-test/benches/float_conv.rs
+++ b/builtins-test/benches/float_conv.rs
@@ -1,4 +1,3 @@
-#![allow(improper_ctypes)]
 #![cfg_attr(f128_enabled, feature(f128))]
 
 use builtins_test::float_bench;
@@ -365,7 +364,6 @@ float_bench! {
 
 /* float -> unsigned int */
 
-#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
 float_bench! {
     name: conv_f32_u32,
     sig: (a: f32) -> u32,
@@ -387,7 +385,6 @@ float_bench! {
     ],
 }
 
-#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
 float_bench! {
     name: conv_f32_u64,
     sig: (a: f32) -> u64,
@@ -409,7 +406,6 @@ float_bench! {
     ],
 }
 
-#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
 float_bench! {
     name: conv_f32_u128,
     sig: (a: f32) -> u128,
@@ -505,7 +501,6 @@ float_bench! {
 
 /* float -> signed int */
 
-#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
 float_bench! {
     name: conv_f32_i32,
     sig: (a: f32) -> i32,
@@ -527,7 +522,6 @@ float_bench! {
     ],
 }
 
-#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
 float_bench! {
     name: conv_f32_i64,
     sig: (a: f32) -> i64,
@@ -549,7 +543,6 @@ float_bench! {
     ],
 }
 
-#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
 float_bench! {
     name: conv_f32_i128,
     sig: (a: f32) -> i128,
@@ -666,9 +659,6 @@ pub fn float_conv() {
     conv_f64_i128(&mut criterion);
 
     #[cfg(f128_enabled)]
-    // FIXME: ppc64le has a sporadic overflow panic in the crate functions
-    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
-    #[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
     {
         conv_u32_f128(&mut criterion);
         conv_u64_f128(&mut criterion);
diff --git a/builtins-test/benches/float_extend.rs b/builtins-test/benches/float_extend.rs
index fc44e80c9..939dc60f9 100644
--- a/builtins-test/benches/float_extend.rs
+++ b/builtins-test/benches/float_extend.rs
@@ -110,9 +110,7 @@ float_bench! {
 pub fn float_extend() {
     let mut criterion = Criterion::default().configure_from_args();
 
-    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
     #[cfg(f16_enabled)]
-    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
     {
         extend_f16_f32(&mut criterion);
         extend_f16_f64(&mut criterion);
diff --git a/builtins-test/benches/float_trunc.rs b/builtins-test/benches/float_trunc.rs
index 43310c7cf..9373f945b 100644
--- a/builtins-test/benches/float_trunc.rs
+++ b/builtins-test/benches/float_trunc.rs
@@ -121,9 +121,7 @@ float_bench! {
 pub fn float_trunc() {
     let mut criterion = Criterion::default().configure_from_args();
 
-    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
     #[cfg(f16_enabled)]
-    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
     {
         trunc_f32_f16(&mut criterion);
         trunc_f64_f16(&mut criterion);
@@ -133,11 +131,8 @@ pub fn float_trunc() {
 
     #[cfg(f128_enabled)]
     {
-        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
         #[cfg(f16_enabled)]
-        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
         trunc_f128_f16(&mut criterion);
-
         trunc_f128_f32(&mut criterion);
         trunc_f128_f64(&mut criterion);
     }
diff --git a/builtins-test/build.rs b/builtins-test/build.rs
index e8f4eb4dd..5b2dcd12e 100644
--- a/builtins-test/build.rs
+++ b/builtins-test/build.rs
@@ -116,5 +116,4 @@ fn main() {
     }
 
     builtins_configure::configure_aliases(&target);
-    builtins_configure::configure_f16_f128(&target);
 }
diff --git a/builtins-test/src/bench.rs b/builtins-test/src/bench.rs
index 2348f6bc9..4bdcf482c 100644
--- a/builtins-test/src/bench.rs
+++ b/builtins-test/src/bench.rs
@@ -17,28 +17,14 @@ pub fn skip_sys_checks(test_name: &str) -> bool {
         "extend_f16_f32",
         "trunc_f32_f16",
         "trunc_f64_f16",
-        // FIXME(#616): re-enable once fix is in nightly
-        // <https://github.com/rust-lang/compiler-builtins/issues/616>
-        "mul_f32",
-        "mul_f64",
     ];
 
-    // FIXME(f16_f128): error on LE ppc64. There are more tests that are cfg-ed out completely
-    // in their benchmark modules due to runtime panics.
-    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
-    const PPC64LE_SKIPPED: &[&str] = &["extend_f32_f128"];
-
     // FIXME(f16_f128): system symbols have incorrect results
     // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
     const X86_NO_SSE_SKIPPED: &[&str] = &[
         "add_f128", "sub_f128", "mul_f128", "div_f128", "powi_f32", "powi_f64",
     ];
 
-    // FIXME(f16_f128): Wide multiply carry bug in `compiler-rt`, re-enable when nightly no longer
-    // uses `compiler-rt` version.
-    // <https://github.com/llvm/llvm-project/issues/91840>
-    const AARCH64_SKIPPED: &[&str] = &["mul_f128", "div_f128"];
-
     // FIXME(llvm): system symbols have incorrect results on Windows
     // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2121359807>
     const WINDOWS_SKIPPED: &[&str] = &[
@@ -57,19 +43,7 @@ pub fn skip_sys_checks(test_name: &str) -> bool {
         return true;
     }
 
-    if cfg!(all(target_arch = "powerpc64", target_endian = "little"))
-        && PPC64LE_SKIPPED.contains(&test_name)
-    {
-        return true;
-    }
-
-    if cfg!(all(target_arch = "x86", not(target_feature = "sse")))
-        && X86_NO_SSE_SKIPPED.contains(&test_name)
-    {
-        return true;
-    }
-
-    if cfg!(target_arch = "aarch64") && AARCH64_SKIPPED.contains(&test_name) {
+    if cfg!(x86_no_sse) && X86_NO_SSE_SKIPPED.contains(&test_name) {
         return true;
     }
 
@@ -358,8 +332,8 @@ impl_testio!(float f16);
 impl_testio!(float f32, f64);
 #[cfg(f128_enabled)]
 impl_testio!(float f128);
-impl_testio!(int i16, i32, i64, i128);
-impl_testio!(int u16, u32, u64, u128);
+impl_testio!(int i8, i16, i32, i64, i128, isize);
+impl_testio!(int u8, u16, u32, u64, u128, usize);
 impl_testio!((float, int)(f32, i32));
 impl_testio!((float, int)(f64, i32));
 #[cfg(f128_enabled)]
diff --git a/builtins-test/src/lib.rs b/builtins-test/src/lib.rs
index c596ac213..f1673133b 100644
--- a/builtins-test/src/lib.rs
+++ b/builtins-test/src/lib.rs
@@ -40,6 +40,75 @@ pub const N: u32 = if cfg!(target_arch = "x86_64") && !cfg!(debug_assertions) {
     10_000
 };
 
+/// Additional constants that determine how the integer gets fuzzed.
+trait FuzzInt: MinInt {
+    /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing
+    /// in `builtins-test`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,
+    /// 111,112,119,120,125,126,127].
+    const FUZZ_LENGTHS: [u8; 20] = make_fuzz_lengths(Self::BITS);
+
+    /// Number of leading `FUZZ_LENGTHS` entries used (the rest stay 0). Max is 20 for u128.
+    const FUZZ_NUM: usize = {
+        let log2 = Self::BITS.ilog2() as usize;
+        if log2 == 3 {
+            // case for u8
+            6
+        } else {
+            // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate
+            // boundaries.
+            8 + (4 * (log2 - 4))
+        }
+    };
+}
+
+impl<I> FuzzInt for I where I: MinInt {}
+
+const fn make_fuzz_lengths(bits: u32) -> [u8; 20] {
+    let mut v = [0u8; 20];
+    v[0] = 0;
+    v[1] = 1;
+    v[2] = 2; // important for parity and the iX::MIN case when reversed
+    let mut i = 3;
+
+    // No need for any more until the byte boundary, because there should be no algorithms
+    // that are sensitive to anything not next to byte boundaries after 2. We also scale
+    // in powers of two, which is important to prevent u128 corner tests from getting too
+    // big.
+    let mut l = 8;
+    loop {
+        if l >= ((bits / 2) as u8) {
+            break;
+        }
+        // record both sides of the boundary at `l` bits: `l - 1` and `l`
+        v[i] = l - 1;
+        i += 1;
+        v[i] = l;
+        i += 1;
+        l *= 2;
+    }
+
+    if bits != 8 {
+        // add the lower side of the middle boundary, `bits / 2 - 1` (skipped when `bits == 8`)
+        v[i] = ((bits / 2) - 1) as u8;
+        i += 1;
+    }
+
+    // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS
+    // boundary because of algorithms that split the high part up. Mirror the entries
+    // written so far, walking backwards: each new one is `bits - 1 - v[mid - j]`.
+    let mid = i;
+    let mut j = 1;
+    loop {
+        v[i] = (bits as u8) - (v[mid - j]) - 1;
+        if j == mid {
+            break;
+        }
+        i += 1;
+        j += 1;
+    }
+    v
+}
+
 /// Random fuzzing step. When run several times, it results in excellent fuzzing entropy such as:
 /// 11110101010101011110111110011111
 /// 10110101010100001011101011001010
@@ -92,10 +161,9 @@ fn fuzz_step<I: Int>(rng: &mut Xoshiro128StarStar, x: &mut I) {
 macro_rules! edge_cases {
     ($I:ident, $case:ident, $inner:block) => {
         for i0 in 0..$I::FUZZ_NUM {
-            let mask_lo = (!$I::UnsignedInt::ZERO).wrapping_shr($I::FUZZ_LENGTHS[i0] as u32);
+            let mask_lo = (!$I::Unsigned::ZERO).wrapping_shr($I::FUZZ_LENGTHS[i0] as u32);
             for i1 in i0..I::FUZZ_NUM {
-                let mask_hi =
-                    (!$I::UnsignedInt::ZERO).wrapping_shl($I::FUZZ_LENGTHS[i1 - i0] as u32);
+                let mask_hi = (!$I::Unsigned::ZERO).wrapping_shl($I::FUZZ_LENGTHS[i1 - i0] as u32);
                 let $case = I::from_unsigned(mask_lo & mask_hi);
                 $inner
             }
@@ -107,7 +175,7 @@ macro_rules! edge_cases {
 /// edge cases, followed by a more random fuzzer that runs `n` times.
 pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F)
 where
-    <I as MinInt>::UnsignedInt: Int,
+    <I as MinInt>::Unsigned: Int,
 {
     // edge case tester. Calls `f` 210 times for u128.
     // zero gets skipped by the loop
@@ -128,7 +196,7 @@ where
 /// The same as `fuzz`, except `f` has two inputs.
 pub fn fuzz_2<I: Int, F: Fn(I, I)>(n: u32, f: F)
 where
-    <I as MinInt>::UnsignedInt: Int,
+    <I as MinInt>::Unsigned: Int,
 {
     // Check cases where the first and second inputs are zero. Both call `f` 210 times for `u128`.
     edge_cases!(I, case, {
diff --git a/builtins-test/tests/addsub.rs b/builtins-test/tests/addsub.rs
index 865b9e472..f3334bd0e 100644
--- a/builtins-test/tests/addsub.rs
+++ b/builtins-test/tests/addsub.rs
@@ -1,4 +1,5 @@
 #![allow(unused_macros)]
+#![cfg_attr(f16_enabled, feature(f16))]
 #![cfg_attr(f128_enabled, feature(f128))]
 
 use builtins_test::*;
@@ -111,32 +112,29 @@ macro_rules! float_sum {
     }
 }
 
-#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(x86_no_sse))]
 mod float_addsub {
     use super::*;
 
+    #[cfg(f16_enabled)]
+    float_sum! {
+        f16, __addhf3, __subhf3, Half, all();
+    }
+
     float_sum! {
         f32, __addsf3, __subsf3, Single, all();
         f64, __adddf3, __subdf3, Double, all();
     }
-}
-
-#[cfg(f128_enabled)]
-#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
-#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
-mod float_addsub_f128 {
-    use super::*;
 
+    #[cfg(f128_enabled)]
+    #[cfg(not(x86_no_sse))]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
     float_sum! {
         f128, __addtf3, __subtf3, Quad, not(feature = "no-sys-f128");
     }
-}
-
-#[cfg(f128_enabled)]
-#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
-mod float_addsub_f128_ppc {
-    use super::*;
 
+    #[cfg(f128_enabled)]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
     float_sum! {
         f128, __addkf3, __subkf3, Quad, not(feature = "no-sys-f128");
     }
diff --git a/builtins-test/tests/aeabi_memclr.rs b/builtins-test/tests/aeabi_memclr.rs
index bfd15a391..0761feaff 100644
--- a/builtins-test/tests/aeabi_memclr.rs
+++ b/builtins-test/tests/aeabi_memclr.rs
@@ -24,7 +24,8 @@ macro_rules! panic {
     };
 }
 
-extern "C" {
+// SAFETY: defined in compiler-builtins
+unsafe extern "aapcs" {
     fn __aeabi_memclr4(dest: *mut u8, n: usize);
     fn __aeabi_memset4(dest: *mut u8, n: usize, c: u32);
 }
diff --git a/builtins-test/tests/aeabi_memcpy.rs b/builtins-test/tests/aeabi_memcpy.rs
index c892c5aba..e76e712a2 100644
--- a/builtins-test/tests/aeabi_memcpy.rs
+++ b/builtins-test/tests/aeabi_memcpy.rs
@@ -22,7 +22,8 @@ macro_rules! panic {
     };
 }
 
-extern "C" {
+// SAFETY: defined in compiler-builtins
+unsafe extern "aapcs" {
     fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize);
     fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize);
 }
diff --git a/builtins-test/tests/aeabi_memset.rs b/builtins-test/tests/aeabi_memset.rs
index 34ab3acc7..8f9f80f96 100644
--- a/builtins-test/tests/aeabi_memset.rs
+++ b/builtins-test/tests/aeabi_memset.rs
@@ -24,7 +24,8 @@ macro_rules! panic {
     };
 }
 
-extern "C" {
+// SAFETY: defined in compiler-builtins
+unsafe extern "aapcs" {
     fn __aeabi_memset4(dest: *mut u8, n: usize, c: u32);
 }
 
diff --git a/builtins-test/tests/cmp.rs b/builtins-test/tests/cmp.rs
index a904dc5f7..4b01b6ca1 100644
--- a/builtins-test/tests/cmp.rs
+++ b/builtins-test/tests/cmp.rs
@@ -1,5 +1,6 @@
 #![allow(unused_macros)]
 #![allow(unreachable_code)]
+#![cfg_attr(f16_enabled, feature(f16))]
 #![cfg_attr(f128_enabled, feature(f128))]
 
 use builtins_test::*;
@@ -51,6 +52,26 @@ mod float_comparisons {
         };
     }
 
+    #[test]
+    #[cfg(f16_enabled)]
+    fn cmp_f16() {
+        use compiler_builtins::float::cmp::{
+            __eqhf2, __gehf2, __gthf2, __lehf2, __lthf2, __nehf2, __unordhf2,
+        };
+
+        fuzz_float_2(N, |x: f16, y: f16| {
+            assert_eq!(__unordhf2(x, y) != 0, x.is_nan() || y.is_nan());
+            cmp!(f16, x, y, Half, all(),
+                1, __lthf2;
+                1, __lehf2;
+                1, __eqhf2;
+                -1, __gehf2;
+                -1, __gthf2;
+                1, __nehf2;
+            );
+        });
+    }
+
     #[test]
     fn cmp_f32() {
         use compiler_builtins::float::cmp::{
diff --git a/builtins-test/tests/conv.rs b/builtins-test/tests/conv.rs
index 491915d9b..9b04295d2 100644
--- a/builtins-test/tests/conv.rs
+++ b/builtins-test/tests/conv.rs
@@ -59,32 +59,28 @@ mod i_to_f {
                                 || ((error_minus == error || error_plus == error)
                                     && ((f0.to_bits() & 1) != 0))
                             {
-                                if !cfg!(any(
-                                    target_arch = "powerpc",
-                                    target_arch = "powerpc64"
-                                )) {
-                                    panic!(
-                                        "incorrect rounding by {}({}): {}, ({}, {}, {}), errors ({}, {}, {})",
-                                        stringify!($fn),
-                                        x,
-                                        f1.to_bits(),
-                                        y_minus_ulp,
-                                        y,
-                                        y_plus_ulp,
-                                        error_minus,
-                                        error,
-                                        error_plus,
-                                    );
-                                }
+                                panic!(
+                                    "incorrect rounding by {}({}): {}, ({}, {}, {}), errors ({}, {}, {})",
+                                    stringify!($fn),
+                                    x,
+                                    f1.to_bits(),
+                                    y_minus_ulp,
+                                    y,
+                                    y_plus_ulp,
+                                    error_minus,
+                                    error,
+                                    error_plus,
+                                );
                             }
                         }
 
-                        // Test against native conversion. We disable testing on all `x86` because of
-                        // rounding bugs with `i686`. `powerpc` also has the same rounding bug.
+                        // Test against native conversion.
+                        // FIXME(x86,ppc): the platform version has rounding bugs on i686 and
+                        // PowerPC64le (for PPC this only shows up in Docker, not the native runner).
+                        // https://github.com/rust-lang/compiler-builtins/pull/384#issuecomment-740413334
                         if !Float::eq_repr(f0, f1) && !cfg!(any(
                             target_arch = "x86",
-                            target_arch = "powerpc",
-                            target_arch = "powerpc64"
+                            all(target_arch = "powerpc64", target_endian = "little")
                         )) {
                             panic!(
                                 "{}({}): std: {:?}, builtins: {:?}",
@@ -118,7 +114,7 @@ mod i_to_f {
         i128, __floattidf;
     }
 
-    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(f128_enabled)]
     #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
     i_to_f! { f128, Quad, not(feature = "no-sys-f128-int-convert"),
         u32, __floatunsitf;
@@ -129,7 +125,7 @@ mod i_to_f {
         i128, __floattitf;
     }
 
-    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(f128_enabled)]
     #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
     i_to_f! { f128, Quad, not(feature = "no-sys-f128-int-convert"),
         u32, __floatunsikf;
diff --git a/builtins-test/tests/div_rem.rs b/builtins-test/tests/div_rem.rs
index 5ae653cc9..caee4166c 100644
--- a/builtins-test/tests/div_rem.rs
+++ b/builtins-test/tests/div_rem.rs
@@ -138,7 +138,7 @@ macro_rules! float {
     };
 }
 
-#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(x86_no_sse))]
 mod float_div {
     use super::*;
 
@@ -147,7 +147,7 @@ mod float_div {
         f64, __divdf3, Double, all();
     }
 
-    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(f128_enabled)]
     #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
     float! {
         f128, __divtf3, Quad,
@@ -156,7 +156,7 @@ mod float_div {
         not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
     }
 
-    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(f128_enabled)]
     #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
     float! {
         f128, __divkf3, Quad, not(feature = "no-sys-f128");
diff --git a/builtins-test/tests/float_pow.rs b/builtins-test/tests/float_pow.rs
index 8209543e6..a17dff27c 100644
--- a/builtins-test/tests/float_pow.rs
+++ b/builtins-test/tests/float_pow.rs
@@ -1,7 +1,7 @@
 #![allow(unused_macros)]
 #![cfg_attr(f128_enabled, feature(f128))]
-#![cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
 
+#[cfg_attr(x86_no_sse, allow(unused))]
 use builtins_test::*;
 
 // This is approximate because of issues related to
@@ -52,14 +52,13 @@ macro_rules! pow {
     };
 }
 
+#[cfg(not(x86_no_sse))] // FIXME(i586): failure for powidf2
 pow! {
     f32, 1e-4, __powisf2, all();
     f64, 1e-12, __powidf2, all();
 }
 
 #[cfg(f128_enabled)]
-// FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly.
-#[cfg(not(target_env = "msvc"))]
 #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
 pow! {
     f128, 1e-36, __powitf2, not(feature = "no-sys-f128");
diff --git a/builtins-test/tests/lse.rs b/builtins-test/tests/lse.rs
index 53167d98f..5d59fbb7f 100644
--- a/builtins-test/tests/lse.rs
+++ b/builtins-test/tests/lse.rs
@@ -1,5 +1,6 @@
 #![feature(decl_macro)] // so we can use pub(super)
-#![cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "no-asm")))]
+#![feature(macro_metavar_expr_concat)]
+#![cfg(all(target_arch = "aarch64", target_os = "linux"))]
 
 /// Translate a byte size to a Rust type.
 macro int_ty {
@@ -87,7 +88,7 @@ test_op!(add, |left, right| left.wrapping_add(right));
 test_op!(clr, |left, right| left & !right);
 test_op!(xor, std::ops::BitXor::bitxor);
 test_op!(or, std::ops::BitOr::bitor);
-
+use compiler_builtins::{foreach_bytes, foreach_ordering};
 compiler_builtins::foreach_cas!(cas::test);
 compiler_builtins::foreach_cas16!(test_cas16);
 compiler_builtins::foreach_swp!(swap::test);
diff --git a/builtins-test/tests/mul.rs b/builtins-test/tests/mul.rs
index 58bc9ab4a..bbf1157db 100644
--- a/builtins-test/tests/mul.rs
+++ b/builtins-test/tests/mul.rs
@@ -1,5 +1,6 @@
-#![allow(unused_macros)]
+#![cfg_attr(f16_enabled, feature(f16))]
 #![cfg_attr(f128_enabled, feature(f128))]
+#![allow(unused_macros)]
 
 use builtins_test::*;
 
@@ -113,10 +114,15 @@ macro_rules! float_mul {
     };
 }
 
-#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(x86_no_sse))]
 mod float_mul {
     use super::*;
 
+    #[cfg(f16_enabled)]
+    float_mul! {
+        f16, __mulhf3, Half, all();
+    }
+
     // FIXME(#616): Stop ignoring arches that don't have native support once fix for builtins is in
     // nightly.
     float_mul! {
@@ -126,7 +132,7 @@ mod float_mul {
 }
 
 #[cfg(f128_enabled)]
-#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(x86_no_sse))]
 #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
 mod float_mul_f128 {
     use super::*;
diff --git a/ci/bench-icount.sh b/ci/bench-icount.sh
index 4d93e257a..12228b9da 100755
--- a/ci/bench-icount.sh
+++ b/ci/bench-icount.sh
@@ -2,10 +2,21 @@
 
 set -eux
 
+target="${1:-}"
+
+if [ -z "$target" ]; then
+    host_target=$(rustc -vV | awk '/^host/ { print $2 }')
+    echo "Defaulted to host target $host_target"
+    target="$host_target"
+fi
+
 iai_home="iai-home"
 
+# Use the arch as a tag to disambiguate artifacts
+tag="$(echo "$target" | cut -d'-' -f1)"
+
 # Download the baseline from master
-./ci/ci-util.py locate-baseline --download --extract
+./ci/ci-util.py locate-baseline --download --extract --tag "$tag"
 
 # Run benchmarks once
 function run_icount_benchmarks() {
@@ -17,7 +28,7 @@ function run_icount_benchmarks() {
 
     iai_args=(
         "--home" "$(pwd)/$iai_home"
-        "--regression=ir=5.0"
+        "--callgrind-limits=ir=5.0"
         "--save-summary"
     )
 
@@ -35,16 +46,18 @@ function run_icount_benchmarks() {
         shift
     done
 
-    # Run iai-callgrind benchmarks
-    cargo bench "${cargo_args[@]}" -- "${iai_args[@]}"
+    # Run iai-callgrind benchmarks. Do this in a subshell with `&& true` to
+    # capture rather than exit on error.
+    (cargo bench "${cargo_args[@]}" -- "${iai_args[@]}") && true
+    exit_code="$?"
 
-    # NB: iai-callgrind should exit on error but does not, so we inspect the sumary
-    # for errors. See  https://github.com/iai-callgrind/iai-callgrind/issues/337
-    if [ -n "${PR_NUMBER:-}" ]; then
-        # If this is for a pull request, ignore regressions if specified.
-        ./ci/ci-util.py check-regressions --home "$iai_home" --allow-pr-override "$PR_NUMBER"
+    if [ "$exit_code" -eq 0 ]; then
+        echo "Benchmarks completed with no regressions"
+    elif [ -z "${PR_NUMBER:-}" ]; then
+        # Disregard regressions after merge
+        echo "Benchmarks completed with regressions; ignoring (not in a PR)"
     else
-        ./ci/ci-util.py check-regressions --home "$iai_home" || true
+        ./ci/ci-util.py handle-bench-regressions "$PR_NUMBER"
     fi
 }
 
@@ -53,6 +66,6 @@ run_icount_benchmarks --features force-soft-floats -- --save-baseline=softfloat
 run_icount_benchmarks -- --save-baseline=hardfloat
 
 # Name and tar the new baseline
-name="baseline-icount-$(date -u +'%Y%m%d%H%M')-${GITHUB_SHA:0:12}"
+name="baseline-icount-$tag-$(date -u +'%Y%m%d%H%M')-${GITHUB_SHA:0:12}"
 echo "BASELINE_NAME=$name" >>"$GITHUB_ENV"
 tar cJf "$name.tar.xz" "$iai_home"
diff --git a/ci/ci-util.py b/ci/ci-util.py
index d785b2e9e..c1db17c6c 100755
--- a/ci/ci-util.py
+++ b/ci/ci-util.py
@@ -7,11 +7,13 @@
 
 import json
 import os
+import pprint
 import re
 import subprocess as sp
 import sys
 from dataclasses import dataclass
-from glob import glob, iglob
+from functools import cache
+from glob import glob
 from inspect import cleandoc
 from os import getenv
 from pathlib import Path
@@ -28,21 +30,20 @@
             Calculate a matrix of which functions had source change, print that as
             a JSON object.
 
-        locate-baseline [--download] [--extract]
+        locate-baseline [--download] [--extract] [--tag TAG]
             Locate the most recent benchmark baseline available in CI and, if flags
             specify, download and extract it. Never exits with nonzero status if
             downloading fails.
 
-            Note that `--extract` will overwrite files in `iai-home`.
+            `--tag` can be specified to look for artifacts with a specific tag, such as
+            for a specific architecture.
 
-        check-regressions [--home iai-home] [--allow-pr-override pr_number]
-            Check `iai-home` (or `iai-home` if unspecified) for `summary.json`
-            files and see if there are any regressions. This is used as a workaround
-            for `iai-callgrind` not exiting with error status; see
-            <https://github.com/iai-callgrind/iai-callgrind/issues/337>.
+            Note that `--extract` will overwrite files in `iai-home`.
 
-            If `--allow-pr-override` is specified, the regression check will not exit
-            with failure if any line in the PR starts with `allow-regressions`.
+        handle-bench-regressions PR_NUMBER
+            Exit with success if the pull request contains a line starting with
+            `ci: allow-regressions`, indicating that regressions in benchmarks should
+            be accepted. Otherwise, exit 1.
     """
 )
 
@@ -50,16 +51,7 @@
 GIT = ["git", "-C", REPO_ROOT]
 DEFAULT_BRANCH = "master"
 WORKFLOW_NAME = "CI"  # Workflow that generates the benchmark artifacts
-ARTIFACT_GLOB = "baseline-icount*"
-# Place this in a PR body to skip regression checks (must be at the start of a line).
-REGRESSION_DIRECTIVE = "ci: allow-regressions"
-# Place this in a PR body to skip extensive tests
-SKIP_EXTENSIVE_DIRECTIVE = "ci: skip-extensive"
-# Place this in a PR body to allow running a large number of extensive tests. If not
-# set, this script will error out if a threshold is exceeded in order to avoid
-# accidentally spending huge amounts of CI time.
-ALLOW_MANY_EXTENSIVE_DIRECTIVE = "ci: allow-many-extensive"
-MANY_EXTENSIVE_THRESHOLD = 20
+ARTIFACT_PREFIX = "baseline-icount*"
 
 # Don't run exhaustive tests if these files change, even if they contaiin a function
 # definition.
@@ -71,7 +63,7 @@
 
 # libm PR CI takes a long time and doesn't need to run unless relevant files have been
 # changed. Anything matching this regex pattern will trigger a run.
-TRIGGER_LIBM_PR_CI = ".*(libm|musl).*"
+TRIGGER_LIBM_CI_FILE_PAT = ".*(libm|musl).*"
 
 TYPES = ["f16", "f32", "f64", "f128"]
 
@@ -81,6 +73,54 @@ def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
 
 
+@dataclass(init=False)
+class PrCfg:
+    """Directives that we allow in the PR body to control test behavior.
+
+    These are of the form `ci: foo`, at the start of a line (indentation is allowed).
+    """
+
+    # Accept benchmark regressions instead of failing the job.
+    allow_regressions: bool = False
+    # Don't run extensive tests
+    skip_extensive: bool = False
+
+    # Allow running a large number of extensive tests. If not set, this script
+    # will error out if a threshold is exceeded in order to avoid accidentally
+    # spending huge amounts of CI time.
+    allow_many_extensive: bool = False
+
+    # Max number of extensive tests to run by default
+    MANY_EXTENSIVE_THRESHOLD: int = 20
+
+    # Run tests for `libm` that may otherwise be skipped due to no changed files.
+    always_test_libm: bool = False
+
+    # String values of directive names
+    DIR_ALLOW_REGRESSIONS: str = "allow-regressions"
+    DIR_SKIP_EXTENSIVE: str = "skip-extensive"
+    DIR_ALLOW_MANY_EXTENSIVE: str = "allow-many-extensive"
+    DIR_TEST_LIBM: str = "test-libm"
+
+    def __init__(self, body: str):
+        directives = re.finditer(r"^\s*ci:\s*(?P<dir_name>\S*)", body, re.MULTILINE)
+        for directive in directives:
+            name = directive.group("dir_name")
+            if name == self.DIR_ALLOW_REGRESSIONS:
+                self.allow_regressions = True
+            elif name == self.DIR_SKIP_EXTENSIVE:
+                self.skip_extensive = True
+            elif name == self.DIR_ALLOW_MANY_EXTENSIVE:
+                self.allow_many_extensive = True
+            elif name == self.DIR_TEST_LIBM:
+                self.always_test_libm = True
+            else:
+                eprint(f"Found unexpected directive `{name}`")
+                exit(1)
+        # Log to stderr: stdout may be captured as workflow output (`$GITHUB_OUTPUT`)
+        pprint.pp(self, stream=sys.stderr)
+
+
 @dataclass
 class PrInfo:
     """GitHub response for PR query"""
@@ -89,10 +129,21 @@ class PrInfo:
     commits: list[str]
     created_at: str
     number: int
+    cfg: PrCfg
 
     @classmethod
-    def load(cls, pr_number: int | str) -> Self:
-        """For a given PR number, query the body and commit list"""
+    def from_env(cls) -> Self | None:
+        """Load the PR named by `PR_NUMBER`; `None` if that is unset or empty."""
+        pr_number = os.environ.get("PR_NUMBER")
+        if not pr_number:
+            # Unset and empty are treated alike: there is no PR to load
+            return None
+        return cls.from_pr(pr_number)
+
+    @classmethod
+    @cache  # Cache so we don't print info messages multiple times
+    def from_pr(cls, pr_number: int | str) -> Self:
+        """For a given PR number, query the body and commit list."""
         pr_info = sp.check_output(
             [
                 "gh",
@@ -105,13 +156,9 @@ def load(cls, pr_number: int | str) -> Self:
             ],
             text=True,
         )
-        eprint("PR info:", json.dumps(pr_info, indent=4))
-        return cls(**json.loads(pr_info))
-
-    def contains_directive(self, directive: str) -> bool:
-        """Return true if the provided directive is on a line in the PR body"""
-        lines = self.body.splitlines()
-        return any(line.startswith(directive) for line in lines)
+        pr_json = json.loads(pr_info)
+        eprint("PR info:", json.dumps(pr_json, indent=4))
+        return cls(**json.loads(pr_info), cfg=PrCfg(pr_json["body"]))
 
 
 class FunctionDef(TypedDict):
@@ -208,26 +255,32 @@ def may_skip_libm_ci(self) -> bool:
         """If this is a PR and no libm files were changed, allow skipping libm
         jobs."""
 
-        if self.is_pr():
-            return all(not re.match(TRIGGER_LIBM_PR_CI, str(f)) for f in self.changed)
+        # Always run on merge CI
+        if not self.is_pr():
+            return False
 
-        return False
+        pr = PrInfo.from_env()
+        assert pr is not None, "Is a PR but couldn't load PrInfo"
+
+        # Allow opting in to libm tests
+        if pr.cfg.always_test_libm:
+            return False
+
+        # By default, run if there are any changed files matching the pattern
+        return all(not re.match(TRIGGER_LIBM_CI_FILE_PAT, str(f)) for f in self.changed)
 
     def emit_workflow_output(self):
         """Create a JSON object a list items for each type's changed files, if any
         did change, and the routines that were affected by the change.
         """
 
-        pr_number = os.environ.get("PR_NUMBER")
         skip_tests = False
         error_on_many_tests = False
 
-        if pr_number is not None and len(pr_number) > 0:
-            pr = PrInfo.load(pr_number)
-            skip_tests = pr.contains_directive(SKIP_EXTENSIVE_DIRECTIVE)
-            error_on_many_tests = not pr.contains_directive(
-                ALLOW_MANY_EXTENSIVE_DIRECTIVE
-            )
+        pr = PrInfo.from_env()
+        if pr is not None:
+            skip_tests = pr.cfg.skip_extensive
+            error_on_many_tests = not pr.cfg.allow_many_extensive
 
             if skip_tests:
                 eprint("Skipping all extensive tests")
@@ -254,16 +307,14 @@ def emit_workflow_output(self):
         may_skip = str(self.may_skip_libm_ci()).lower()
         print(f"extensive_matrix={ext_matrix}")
         print(f"may_skip_libm_ci={may_skip}")
-        eprint(f"extensive_matrix={ext_matrix}")
-        eprint(f"may_skip_libm_ci={may_skip}")
         eprint(f"total extensive tests: {total_to_test}")
 
-        if error_on_many_tests and total_to_test > MANY_EXTENSIVE_THRESHOLD:
+        if error_on_many_tests and total_to_test > PrCfg.MANY_EXTENSIVE_THRESHOLD:
             eprint(
-                f"More than {MANY_EXTENSIVE_THRESHOLD} tests would be run; add"
-                f" `{ALLOW_MANY_EXTENSIVE_DIRECTIVE}` to the PR body if this is"
+                f"More than {PrCfg.MANY_EXTENSIVE_THRESHOLD} tests would be run; add"
+                f" `{PrCfg.DIR_ALLOW_MANY_EXTENSIVE}` to the PR body if this is"
                 " intentional. If this is refactoring that happens to touch a lot of"
-                f" files, `{SKIP_EXTENSIVE_DIRECTIVE}` can be used instead."
+                f" files, `{PrCfg.DIR_SKIP_EXTENSIVE}` can be used instead."
             )
             exit(1)
 
@@ -278,6 +329,7 @@ def locate_baseline(flags: list[str]) -> None:
 
     download = False
     extract = False
+    tag = ""
 
     while len(flags) > 0:
         match flags[0]:
@@ -285,6 +337,9 @@ def locate_baseline(flags: list[str]) -> None:
                 download = True
             case "--extract":
                 extract = True
+            case "--tag":
+                tag = flags[1]
+                flags = flags[1:]
             case _:
                 eprint(USAGE)
                 exit(1)
@@ -333,8 +388,10 @@ def locate_baseline(flags: list[str]) -> None:
         eprint("skipping download step")
         return
 
+    artifact_glob = f"{ARTIFACT_PREFIX}{f"-{tag}" if tag else ""}*"
+
     sp.run(
-        ["gh", "run", "download", str(job_id), f"--pattern={ARTIFACT_GLOB}"],
+        ["gh", "run", "download", str(job_id), f"--pattern={artifact_glob}"],
         check=False,
     )
 
@@ -344,7 +401,7 @@ def locate_baseline(flags: list[str]) -> None:
 
     # Find the baseline with the most recent timestamp. GH downloads the files to e.g.
     # `some-dirname/some-dirname.tar.xz`, so just glob the whole thing together.
-    candidate_baselines = glob(f"{ARTIFACT_GLOB}/{ARTIFACT_GLOB}")
+    candidate_baselines = glob(f"{artifact_glob}/{artifact_glob}")
     if len(candidate_baselines) == 0:
         eprint("no possible baseline directories found")
         return
@@ -356,64 +413,22 @@ def locate_baseline(flags: list[str]) -> None:
     eprint("baseline extracted successfully")
 
 
-def check_iai_regressions(args: list[str]):
-    """Find regressions in iai summary.json files, exit with failure if any are
-    found.
-    """
-
-    iai_home_str = "iai-home"
-    pr_number = None
-
-    while len(args) > 0:
-        match args:
-            case ["--home", home, *rest]:
-                iai_home_str = home
-                args = rest
-            case ["--allow-pr-override", pr_num, *rest]:
-                pr_number = pr_num
-                args = rest
-            case _:
-                eprint(USAGE)
-                exit(1)
-
-    iai_home = Path(iai_home_str)
-
-    found_summaries = False
-    regressions: list[dict] = []
-    for summary_path in iglob("**/summary.json", root_dir=iai_home, recursive=True):
-        found_summaries = True
-        with open(iai_home / summary_path, "r") as f:
-            summary = json.load(f)
-
-        summary_regs = []
-        run = summary["callgrind_summary"]["callgrind_run"]
-        fname = summary["function_name"]
-        id = summary["id"]
-        name_entry = {"name": f"{fname}.{id}"}
-
-        for segment in run["segments"]:
-            summary_regs.extend(segment["regressions"])
+def handle_bench_regressions(args: list[str]):
+    """Exit with error unless the PR body contains an allow-regressions directive."""
 
-        summary_regs.extend(run["total"]["regressions"])
-
-        regressions.extend(name_entry | reg for reg in summary_regs)
-
-    if not found_summaries:
-        eprint(f"did not find any summary.json files within {iai_home}")
-        exit(1)
+    match args:
+        case [pr_number]:
+            pass  # `pr_number` is bound by the pattern itself
+        case _:
+            eprint(USAGE)
+            exit(1)
 
-    if len(regressions) == 0:
-        eprint("No regressions found")
+    pr = PrInfo.from_pr(pr_number)
+    if pr.cfg.allow_regressions:
+        eprint("PR allows regressions")
         return
 
-    eprint("Found regressions:", json.dumps(regressions, indent=4))
-
-    if pr_number is not None:
-        pr = PrInfo.load(pr_number)
-        if pr.contains_directive(REGRESSION_DIRECTIVE):
-            eprint("PR allows regressions, returning")
-            return
-
+    eprint("Regressions were found; benchmark failed")
     exit(1)
 
 
@@ -424,8 +439,8 @@ def main():
             ctx.emit_workflow_output()
         case ["locate-baseline", *flags]:
             locate_baseline(flags)
-        case ["check-regressions", *args]:
-            check_iai_regressions(args)
+        case ["handle-bench-regressions", *args]:
+            handle_bench_regressions(args)
         case ["--help" | "-h"]:
             print(USAGE)
             exit()
diff --git a/ci/docker/aarch64-unknown-linux-gnu/Dockerfile b/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
index df71804ba..69b99f5b6 100644
--- a/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/arm-unknown-linux-gnueabi/Dockerfile b/ci/docker/arm-unknown-linux-gnueabi/Dockerfile
index 38ad1a136..2fa6f8520 100644
--- a/ci/docker/arm-unknown-linux-gnueabi/Dockerfile
+++ b/ci/docker/arm-unknown-linux-gnueabi/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile b/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile
index ffead05d5..85f7335f5 100644
--- a/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile
+++ b/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile b/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
index 9ab49e46e..42511479f 100644
--- a/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
+++ b/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/i586-unknown-linux-gnu/Dockerfile b/ci/docker/i586-unknown-linux-gnu/Dockerfile
index d12ced325..35488c477 100644
--- a/ci/docker/i586-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/i586-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/i686-unknown-linux-gnu/Dockerfile b/ci/docker/i686-unknown-linux-gnu/Dockerfile
index d12ced325..35488c477 100644
--- a/ci/docker/i686-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/i686-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile b/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile
index 62b43da9e..e95a1b916 100644
--- a/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/mips-unknown-linux-gnu/Dockerfile b/ci/docker/mips-unknown-linux-gnu/Dockerfile
index c02a94672..fd1877603 100644
--- a/ci/docker/mips-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/mips-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile b/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile
index 6d8b96069..4e542ce68 100644
--- a/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile
+++ b/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile b/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile
index 7e6ac7c3b..528dfd894 100644
--- a/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile
+++ b/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/mipsel-unknown-linux-gnu/Dockerfile b/ci/docker/mipsel-unknown-linux-gnu/Dockerfile
index 9feadc7b5..257218023 100644
--- a/ci/docker/mipsel-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/mipsel-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/powerpc-unknown-linux-gnu/Dockerfile b/ci/docker/powerpc-unknown-linux-gnu/Dockerfile
index 84dcaf47e..cac1f2361 100644
--- a/ci/docker/powerpc-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/powerpc-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile b/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile
index b90fd5ec5..76127b7db 100644
--- a/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile b/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile
index e6d1d1cd0..da1d56ca6 100644
--- a/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
@@ -12,6 +12,5 @@ ENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
     CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc64le-static \
     AR_powerpc64le_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
     CC_powerpc64le_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
-    QEMU_CPU=POWER8 \
     QEMU_LD_PREFIX=/usr/powerpc64le-linux-gnu \
     RUST_TEST_THREADS=1
diff --git a/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile b/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
index eeb4ed019..513efacd6 100644
--- a/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/thumbv6m-none-eabi/Dockerfile b/ci/docker/thumbv6m-none-eabi/Dockerfile
index ad0d4351e..a9a172a21 100644
--- a/ci/docker/thumbv6m-none-eabi/Dockerfile
+++ b/ci/docker/thumbv6m-none-eabi/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/thumbv7em-none-eabi/Dockerfile b/ci/docker/thumbv7em-none-eabi/Dockerfile
index ad0d4351e..a9a172a21 100644
--- a/ci/docker/thumbv7em-none-eabi/Dockerfile
+++ b/ci/docker/thumbv7em-none-eabi/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/thumbv7em-none-eabihf/Dockerfile b/ci/docker/thumbv7em-none-eabihf/Dockerfile
index ad0d4351e..a9a172a21 100644
--- a/ci/docker/thumbv7em-none-eabihf/Dockerfile
+++ b/ci/docker/thumbv7em-none-eabihf/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/thumbv7m-none-eabi/Dockerfile b/ci/docker/thumbv7m-none-eabi/Dockerfile
index ad0d4351e..a9a172a21 100644
--- a/ci/docker/thumbv7m-none-eabi/Dockerfile
+++ b/ci/docker/thumbv7m-none-eabi/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
index c590adcdd..2ef800129 100644
--- a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
+++ b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
@@ -1,4 +1,4 @@
-ARG IMAGE=ubuntu:24.04
+ARG IMAGE=ubuntu:25.04
 FROM $IMAGE
 
 RUN apt-get update && \
diff --git a/ci/run-docker.sh b/ci/run-docker.sh
index d0122dee5..4c1fe0fe2 100755
--- a/ci/run-docker.sh
+++ b/ci/run-docker.sh
@@ -97,7 +97,7 @@ if [ "${1:-}" = "--help" ] || [ "$#" -gt 1 ]; then
     usage: ./ci/run-docker.sh [target]
 
     you can also set DOCKER_BASE_IMAGE to use something other than the default
-    ubuntu:24.04 (or rustlang/rust:nightly).
+    ubuntu:25.04 (or rustlang/rust:nightly).
     "
     exit
 fi
diff --git a/ci/run.sh b/ci/run.sh
index 68d13c130..bc94d42fe 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -41,136 +41,55 @@ else
     "${test_builtins[@]}" --benches
     "${test_builtins[@]}" --benches --release
 
-    if [ "${TEST_VERBATIM:-}" = "1" ]; then
+    # Validate that having a verbatim path for the target directory works
+    # (trivial to regress using `/` in paths to build artifacts rather than
+    # `Path::join`). MinGW does not currently support these paths.
+    if [[ "$target" = *"windows"* ]] && [[ "$target" != *"gnu"* ]]; then
         verb_path=$(cmd.exe //C echo \\\\?\\%cd%\\builtins-test\\target2)
         "${test_builtins[@]}" --target-dir "$verb_path" --features c
     fi
 fi
 
-
-declare -a rlib_paths
-
-# Set the `rlib_paths` global array to a list of all compiler-builtins rlibs
-update_rlib_paths() {
-    if [ -d /builtins-target ]; then
-        rlib_paths=( /builtins-target/"${target}"/debug/deps/libcompiler_builtins-*.rlib )
-    else
-        rlib_paths=( target/"${target}"/debug/deps/libcompiler_builtins-*.rlib )
-    fi
-}
-
-# Remove any existing artifacts from previous tests that don't set #![compiler_builtins]
-update_rlib_paths
-rm -f "${rlib_paths[@]}"
-
-cargo build -p compiler_builtins --target "$target"
-cargo build -p compiler_builtins --target "$target" --release
-cargo build -p compiler_builtins --target "$target" --features c
-cargo build -p compiler_builtins --target "$target" --features c --release
-cargo build -p compiler_builtins --target "$target" --features no-asm
-cargo build -p compiler_builtins --target "$target" --features no-asm --release
-cargo build -p compiler_builtins --target "$target" --features no-f16-f128
-cargo build -p compiler_builtins --target "$target" --features no-f16-f128 --release
-
-PREFIX=${target//unknown-/}-
-case "$target" in
-    armv7-*)
-        PREFIX=arm-linux-gnueabihf-
-        ;;
-    thumb*)
-        PREFIX=arm-none-eabi-
-        ;;
-    *86*-*)
-        PREFIX=
-        ;;
-esac
-
-NM=$(find "$(rustc --print sysroot)" \( -name llvm-nm -o -name llvm-nm.exe \) )
-if [ "$NM" = "" ]; then
-  NM="${PREFIX}nm"
-fi
-
-# i686-pc-windows-gnu tools have a dependency on some DLLs, so run it with
-# rustup run to ensure that those are in PATH.
-TOOLCHAIN="$(rustup show active-toolchain | sed 's/ (default)//')"
-if [[ "$TOOLCHAIN" == *i686-pc-windows-gnu ]]; then
-  NM="rustup run $TOOLCHAIN $NM"
-fi
-
-# Look out for duplicated symbols when we include the compiler-rt (C) implementation
-update_rlib_paths
-for rlib in "${rlib_paths[@]}"; do
-    set +x
-    echo "================================================================"
-    echo "checking $rlib for duplicate symbols"
-    echo "================================================================"
-    set -x
-    
-    duplicates_found=0
-
-    # NOTE On i586, It's normal that the get_pc_thunk symbol appears several
-    # times so ignore it
-    $NM -g --defined-only "$rlib" 2>&1 |
-      sort |
-      uniq -d |
-      grep -v __x86.get_pc_thunk --quiet |
-      grep 'T __' && duplicates_found=1
-
-    if [ "$duplicates_found" != 0 ]; then
-        echo "error: found duplicate symbols"
-        exit 1
-    else
-        echo "success; no duplicate symbols found"
+# Ensure there are no duplicate symbols or references to `core` when
+# `compiler-builtins` is built with various features. Symcheck invokes Cargo to
+# build with the arguments we provide it, then validates the built artifacts.
+symcheck=(cargo run -p symbol-check --release)
+[[ "$target" = "wasm"* ]] && symcheck+=(--features wasm)
+symcheck+=(-- build-and-check)
+
+"${symcheck[@]}" "$target" -- -p compiler_builtins
+"${symcheck[@]}" "$target" -- -p compiler_builtins --release
+"${symcheck[@]}" "$target" -- -p compiler_builtins --features c
+"${symcheck[@]}" "$target" -- -p compiler_builtins --features c --release
+"${symcheck[@]}" "$target" -- -p compiler_builtins --features no-asm
+"${symcheck[@]}" "$target" -- -p compiler_builtins --features no-asm --release
+"${symcheck[@]}" "$target" -- -p compiler_builtins --features no-f16-f128
+"${symcheck[@]}" "$target" -- -p compiler_builtins --features no-f16-f128 --release
+
+run_intrinsics_test() {
+    build_args=(--verbose --manifest-path builtins-test-intrinsics/Cargo.toml)
+    build_args+=("$@")
+
+    # symcheck also checks the results of builtins-test-intrinsics
+    "${symcheck[@]}" "$target" -- "${build_args[@]}"
+
+    # FIXME: we get access violations on Windows, our entrypoint may need to
+    # be tweaked.
+    if [ "${BUILD_ONLY:-}" != "1" ] && ! [[ "$target" = *"windows"* ]]; then
+        cargo run --target "$target" "${build_args[@]}"
     fi
-done
-
-rm -f "${rlib_paths[@]}"
-
-build_intrinsics_test() {
-    cargo build \
-        --target "$target" --verbose \
-        --manifest-path builtins-test-intrinsics/Cargo.toml "$@"
 }
 
 # Verify that we haven't dropped any intrinsics/symbols
-build_intrinsics_test
-build_intrinsics_test --release
-build_intrinsics_test --features c
-build_intrinsics_test --features c --release
+run_intrinsics_test
+run_intrinsics_test --release
+run_intrinsics_test --features c
+run_intrinsics_test --features c --release
 
 # Verify that there are no undefined symbols to `panic` within our
 # implementations
-CARGO_PROFILE_DEV_LTO=true build_intrinsics_test
-CARGO_PROFILE_RELEASE_LTO=true build_intrinsics_test --release
-
-# Ensure no references to any symbols from core
-update_rlib_paths
-for rlib in "${rlib_paths[@]}"; do
-    set +x
-    echo "================================================================"
-    echo "checking $rlib for references to core"
-    echo "================================================================"
-    set -x
-
-    tmpdir="${CARGO_TARGET_DIR:-target}/tmp"
-    test -d "$tmpdir" || mkdir "$tmpdir"
-    defined="$tmpdir/defined_symbols.txt"
-    undefined="$tmpdir/defined_symbols.txt"
-
-    $NM --quiet -U "$rlib" | grep 'T _ZN4core' | awk '{print $3}' | sort | uniq > "$defined"
-    $NM --quiet -u "$rlib" | grep 'U _ZN4core' | awk '{print $2}' | sort | uniq > "$undefined"
-    grep_has_results=0
-    grep -v -F -x -f "$defined" "$undefined" && grep_has_results=1
-
-    if [ "$target" = "powerpc64-unknown-linux-gnu" ]; then
-        echo "FIXME: powerpc64 fails these tests"
-    elif [ "$grep_has_results" != 0 ]; then
-        echo "error: found unexpected references to core"
-        exit 1
-    else
-        echo "success; no references to core found"
-    fi
-done
+CARGO_PROFILE_DEV_LTO=true run_intrinsics_test
+CARGO_PROFILE_RELEASE_LTO=true run_intrinsics_test --release
 
 # Test libm
 
@@ -245,7 +164,7 @@ else
     mflags+=(--workspace --target "$target")
     cmd=(cargo test "${mflags[@]}")
     profile_flag="--profile"
-    
+
     # If nextest is available, use that
     command -v cargo-nextest && nextest=1 || nextest=0
     if [ "$nextest" = "1" ]; then
@@ -288,7 +207,7 @@ else
     "${cmd[@]}" "$profile_flag" release-checked --features unstable-intrinsics --benches
 
     # Ensure that the routines do not panic.
-    # 
+    #
     # `--tests` must be passed because no-panic is only enabled as a dev
     # dependency. The `release-opt` profile must be used to enable LTO and a
     # single CGU.
diff --git a/ci/update-musl.sh b/ci/update-musl.sh
new file mode 100755
index 000000000..637ab1394
--- /dev/null
+++ b/ci/update-musl.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+# Download musl to a repository for `musl-math-sys`
+#
+# Clones the musl mirror into `$dst` if that directory does not exist yet,
+# then fetches and checks out the pinned commit `$ref`.
+
+set -eux
+
+url=https://github.com/kraj/musl.git
+ref=c47ad25ea3b484e10326f933e927c0bc8cded3da
+dst=crates/musl-math-sys/musl
+
+if ! [ -d "$dst" ]; then
+    git clone "$url" "$dst" --single-branch --depth=1000
+fi
+
+# Fetch the pinned commit explicitly. A bare `git fetch "$url" --depth=1` only
+# retrieves the tip of the remote's default branch, so the checkout below would
+# fail whenever `$ref` is not already part of the local (shallow) history.
+git -C "$dst" fetch "$url" "$ref" --depth=1
+git -C "$dst" checkout "$ref"
diff --git a/compiler-builtins/CHANGELOG.md b/compiler-builtins/CHANGELOG.md
index f0af37ba0..880e56c44 100644
--- a/compiler-builtins/CHANGELOG.md
+++ b/compiler-builtins/CHANGELOG.md
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.160](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.159...compiler_builtins-v0.1.160) - 2025-05-29
+
+### Other
+
+- Change `compiler-builtins` to edition 2024
+- Remove unneeded C symbols
+- Reuse `libm`'s `Cast` and `CastFrom` in `compiler-builtins`
+- Reuse `MinInt` and `Int` from `libm` in `compiler-builtins`
+- Update `CmpResult` to use a pointer-sized return type
+- Enable `__powitf2` on MSVC
+- Fix `i256::MAX`
+- Add a note saying why we use `frintx` rather than `frintn`
+- Typo in README.md
+- Clean up unused files
+
+## [0.1.159](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.158...compiler_builtins-v0.1.159) - 2025-05-12
+
+### Other
+
+- Remove cfg(bootstrap)
+
 ## [0.1.158](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.157...compiler_builtins-v0.1.158) - 2025-05-06
 
 ### Other
diff --git a/compiler-builtins/Cargo.toml b/compiler-builtins/Cargo.toml
index 81f708c48..8bbe136ce 100644
--- a/compiler-builtins/Cargo.toml
+++ b/compiler-builtins/Cargo.toml
@@ -1,31 +1,32 @@
+# NOTE: Must be kept in sync with `../builtins-shim/Cargo.toml`.
+#
+# This manifest is actually used in-tree by rust-lang/rust;
+# `../builtins-shim/Cargo.toml` is used by out-of-tree testing. See the other
+# manifest for further details.
+
 [package]
-authors = ["Jorge Aparicio <japaricious@gmail.com>"]
 name = "compiler_builtins"
-version = "0.1.158"
-license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
-readme = "README.md"
-repository = "https://github.com/rust-lang/compiler-builtins"
-homepage = "https://github.com/rust-lang/compiler-builtins"
-documentation = "https://docs.rs/compiler_builtins"
-edition = "2021"
+version = "0.1.160"
+authors = ["Jorge Aparicio <japaricious@gmail.com>"]
 description = "Compiler intrinsics used by the Rust compiler."
+repository = "https://github.com/rust-lang/compiler-builtins"
+license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
+edition = "2024"
+publish = false
 links = "compiler-rt"
 
 [lib]
 bench = false
 doctest = false
 test = false
+# make sure this crate isn't included in public standard library docs
+doc = false
 
 [dependencies]
-# For more information on this dependency see
-# https://github.com/rust-lang/rust/tree/master/library/rustc-std-workspace-core
-core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
+core = { path = "../../core", optional = true }
 
 [build-dependencies]
-cc = { optional = true, version = "1.0" }
-
-[dev-dependencies]
-panic-handler = { path = "../crates/panic-handler" }
+cc = { optional = true, version = "1.2" }
 
 [features]
 default = ["compiler-builtins"]
@@ -34,8 +35,9 @@ default = ["compiler-builtins"]
 # implementations and also filling in unimplemented intrinsics
 c = ["dep:cc"]
 
-# Workaround for the Cranelift codegen backend. Disables any implementations
-# which use inline assembly and fall back to pure Rust versions (if available).
+# For implementations where there is both a generic version and a platform-
+# specific version, use the generic version. This is meant to enable testing
+# the generic versions on all platforms.
 no-asm = []
 
 # Workaround for codegen backends which haven't yet implemented `f16` and
@@ -58,7 +60,3 @@ rustc-dep-of-std = ["compiler-builtins", "dep:core"]
 # This makes certain traits and function specializations public that
 # are not normally public but are required by the `builtins-test`
 unstable-public-internals = []
-
-[lints.rust]
-# The cygwin config can be dropped after our benchmark toolchain is bumped
-unexpected_cfgs = { level = "warn", check-cfg = ['cfg(bootstrap)', 'cfg(target_os, values("cygwin"))'] }
diff --git a/compiler-builtins/LICENSE.txt b/compiler-builtins/LICENSE.txt
deleted file mode 120000
index 4ab43736a..000000000
--- a/compiler-builtins/LICENSE.txt
+++ /dev/null
@@ -1 +0,0 @@
-../LICENSE.txt
\ No newline at end of file
diff --git a/compiler-builtins/build.rs b/compiler-builtins/build.rs
index 90d98ec7c..43b978606 100644
--- a/compiler-builtins/build.rs
+++ b/compiler-builtins/build.rs
@@ -1,11 +1,8 @@
 mod configure;
 
-use std::collections::BTreeMap;
 use std::env;
-use std::path::PathBuf;
-use std::sync::atomic::Ordering;
 
-use configure::{Target, configure_aliases, configure_f16_f128};
+use configure::{Target, configure_aliases};
 
 fn main() {
     println!("cargo::rerun-if-changed=build.rs");
@@ -15,13 +12,15 @@ fn main() {
     let cwd = env::current_dir().unwrap();
 
     configure_check_cfg();
-    configure_f16_f128(&target);
     configure_aliases(&target);
 
     configure_libm(&target);
 
     println!("cargo:compiler-rt={}", cwd.join("compiler-rt").display());
 
+    println!("cargo::rustc-check-cfg=cfg(kernel_user_helpers)");
+    println!("cargo::rustc-check-cfg=cfg(feature, values(\"mem-unaligned\"))");
+
     // Emscripten's runtime includes all the builtins
     if target.os == "emscripten" {
         return;
@@ -47,7 +46,6 @@ fn main() {
     }
 
     // These targets have hardware unaligned access support.
-    println!("cargo::rustc-check-cfg=cfg(feature, values(\"mem-unaligned\"))");
     if target.arch.contains("x86_64")
         || target.arch.contains("x86")
         || target.arch.contains("aarch64")
@@ -78,17 +76,12 @@ fn main() {
     // Only emit the ARM Linux atomic emulation on pre-ARMv6 architectures. This
     // includes the old androideabi. It is deprecated but it is available as a
     // rustc target (arm-linux-androideabi).
-    println!("cargo::rustc-check-cfg=cfg(kernel_user_helpers)");
     if llvm_target[0] == "armv4t"
         || llvm_target[0] == "armv5te"
         || target.triple == "arm-linux-androideabi"
     {
         println!("cargo:rustc-cfg=kernel_user_helpers")
     }
-
-    if llvm_target[0].starts_with("aarch64") {
-        generate_aarch64_outlined_atomics();
-    }
 }
 
 /// Run configuration for `libm` since it is included directly.
@@ -113,13 +106,6 @@ fn configure_libm(target: &Target) {
         println!("cargo:rustc-cfg=optimizations_enabled");
     }
 
-    // Config shorthands
-    println!("cargo:rustc-check-cfg=cfg(x86_no_sse)");
-    if target.arch == "x86" && !target.features.iter().any(|f| f == "sse") {
-        // Shorthand to detect i586 targets
-        println!("cargo:rustc-cfg=x86_no_sse");
-    }
-
     println!(
         "cargo:rustc-env=CFG_CARGO_FEATURES={:?}",
         target.cargo_features
@@ -131,61 +117,6 @@ fn configure_libm(target: &Target) {
     println!("cargo:rustc-cfg=feature=\"unstable-intrinsics\"");
 }
 
-fn aarch64_symbol(ordering: Ordering) -> &'static str {
-    match ordering {
-        Ordering::Relaxed => "relax",
-        Ordering::Acquire => "acq",
-        Ordering::Release => "rel",
-        Ordering::AcqRel => "acq_rel",
-        _ => panic!("unknown symbol for {ordering:?}"),
-    }
-}
-
-/// The `concat_idents` macro is extremely annoying and doesn't allow us to define new items.
-/// Define them from the build script instead.
-/// Note that the majority of the code is still defined in `aarch64.rs` through inline macros.
-fn generate_aarch64_outlined_atomics() {
-    use std::fmt::Write;
-    // #[macro_export] so that we can use this in tests
-    let gen_macro =
-        |name| format!("#[macro_export] macro_rules! foreach_{name} {{ ($macro:path) => {{\n");
-
-    // Generate different macros for add/clr/eor/set so that we can test them separately.
-    let sym_names = ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"];
-    let mut macros = BTreeMap::new();
-    for sym in sym_names {
-        macros.insert(sym, gen_macro(sym));
-    }
-
-    // Only CAS supports 16 bytes, and it has a different implementation that uses a different macro.
-    let mut cas16 = gen_macro("cas16");
-
-    for ordering in [
-        Ordering::Relaxed,
-        Ordering::Acquire,
-        Ordering::Release,
-        Ordering::AcqRel,
-    ] {
-        let sym_ordering = aarch64_symbol(ordering);
-        for size in [1, 2, 4, 8] {
-            for (sym, macro_) in &mut macros {
-                let name = format!("__aarch64_{sym}{size}_{sym_ordering}");
-                writeln!(macro_, "$macro!( {ordering:?}, {size}, {name} );").unwrap();
-            }
-        }
-        let name = format!("__aarch64_cas16_{sym_ordering}");
-        writeln!(cas16, "$macro!( {ordering:?}, {name} );").unwrap();
-    }
-
-    let mut buf = String::new();
-    for macro_def in macros.values().chain(std::iter::once(&cas16)) {
-        buf += macro_def;
-        buf += "}; }\n";
-    }
-    let out_dir = PathBuf::from(std::env::var("OUT_DIR").unwrap());
-    std::fs::write(out_dir.join("outlined_atomics.rs"), buf).unwrap();
-}
-
 /// Emit directives for features we expect to support that aren't in `Cargo.toml`.
 ///
 /// These are mostly cfg elements emitted by this `build.rs`.
@@ -555,7 +486,6 @@ mod c {
 
         if (target.arch == "aarch64" || target.arch == "arm64ec") && consider_float_intrinsics {
             sources.extend(&[
-                ("__comparetf2", "comparetf2.c"),
                 ("__fe_getround", "fp_mode.c"),
                 ("__fe_raise_inexact", "fp_mode.c"),
             ]);
@@ -570,11 +500,11 @@ mod c {
         }
 
         if target.arch == "mips64" {
-            sources.extend(&[("__netf2", "comparetf2.c"), ("__fe_getround", "fp_mode.c")]);
+            sources.extend(&[("__fe_getround", "fp_mode.c")]);
         }
 
         if target.arch == "loongarch64" {
-            sources.extend(&[("__netf2", "comparetf2.c"), ("__fe_getround", "fp_mode.c")]);
+            sources.extend(&[("__fe_getround", "fp_mode.c")]);
         }
 
         // Remove the assembly implementations that won't compile for the target
diff --git a/compiler-builtins/configure.rs b/compiler-builtins/configure.rs
index d825f35a9..79e238abc 100644
--- a/compiler-builtins/configure.rs
+++ b/compiler-builtins/configure.rs
@@ -1,6 +1,6 @@
 // Configuration that is shared between `compiler_builtins` and `builtins_test`.
 
-use std::env;
+use std::{env, str};
 
 #[derive(Debug)]
 #[allow(dead_code)]
@@ -16,6 +16,8 @@ pub struct Target {
     pub pointer_width: u8,
     pub little_endian: bool,
     pub features: Vec<String>,
+    pub reliable_f128: bool,
+    pub reliable_f16: bool,
 }
 
 impl Target {
@@ -51,6 +53,10 @@ impl Target {
                 .split(",")
                 .map(ToOwned::to_owned)
                 .collect(),
+            // Note that these are unstable options, so they only show up with the nightly
+            // compiler or with `RUSTC_BOOTSTRAP=1` (which is required to use the types anyway).
+            reliable_f128: env::var_os("CARGO_CFG_TARGET_HAS_RELIABLE_F128").is_some(),
+            reliable_f16: env::var_os("CARGO_CFG_TARGET_HAS_RELIABLE_F16").is_some(),
         }
     }
 
@@ -74,63 +80,31 @@ pub fn configure_aliases(target: &Target) {
     if target.triple_split[0] == "thumbv6m" || target.triple_split[0] == "thumbv8m.base" {
         println!("cargo:rustc-cfg=thumb_1")
     }
-}
 
-/// Configure whether or not `f16` and `f128` support should be enabled.
-pub fn configure_f16_f128(target: &Target) {
-    // Set whether or not `f16` and `f128` are supported at a basic level by LLVM. This only means
-    // that the backend will not crash when using these types and generates code that can be called
-    // without crashing (no infinite recursion). This does not mean that the platform doesn't have
-    // ABI or other bugs.
-    //
-    // We do this here rather than in `rust-lang/rust` because configuring via cargo features is
-    // not straightforward.
-    //
-    // Original source of this list:
-    // <https://github.com/rust-lang/compiler-builtins/pull/652#issuecomment-2266151350>
-    let f16_enabled = match target.arch.as_str() {
-        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
-        "arm64ec" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/50374>
-        "s390x" => false,
-        // Infinite recursion <https://github.com/llvm/llvm-project/issues/97981>
-        "csky" => false,
-        "hexagon" => false,
-        "powerpc" | "powerpc64" => false,
-        "sparc" | "sparc64" => false,
-        "wasm32" | "wasm64" => false,
-        // Most everything else works as of LLVM 19
-        _ => true,
-    };
+    // Config shorthands
+    println!("cargo:rustc-check-cfg=cfg(x86_no_sse)");
+    if target.arch == "x86" && !target.features.iter().any(|f| f == "sse") {
+        // Shorthand to detect i586 targets
+        println!("cargo:rustc-cfg=x86_no_sse");
+    }
 
-    let f128_enabled = match target.arch.as_str() {
-        // Unsupported (libcall is not supported) <https://github.com/llvm/llvm-project/issues/121122>
-        "amdgpu" => false,
-        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
-        "arm64ec" => false,
-        // FIXME(llvm20): fixed by <https://github.com/llvm/llvm-project/pull/117525>
-        "mips64" | "mips64r6" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/95471>
-        "nvptx64" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/101545>
-        "powerpc64" if &target.os == "aix" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/41838>
-        "sparc" => false,
-        // Most everything else works as of LLVM 19
-        _ => true,
-    };
+    /* Not all backends support `f16` and `f128` to the same level on all architectures, so we
+     * need to disable things if the compiler may crash. See configuration at:
+     * * https://github.com/rust-lang/rust/blob/c65dccabacdfd6c8a7f7439eba13422fdd89b91e/compiler/rustc_codegen_llvm/src/llvm_util.rs#L367-L432
+     * * https://github.com/rust-lang/rustc_codegen_gcc/blob/4b5c44b14166083eef8d71f15f5ea1f53fc976a0/src/lib.rs#L496-L507
+     * * https://github.com/rust-lang/rustc_codegen_cranelift/blob/c713ffab3c6e28ab4b4dd4e392330f786ea657ad/src/lib.rs#L196-L226
+     */
 
-    // If the feature is set, disable these types.
-    let disable_both = env::var_os("CARGO_FEATURE_NO_F16_F128").is_some();
+    // If the feature is set, disable both of these types.
+    let no_f16_f128 = target.cargo_features.iter().any(|s| s == "no-f16-f128");
 
     println!("cargo::rustc-check-cfg=cfg(f16_enabled)");
-    println!("cargo::rustc-check-cfg=cfg(f128_enabled)");
-
-    if f16_enabled && !disable_both {
+    if target.reliable_f16 && !no_f16_f128 {
         println!("cargo::rustc-cfg=f16_enabled");
     }
 
-    if f128_enabled && !disable_both {
+    println!("cargo::rustc-check-cfg=cfg(f128_enabled)");
+    if target.reliable_f128 && !no_f16_f128 {
         println!("cargo::rustc-cfg=f128_enabled");
     }
 }
diff --git a/compiler-builtins/src/aarch64.rs b/compiler-builtins/src/aarch64.rs
index 80392187c..039fab206 100644
--- a/compiler-builtins/src/aarch64.rs
+++ b/compiler-builtins/src/aarch64.rs
@@ -4,8 +4,8 @@ use core::intrinsics;
 
 intrinsics! {
     #[unsafe(naked)]
-    #[cfg(all(target_os = "uefi", not(feature = "no-asm")))]
-    pub unsafe extern "C" fn __chkstk() {
+    #[cfg(target_os = "uefi")]
+    pub unsafe extern "custom" fn __chkstk() {
         core::arch::naked_asm!(
             ".p2align 2",
             "lsl    x16, x15, #4",
diff --git a/compiler-builtins/src/aarch64_linux.rs b/compiler-builtins/src/aarch64_linux.rs
index e238d0237..01d7fb473 100644
--- a/compiler-builtins/src/aarch64_linux.rs
+++ b/compiler-builtins/src/aarch64_linux.rs
@@ -4,10 +4,7 @@
 //! To avoid breaking backwards compat, C toolchains introduced a concept of "outlined atomics",
 //! where atomic operations call into the compiler runtime to dispatch between two depending on
 //! which is supported on the current CPU.
-//! See https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10#:~:text=out%20of%20line%20atomics for more discussion.
-//!
-//! Currently we only support LL/SC, because LSE requires `getauxval` from libc in order to do runtime detection.
-//! Use the `compiler-rt` intrinsics if you want LSE support.
+//! See <https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10#:~:text=out%20of%20line%20atomics> for more discussion.
 //!
 //! Ported from `aarch64/lse.S` in LLVM's compiler-rt.
 //!
@@ -24,7 +21,18 @@
 //! We do something similar, but with macro arguments.
 #![cfg_attr(feature = "c", allow(unused_macros))] // avoid putting the macros into a submodule
 
-// We don't do runtime dispatch so we don't have to worry about the `__aarch64_have_lse_atomics` global ctor.
+use core::sync::atomic::{AtomicU8, Ordering};
+
+/// Non-zero if the host supports LSE atomics; starts at 0 and is set to 1 by `__rust_enable_lse`.
+static HAVE_LSE_ATOMICS: AtomicU8 = AtomicU8::new(0);
+
+intrinsics! {
+    /// Switch the outlined atomic operations over to LSE instructions. The caller must first
+    /// verify that LSE operations are supported; this only sets the flag (relaxed store).
+    pub extern "C" fn __rust_enable_lse() {
+        HAVE_LSE_ATOMICS.store(1, Ordering::Relaxed);
+    }
+}
 
 /// Translate a byte size to a Rust type.
 #[rustfmt::skip]
@@ -45,6 +53,7 @@ macro_rules! reg {
     (2, $num:literal) => { concat!("w", $num) };
     (4, $num:literal) => { concat!("w", $num) };
     (8, $num:literal) => { concat!("x", $num) };
+    (16, $num:literal) => { concat!("x", $num) };
 }
 
 /// Given an atomic ordering, translate it to the acquire suffix for the lxdr aarch64 ASM instruction.
@@ -126,6 +135,41 @@ macro_rules! stxp {
     };
 }
 
+// If `{have_lse}` is non-zero, perform the requested LSE op and return; otherwise fall through to label `8:`.
+macro_rules! try_lse_op { // the enclosing asm block must bind a `have_lse` operand
+    ($op: literal, $ordering:ident, $bytes:tt, $($reg:literal,)* [ $mem:ident ] ) => {
+        concat!(
+            ".arch_extension lse; ",
+            "adrp    x16, {have_lse}; ", // x16 = page address of the flag
+            "ldrb    w16, [x16, :lo12:{have_lse}]; ", // w16 = the flag byte itself
+            "cbz     w16, 8f; ", // flag is zero: skip the LSE instruction
+            // LSE_OP  s(reg),* [$mem]
+            concat!(lse!($op, $ordering, $bytes), $( " ", reg!($bytes, $reg), ", " ,)* "[", stringify!($mem), "]; ",),
+            "ret; ",
+            "8:" // callers place their non-LSE sequence after this label
+        )
+    };
+}
+
+// Translate a memory ordering into the suffix appended to an LSE mnemonic (empty for `Relaxed`).
+#[rustfmt::skip]
+macro_rules! lse_mem_sfx {
+    (Relaxed) => { "" };
+    (Acquire) => { "a" };
+    (Release) => { "l" };
+    (AcqRel) => { "al" };
+}
+
+// Build the aarch64 LSE mnemonic for `$op` from the memory ordering and the operand width in bytes.
+macro_rules! lse {
+    ($op:literal, $order:ident, 16) => { // 16 bytes: `<op>p<order>`, no size suffix
+        concat!($op, "p", lse_mem_sfx!($order))
+    };
+    ($op:literal, $order:ident, $bytes:tt) => { // any other width: `<op><order><size>`
+        concat!($op, lse_mem_sfx!($order), size!($bytes))
+    };
+}
+
 /// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.compare_and_swap>.
 macro_rules! compare_and_swap {
     ($ordering:ident, $bytes:tt, $name:ident) => {
@@ -137,7 +181,9 @@ macro_rules! compare_and_swap {
             ) -> int_ty!($bytes) {
                 // We can't use `AtomicI8::compare_and_swap`; we *are* compare_and_swap.
                 core::arch::naked_asm! {
-                    // UXT s(tmp0), s(0)
+                    // CAS    s(0), s(1), [x2]; if LSE supported.
+                    try_lse_op!("cas", $ordering, $bytes, 0, 1, [x2]),
+                    // UXT    s(tmp0), s(0)
                     concat!(uxt!($bytes), " ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                     "0:",
                     // LDXR   s(0), [x2]
@@ -150,6 +196,7 @@ macro_rules! compare_and_swap {
                     "cbnz   w17, 0b",
                     "1:",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -166,6 +213,8 @@ macro_rules! compare_and_swap_i128 {
                 expected: i128, desired: i128, ptr: *mut i128
             ) -> i128 {
                 core::arch::naked_asm! {
+                    // CASP   x0, x1, x2, x3, [x4]; if LSE supported.
+                    try_lse_op!("cas", $ordering, 16, 0, 1, 2, 3, [x4]),
                     "mov    x16, x0",
                     "mov    x17, x1",
                     "0:",
@@ -179,6 +228,7 @@ macro_rules! compare_and_swap_i128 {
                     "cbnz   w15, 0b",
                     "1:",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -195,6 +245,8 @@ macro_rules! swap {
                 left: int_ty!($bytes), right_ptr: *mut int_ty!($bytes)
             ) -> int_ty!($bytes) {
                 core::arch::naked_asm! {
+                    // SWP    s(0), s(0), [x1]; if LSE supported.
+                    try_lse_op!("swp", $ordering, $bytes, 0, 0, [x1]),
                     // mov    s(tmp0), s(0)
                     concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                     "0:",
@@ -204,6 +256,7 @@ macro_rules! swap {
                     concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
                     "cbnz   w17, 0b",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -212,7 +265,7 @@ macro_rules! swap {
 
 /// See (e.g.) <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.fetch_add>.
 macro_rules! fetch_op {
-    ($ordering:ident, $bytes:tt, $name:ident, $op:literal) => {
+    ($ordering:ident, $bytes:tt, $name:ident, $op:literal, $lse_op:literal) => {
         intrinsics! {
             #[maybe_use_optimized_c_shim]
             #[unsafe(naked)]
@@ -220,6 +273,8 @@ macro_rules! fetch_op {
                 val: int_ty!($bytes), ptr: *mut int_ty!($bytes)
             ) -> int_ty!($bytes) {
                 core::arch::naked_asm! {
+                    // LSEOP  s(0), s(0), [x1]; if LSE supported.
+                    try_lse_op!($lse_op, $ordering, $bytes, 0, 0, [x1]),
                     // mov    s(tmp0), s(0)
                     concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                     "0:",
@@ -231,6 +286,7 @@ macro_rules! fetch_op {
                     concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
                     "cbnz  w15, 0b",
                     "ret",
+                    have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
                 }
             }
         }
@@ -240,30 +296,100 @@ macro_rules! fetch_op {
 // We need a single macro to pass to `foreach_ldadd`.
 macro_rules! add {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "add" }
+        fetch_op! { $ordering, $bytes, $name, "add", "ldadd" }
     };
 }
 
 macro_rules! and {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "bic" }
+        fetch_op! { $ordering, $bytes, $name, "bic", "ldclr" }
     };
 }
 
 macro_rules! xor {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "eor" }
+        fetch_op! { $ordering, $bytes, $name, "eor", "ldeor" }
     };
 }
 
 macro_rules! or {
     ($ordering:ident, $bytes:tt, $name:ident) => {
-        fetch_op! { $ordering, $bytes, $name, "orr" }
+        fetch_op! { $ordering, $bytes, $name, "orr", "ldset" }
+    };
+}
+
+#[macro_export]
+macro_rules! foreach_ordering { // invoke `$macro` once per memory ordering, suffixing `$name` to match
+    ($macro:path, $bytes:tt, $name:ident) => { // sized form: also forwards `$bytes`
+        $macro!( Relaxed, $bytes, ${concat($name, _relax)} );
+        $macro!( Acquire, $bytes, ${concat($name, _acq)} );
+        $macro!( Release, $bytes, ${concat($name, _rel)} );
+        $macro!( AcqRel, $bytes, ${concat($name, _acq_rel)} );
+    };
+    ($macro:path, $name:ident) => { // unsized form: `$macro` receives only the ordering and the name
+        $macro!( Relaxed, ${concat($name, _relax)} );
+        $macro!( Acquire, ${concat($name, _acq)} );
+        $macro!( Release, ${concat($name, _rel)} );
+        $macro!( AcqRel, ${concat($name, _acq_rel)} );
+    };
+}
+
+#[macro_export]
+macro_rules! foreach_bytes { // for widths 1, 2, 4 and 8: build `__aarch64_<name><width>`, then fan out per ordering
+    ($macro:path, $name:ident) => {
+        foreach_ordering!( $macro, 1, ${concat(__aarch64_, $name, "1")} );
+        foreach_ordering!( $macro, 2, ${concat(__aarch64_, $name, "2")} );
+        foreach_ordering!( $macro, 4, ${concat(__aarch64_, $name, "4")} );
+        foreach_ordering!( $macro, 8, ${concat(__aarch64_, $name, "8")} );
+    };
+}
+
+/// Each operation (cas/swp/add/clr/eor/set) gets its own `foreach_*` macro so that we can test them separately.
+#[macro_export]
+macro_rules! foreach_cas {
+    ($macro:path) => {
+        foreach_bytes!($macro, cas); // `__aarch64_cas{1,2,4,8}_{relax,acq,rel,acq_rel}`
+    };
+}
+
+/// Only CAS has a 16-byte variant; it is implemented by a separate macro, so no byte count is passed.
+#[macro_export]
+macro_rules! foreach_cas16 {
+    ($macro:path) => {
+        foreach_ordering!($macro, __aarch64_cas16); // `__aarch64_cas16_{relax,acq,rel,acq_rel}`
+    };
+}
+#[macro_export]
+macro_rules! foreach_swp {
+    ($macro:path) => {
+        foreach_bytes!($macro, swp); // `__aarch64_swp{1,2,4,8}_{relax,acq,rel,acq_rel}`
+    };
+}
+#[macro_export]
+macro_rules! foreach_ldadd {
+    ($macro:path) => {
+        foreach_bytes!($macro, ldadd); // `__aarch64_ldadd{1,2,4,8}_{relax,acq,rel,acq_rel}`
+    };
+}
+#[macro_export]
+macro_rules! foreach_ldclr {
+    ($macro:path) => {
+        foreach_bytes!($macro, ldclr); // `__aarch64_ldclr{1,2,4,8}_{relax,acq,rel,acq_rel}`
+    };
+}
+#[macro_export]
+macro_rules! foreach_ldeor {
+    ($macro:path) => {
+        foreach_bytes!($macro, ldeor); // `__aarch64_ldeor{1,2,4,8}_{relax,acq,rel,acq_rel}`
+    };
+}
+#[macro_export]
+macro_rules! foreach_ldset {
+    ($macro:path) => {
+        foreach_bytes!($macro, ldset); // `__aarch64_ldset{1,2,4,8}_{relax,acq,rel,acq_rel}`
     };
 }
 
-// See `generate_aarch64_outlined_atomics` in build.rs.
-include!(concat!(env!("OUT_DIR"), "/outlined_atomics.rs"));
 foreach_cas!(compare_and_swap);
 foreach_cas16!(compare_and_swap_i128);
 foreach_swp!(swap);
diff --git a/compiler-builtins/src/arm.rs b/compiler-builtins/src/arm.rs
index a9107e3cd..0c15b37df 100644
--- a/compiler-builtins/src/arm.rs
+++ b/compiler-builtins/src/arm.rs
@@ -1,16 +1,16 @@
-#![cfg(not(feature = "no-asm"))]
-
 // Interfaces used by naked trampolines.
-extern "C" {
+// SAFETY: these are defined in compiler-builtins
+unsafe extern "C" {
     fn __udivmodsi4(a: u32, b: u32, rem: *mut u32) -> u32;
     fn __udivmoddi4(a: u64, b: u64, rem: *mut u64) -> u64;
     fn __divmoddi4(a: i64, b: i64, rem: *mut i64) -> i64;
 }
 
-extern "aapcs" {
+// SAFETY: these are defined in compiler-builtins
+unsafe extern "custom" {
     // AAPCS is not always the correct ABI for these intrinsics, but we only use this to
     // forward another `__aeabi_` call so it doesn't matter.
-    fn __aeabi_idiv(a: i32, b: i32) -> i32;
+    fn __aeabi_idiv(); // NOTE(review): signature dropped, presumably because `extern "custom"` items cannot declare one — confirm
 }
 
 intrinsics! {
@@ -18,7 +18,7 @@ intrinsics! {
     // custom calling convention which can't be implemented using a normal Rust function.
     #[unsafe(naked)]
     #[cfg(not(target_env = "msvc"))]
-    pub unsafe extern "C" fn __aeabi_uidivmod() {
+    pub unsafe extern "custom" fn __aeabi_uidivmod() {
         core::arch::naked_asm!(
             "push {{lr}}",
             "sub sp, sp, #4",
@@ -32,7 +32,7 @@ intrinsics! {
     }
 
     #[unsafe(naked)]
-    pub unsafe extern "C" fn __aeabi_uldivmod() {
+    pub unsafe extern "custom" fn __aeabi_uldivmod() {
         core::arch::naked_asm!(
             "push {{r4, lr}}",
             "sub sp, sp, #16",
@@ -48,7 +48,7 @@ intrinsics! {
     }
 
     #[unsafe(naked)]
-    pub unsafe extern "C" fn __aeabi_idivmod() {
+    pub unsafe extern "custom" fn __aeabi_idivmod() {
         core::arch::naked_asm!(
             "push {{r0, r1, r4, lr}}",
             "bl {trampoline}",
@@ -61,7 +61,7 @@ intrinsics! {
     }
 
     #[unsafe(naked)]
-    pub unsafe extern "C" fn __aeabi_ldivmod() {
+    pub unsafe extern "custom" fn __aeabi_ldivmod() {
         core::arch::naked_asm!(
             "push {{r4, lr}}",
             "sub sp, sp, #16",
@@ -132,8 +132,8 @@ intrinsics! {
     /// eight bytes.
     #[cfg(not(target_vendor = "apple"))]
     pub unsafe extern "aapcs" fn __aeabi_memcpy8(dst: *mut u8, src: *const u8, n: usize) {
-        debug_assert!(dst.addr() & 7 == 0);
-        debug_assert!(src.addr() & 7 == 0);
+        debug_assert!(dst.addr().is_multiple_of(8));
+        debug_assert!(src.addr().is_multiple_of(8));
 
         // SAFETY: memcpy preconditions apply, less strict alignment.
         unsafe { __aeabi_memcpy4(dst, src, n) };
@@ -158,8 +158,8 @@ intrinsics! {
     /// four bytes.
     #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
     pub unsafe extern "aapcs" fn __aeabi_memmove4(dst: *mut u8, src: *const u8, n: usize) {
-        debug_assert!(dst.addr() & 3 == 0);
-        debug_assert!(src.addr() & 3 == 0);
+        debug_assert!(dst.addr().is_multiple_of(4));
+        debug_assert!(src.addr().is_multiple_of(4));
 
         // SAFETY: same preconditions, less strict aligment.
         unsafe { __aeabi_memmove(dst, src, n) };
@@ -173,8 +173,8 @@ intrinsics! {
     /// eight bytes.
     #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
     pub unsafe extern "aapcs" fn __aeabi_memmove8(dst: *mut u8, src: *const u8, n: usize) {
-        debug_assert!(dst.addr() & 7 == 0);
-        debug_assert!(src.addr() & 7 == 0);
+        debug_assert!(dst.addr().is_multiple_of(8));
+        debug_assert!(src.addr().is_multiple_of(8));
 
         // SAFETY: memmove preconditions apply, less strict alignment.
         unsafe { __aeabi_memmove(dst, src, n) };
@@ -233,7 +233,7 @@ intrinsics! {
     /// eight bytes.
     #[cfg(not(target_vendor = "apple"))]
     pub unsafe extern "aapcs" fn __aeabi_memset8(dst: *mut u8, n: usize, c: i32) {
-        debug_assert!(dst.addr() & 7 == 0);
+        debug_assert!(dst.addr().is_multiple_of(8));
 
         // SAFETY: memset preconditions apply, less strict alignment.
         unsafe { __aeabi_memset4(dst, n, c) };
@@ -258,7 +258,7 @@ intrinsics! {
     /// four bytes.
     #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
     pub unsafe extern "aapcs" fn __aeabi_memclr4(dst: *mut u8, n: usize) {
-        debug_assert!(dst.addr() & 3 == 0);
+        debug_assert!(dst.addr().is_multiple_of(4));
 
         // SAFETY: memclr preconditions apply, less strict alignment.
         unsafe { __aeabi_memset4(dst, n, 0) };
@@ -272,7 +272,7 @@ intrinsics! {
     /// eight bytes.
     #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
     pub unsafe extern "aapcs" fn __aeabi_memclr8(dst: *mut u8, n: usize) {
-        debug_assert!(dst.addr() & 7 == 0);
+        debug_assert!(dst.addr().is_multiple_of(8));
 
         // SAFETY: memclr preconditions apply, less strict alignment.
         unsafe { __aeabi_memset4(dst, n, 0) };
diff --git a/compiler-builtins/src/arm_linux.rs b/compiler-builtins/src/arm_linux.rs
index 6ce67ba71..ab9f86807 100644
--- a/compiler-builtins/src/arm_linux.rs
+++ b/compiler-builtins/src/arm_linux.rs
@@ -4,12 +4,17 @@ use core::{arch, mem};
 // Kernel-provided user-mode helper functions:
 // https://www.kernel.org/doc/Documentation/arm/kernel_user_helpers.txt
 unsafe fn __kuser_cmpxchg(oldval: u32, newval: u32, ptr: *mut u32) -> bool {
-    let f: extern "C" fn(u32, u32, *mut u32) -> u32 = mem::transmute(0xffff0fc0usize as *const ());
+    // FIXME(volatile): the third parameter is a volatile pointer
+    // SAFETY: kernel docs specify a known address with the given signature
+    let f = unsafe {
+        mem::transmute::<_, extern "C" fn(u32, u32, *mut u32) -> u32>(0xffff0fc0usize as *const ())
+    };
     f(oldval, newval, ptr) == 0
 }
 
 unsafe fn __kuser_memory_barrier() {
-    let f: extern "C" fn() = mem::transmute(0xffff0fa0usize as *const ());
+    // SAFETY: kernel docs specify a known address with the given signature
+    let f = unsafe { mem::transmute::<_, extern "C" fn()>(0xffff0fa0usize as *const ()) };
     f();
 }
 
@@ -67,8 +72,10 @@ fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 {
 /// - if `size_of::<T>() == 2`, `ptr` or `ptr` offset by 2 bytes must be valid for a relaxed atomic
 ///   read of 2 bytes.
 /// - if `size_of::<T>() == 4`, `ptr` must be valid for a relaxed atomic read of 4 bytes.
+// FIXME: assert some of the preconditions in debug mode
 unsafe fn atomic_load_aligned<T>(ptr: *mut u32) -> u32 {
-    if mem::size_of::<T>() == 4 {
+    const { assert!(size_of::<T>() <= 4) };
+    if size_of::<T>() == 4 {
         // SAFETY: As `T` has a size of 4, the caller garantees this is sound.
         unsafe { AtomicU32::from_ptr(ptr).load(Ordering::Relaxed) }
     } else {
@@ -100,11 +107,13 @@ unsafe fn atomic_rmw<T, F: Fn(u32) -> u32, G: Fn(u32, u32) -> u32>(ptr: *mut T,
     let (shift, mask) = get_shift_mask(ptr);
 
     loop {
-        let curval_aligned = atomic_load_aligned::<T>(aligned_ptr);
+        // FIXME(safety): preconditions review needed
+        let curval_aligned = unsafe { atomic_load_aligned::<T>(aligned_ptr) };
         let curval = extract_aligned(curval_aligned, shift, mask);
         let newval = f(curval);
         let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
-        if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
+        // FIXME(safety): preconditions review needed
+        if unsafe { __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) } {
             return g(curval, newval);
         }
     }
@@ -116,13 +125,15 @@ unsafe fn atomic_cmpxchg<T>(ptr: *mut T, oldval: u32, newval: u32) -> u32 {
     let (shift, mask) = get_shift_mask(ptr);
 
     loop {
-        let curval_aligned = atomic_load_aligned::<T>(aligned_ptr);
+        // FIXME(safety): preconditions review needed
+        let curval_aligned = unsafe { atomic_load_aligned::<T>(aligned_ptr) };
         let curval = extract_aligned(curval_aligned, shift, mask);
         if curval != oldval {
             return curval;
         }
         let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
-        if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
+        // FIXME(safety): preconditions review needed
+        if unsafe { __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) } {
             return oldval;
         }
     }
@@ -132,7 +143,14 @@ macro_rules! atomic_rmw {
     ($name:ident, $ty:ty, $op:expr, $fetch:expr) => {
         intrinsics! {
             pub unsafe extern "C" fn $name(ptr: *mut $ty, val: $ty) -> $ty {
-                atomic_rmw(ptr, |x| $op(x as $ty, val) as u32, |old, new| $fetch(old, new)) as $ty
+                // FIXME(safety): preconditions review needed
+                unsafe {
+                    atomic_rmw(
+                        ptr,
+                        |x| $op(x as $ty, val) as u32,
+                        |old, new| $fetch(old, new)
+                    ) as $ty
+                }
             }
         }
     };
@@ -149,7 +167,8 @@ macro_rules! atomic_cmpxchg {
     ($name:ident, $ty:ty) => {
         intrinsics! {
             pub unsafe extern "C" fn $name(ptr: *mut $ty, oldval: $ty, newval: $ty) -> $ty {
-                atomic_cmpxchg(ptr, oldval as u32, newval as u32) as $ty
+                // FIXME(safety): preconditions review needed
+                unsafe { atomic_cmpxchg(ptr, oldval as u32, newval as u32) as $ty }
             }
         }
     };
@@ -285,6 +304,7 @@ atomic_cmpxchg!(__sync_val_compare_and_swap_4, u32);
 
 intrinsics! {
     pub unsafe extern "C" fn __sync_synchronize() {
-        __kuser_memory_barrier();
+        // SAFETY: preconditions are the same as those of the enclosing function.
+        unsafe { __kuser_memory_barrier() };
     }
 }
diff --git a/compiler-builtins/src/float/add.rs b/compiler-builtins/src/float/add.rs
index 0426c9cc4..8dbfb0e10 100644
--- a/compiler-builtins/src/float/add.rs
+++ b/compiler-builtins/src/float/add.rs
@@ -1,5 +1,5 @@
 use crate::float::Float;
-use crate::int::{CastInto, Int, MinInt};
+use crate::int::{CastFrom, CastInto, Int, MinInt};
 
 /// Returns `a + b`
 fn add<F: Float>(a: F, b: F) -> F
@@ -12,7 +12,7 @@ where
     let one = F::Int::ONE;
     let zero = F::Int::ZERO;
 
-    let bits = F::BITS.cast();
+    let bits: F::Int = F::BITS.cast();
     let significand_bits = F::SIG_BITS;
     let max_exponent = F::EXP_SAT;
 
@@ -115,9 +115,10 @@ where
     let align = a_exponent.wrapping_sub(b_exponent).cast();
     if align != MinInt::ZERO {
         if align < bits {
-            let sticky =
-                F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != MinInt::ZERO);
-            b_significand = (b_significand >> align.cast()) | sticky;
+            let sticky = F::Int::from_bool(
+                b_significand << u32::cast_from(bits.wrapping_sub(align)) != MinInt::ZERO,
+            );
+            b_significand = (b_significand >> u32::cast_from(align)) | sticky;
         } else {
             b_significand = one; // sticky; b is known to be non-zero.
         }
@@ -132,8 +133,8 @@ where
         // If partial cancellation occured, we need to left-shift the result
         // and adjust the exponent:
         if a_significand < implicit_bit << 3 {
-            let shift =
-                a_significand.leading_zeros() as i32 - (implicit_bit << 3).leading_zeros() as i32;
+            let shift = a_significand.leading_zeros() as i32
+                - (implicit_bit << 3u32).leading_zeros() as i32;
             a_significand <<= shift;
             a_exponent -= shift;
         }
@@ -159,14 +160,15 @@ where
         // Result is denormal before rounding; the exponent is zero and we
         // need to shift the significand.
         let shift = (1 - a_exponent).cast();
-        let sticky =
-            F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != MinInt::ZERO);
-        a_significand = (a_significand >> shift.cast()) | sticky;
+        let sticky = F::Int::from_bool(
+            (a_significand << u32::cast_from(bits.wrapping_sub(shift))) != MinInt::ZERO,
+        );
+        a_significand = (a_significand >> u32::cast_from(shift)) | sticky;
         a_exponent = 0;
     }
 
     // Low three bits are round, guard, and sticky.
-    let a_significand_i32: i32 = a_significand.cast();
+    let a_significand_i32: i32 = a_significand.cast_lossy();
     let round_guard_sticky: i32 = a_significand_i32 & 0x7;
 
     // Shift the significand into place, and mask off the implicit bit.
@@ -189,6 +191,11 @@ where
 }
 
 intrinsics! {
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __addhf3(a: f16, b: f16) -> f16 {
+        add(a, b)
+    }
+
     #[aapcs_on_arm]
     #[arm_aeabi_alias = __aeabi_fadd]
     pub extern "C" fn __addsf3(a: f32, b: f32) -> f32 {
diff --git a/compiler-builtins/src/float/cmp.rs b/compiler-builtins/src/float/cmp.rs
index 296952821..8ab39c2b5 100644
--- a/compiler-builtins/src/float/cmp.rs
+++ b/compiler-builtins/src/float/cmp.rs
@@ -2,14 +2,23 @@
 
 use crate::float::Float;
 use crate::int::MinInt;
-
-// https://github.com/llvm/llvm-project/blob/1e6ba3cd2fe96be00b6ed6ba28b3d9f9271d784d/compiler-rt/lib/builtins/fp_compare_impl.inc#L22
-#[cfg(target_arch = "avr")]
-pub type CmpResult = i8;
-
-// https://github.com/llvm/llvm-project/blob/1e6ba3cd2fe96be00b6ed6ba28b3d9f9271d784d/compiler-rt/lib/builtins/fp_compare_impl.inc#L25
-#[cfg(not(target_arch = "avr"))]
-pub type CmpResult = i32;
+use crate::support::cfg_if;
+
+// Return type of the comparison intrinsics, chosen per target. Taken from LLVM config:
+// https://github.com/llvm/llvm-project/blob/0cf3c437c18ed27d9663d87804a9a15ff6874af2/compiler-rt/lib/builtins/fp_compare_impl.inc#L11-L27
+cfg_if! {
+    if #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))] {
+        // Aarch64 uses `int` rather than a pointer-sized value.
+        pub type CmpResult = i32;
+    } else if #[cfg(target_arch = "avr")] {
+        // AVR uses a single byte.
+        pub type CmpResult = i8;
+    } else {
+        // In compiler-rt, LLP64 ABIs use `long long` and everything else uses `long`. In effect,
+        // this means the return value is always pointer-sized.
+        pub type CmpResult = isize;
+    }
+}
 
 #[derive(Clone, Copy)]
 enum Result {
@@ -106,6 +115,37 @@ fn unord<F: Float>(a: F, b: F) -> bool {
     a_abs > inf_rep || b_abs > inf_rep
 }
 
+#[cfg(f16_enabled)] // `f16` comparison intrinsics; only built when the `f16_enabled` cfg is set
+intrinsics! {
+    pub extern "C" fn __lehf2(a: f16, b: f16) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __gehf2(a: f16, b: f16) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_ge_abi()
+    }
+
+    pub extern "C" fn __unordhf2(a: f16, b: f16) -> crate::float::cmp::CmpResult {
+        unord(a, b) as crate::float::cmp::CmpResult
+    }
+
+    pub extern "C" fn __eqhf2(a: f16, b: f16) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi() // NOTE(review): reuses the `le` encoding; presumably intentional — confirm
+    }
+
+    pub extern "C" fn __lthf2(a: f16, b: f16) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi() // NOTE(review): reuses the `le` encoding; presumably intentional — confirm
+    }
+
+    pub extern "C" fn __nehf2(a: f16, b: f16) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi() // NOTE(review): reuses the `le` encoding; presumably intentional — confirm
+    }
+
+    pub extern "C" fn __gthf2(a: f16, b: f16) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_ge_abi()
+    }
+}
+
 intrinsics! {
     pub extern "C" fn __lesf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
         cmp(a, b).to_le_abi()
diff --git a/compiler-builtins/src/float/conv.rs b/compiler-builtins/src/float/conv.rs
index f5427a113..75ea7ce02 100644
--- a/compiler-builtins/src/float/conv.rs
+++ b/compiler-builtins/src/float/conv.rs
@@ -72,9 +72,9 @@ mod int_to_float {
         F: Float,
         I: Int,
         F::Int: CastFrom<I>,
-        Conv: Fn(I::UnsignedInt) -> F::Int,
+        Conv: Fn(I::Unsigned) -> F::Int,
     {
-        let sign_bit = F::Int::cast_from(i >> (I::BITS - 1)) << (F::BITS - 1);
+        let sign_bit = F::Int::cast_from_lossy(i >> (I::BITS - 1)) << (F::BITS - 1);
         F::from_bits(conv(i.unsigned_abs()) | sign_bit)
     }
 
@@ -166,7 +166,7 @@ mod int_to_float {
 
         // Within the upper `F::BITS`, everything except for the signifcand
         // gets truncated
-        let d1: u32 = (i_m >> (u128::BITS - f32::BITS - f32::SIG_BITS - 1)).cast();
+        let d1: u32 = (i_m >> (u128::BITS - f32::BITS - f32::SIG_BITS - 1)).cast_lossy();
 
         // The entire rest of `i_m` gets truncated. Zero the upper `F::BITS` then just
         // check if it is nonzero.
@@ -313,10 +313,10 @@ intrinsics! {
 fn float_to_unsigned_int<F, U>(f: F) -> U
 where
     F: Float,
-    U: Int<UnsignedInt = U>,
+    U: Int<Unsigned = U>,
     F::Int: CastInto<U>,
     F::Int: CastFrom<u32>,
-    F::Int: CastInto<U::UnsignedInt>,
+    F::Int: CastInto<U::Unsigned>,
     u32: CastFrom<F::Int>,
 {
     float_to_int_inner::<F, U, _, _>(f.to_bits(), |i: U| i, || U::MAX)
@@ -327,8 +327,8 @@ fn float_to_signed_int<F, I>(f: F) -> I
 where
     F: Float,
     I: Int + Neg<Output = I>,
-    I::UnsignedInt: Int,
-    F::Int: CastInto<I::UnsignedInt>,
+    I::Unsigned: Int,
+    F::Int: CastInto<I::Unsigned>,
     F::Int: CastFrom<u32>,
     u32: CastFrom<F::Int>,
 {
@@ -355,27 +355,27 @@ where
     I: Int,
     FnFoo: FnOnce(I) -> I,
     FnOob: FnOnce() -> I,
-    I::UnsignedInt: Int,
-    F::Int: CastInto<I::UnsignedInt>,
+    I::Unsigned: Int,
+    F::Int: CastInto<I::Unsigned>,
     F::Int: CastFrom<u32>,
     u32: CastFrom<F::Int>,
 {
     let int_max_exp = F::EXP_BIAS + I::MAX.ilog2() + 1;
-    let foobar = F::EXP_BIAS + I::UnsignedInt::BITS - 1;
+    let foobar = F::EXP_BIAS + I::Unsigned::BITS - 1;
 
     if fbits < F::ONE.to_bits() {
         // < 0 gets rounded to 0
         I::ZERO
     } else if fbits < F::Int::cast_from(int_max_exp) << F::SIG_BITS {
         // >= 1, < integer max
-        let m_base = if I::UnsignedInt::BITS >= F::Int::BITS {
-            I::UnsignedInt::cast_from(fbits) << (I::BITS - F::SIG_BITS - 1)
+        let m_base = if I::Unsigned::BITS >= F::Int::BITS {
+            I::Unsigned::cast_from(fbits) << (I::BITS - F::SIG_BITS - 1)
         } else {
-            I::UnsignedInt::cast_from(fbits >> (F::SIG_BITS - I::BITS + 1))
+            I::Unsigned::cast_from_lossy(fbits >> (F::SIG_BITS - I::BITS + 1))
         };
 
         // Set the implicit 1-bit.
-        let m: I::UnsignedInt = (I::UnsignedInt::ONE << (I::BITS - 1)) | m_base;
+        let m: I::Unsigned = (I::Unsigned::ONE << (I::BITS - 1)) | m_base;
 
         // Shift based on the exponent and bias.
         let s: u32 = (foobar) - u32::cast_from(fbits >> F::SIG_BITS);
diff --git a/compiler-builtins/src/float/div.rs b/compiler-builtins/src/float/div.rs
index 5df637c7e..fc1fc0851 100644
--- a/compiler-builtins/src/float/div.rs
+++ b/compiler-builtins/src/float/div.rs
@@ -370,7 +370,7 @@ where
         let hi_corr: F::Int = corr_uq1 >> hw;
 
         // x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1
-        let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1)
+        let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1u32)
             .wrapping_add((F::Int::from(x_uq0_hw) * lo_corr) >> (hw - 1))
             // 1 to account for the highest bit of corr_UQ1 can be 1
             // 1 to account for possible carry
@@ -482,7 +482,7 @@ where
 
         let ret = quotient.wrapping_shr(u32::cast_from(res_exponent.wrapping_neg()) + 1);
         residual_lo = a_significand
-            .wrapping_shl(significand_bits.wrapping_add(CastInto::<u32>::cast(res_exponent)))
+            .wrapping_shl(significand_bits.wrapping_add(CastInto::<u32>::cast_lossy(res_exponent)))
             .wrapping_sub(ret.wrapping_mul(b_significand) << 1);
         ret
     };
diff --git a/compiler-builtins/src/float/mul.rs b/compiler-builtins/src/float/mul.rs
index 7f1f19d9b..49a2414eb 100644
--- a/compiler-builtins/src/float/mul.rs
+++ b/compiler-builtins/src/float/mul.rs
@@ -143,7 +143,7 @@ where
         // a zero of the appropriate sign.  Mathematically there is no need to
         // handle this case separately, but we make it a special case to
         // simplify the shift logic.
-        let shift = one.wrapping_sub(product_exponent.cast()).cast();
+        let shift: u32 = one.wrapping_sub(product_exponent.cast_lossy()).cast();
         if shift >= bits {
             return F::from_bits(product_sign);
         }
@@ -180,6 +180,11 @@ where
 }
 
 intrinsics! {
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __mulhf3(a: f16, b: f16) -> f16 {
+        mul(a, b)
+    }
+
     #[aapcs_on_arm]
     #[arm_aeabi_alias = __aeabi_fmul]
     pub extern "C" fn __mulsf3(a: f32, b: f32) -> f32 {
diff --git a/compiler-builtins/src/float/pow.rs b/compiler-builtins/src/float/pow.rs
index 45a4ad904..6997a9c21 100644
--- a/compiler-builtins/src/float/pow.rs
+++ b/compiler-builtins/src/float/pow.rs
@@ -32,8 +32,6 @@ intrinsics! {
 
     #[ppc_alias = __powikf2]
     #[cfg(f128_enabled)]
-    // FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly.
-    #[cfg(not(target_env = "msvc"))]
     pub extern "C" fn __powitf2(a: f128, b: i32) -> f128 {
         pow(a, b)
     }
diff --git a/compiler-builtins/src/float/sub.rs b/compiler-builtins/src/float/sub.rs
index a0fd9dff9..48ef33b0b 100644
--- a/compiler-builtins/src/float/sub.rs
+++ b/compiler-builtins/src/float/sub.rs
@@ -1,6 +1,11 @@
 use crate::float::Float;
 
 intrinsics! {
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __subhf3(a: f16, b: f16) -> f16 {
+        crate::float::add::__addhf3(a, f16::from_bits(b.to_bits() ^ f16::SIGN_MASK))
+    }
+
     #[arm_aeabi_alias = __aeabi_fsub]
     pub extern "C" fn __subsf3(a: f32, b: f32) -> f32 {
         crate::float::add::__addsf3(a, f32::from_bits(b.to_bits() ^ f32::SIGN_MASK))
diff --git a/compiler-builtins/src/float/traits.rs b/compiler-builtins/src/float/traits.rs
index 8ccaa7bcb..a30d20900 100644
--- a/compiler-builtins/src/float/traits.rs
+++ b/compiler-builtins/src/float/traits.rs
@@ -20,10 +20,10 @@ pub trait Float:
     + ops::Rem<Output = Self>
 {
     /// A uint of the same width as the float
-    type Int: Int<OtherSign = Self::SignedInt, UnsignedInt = Self::Int>;
+    type Int: Int<OtherSign = Self::SignedInt, Unsigned = Self::Int>;
 
     /// A int of the same width as the float
-    type SignedInt: Int + MinInt<OtherSign = Self::Int, UnsignedInt = Self::Int>;
+    type SignedInt: Int + MinInt<OtherSign = Self::Int, Unsigned = Self::Int>;
 
     /// An int capable of containing the exponent bits plus a sign bit. This is signed.
     type ExpInt: Int;
diff --git a/compiler-builtins/src/float/trunc.rs b/compiler-builtins/src/float/trunc.rs
index ca8a0f368..93db5d8bb 100644
--- a/compiler-builtins/src/float/trunc.rs
+++ b/compiler-builtins/src/float/trunc.rs
@@ -50,7 +50,7 @@ where
         // The exponent of a is within the range of normal numbers in the
         // destination format.  We can convert by simply right-shifting with
         // rounding and adjusting the exponent.
-        abs_result = (a_abs >> sig_bits_delta).cast();
+        abs_result = (a_abs >> sig_bits_delta).cast_lossy();
         // Cast before shifting to prevent overflow.
         let bias_diff: R::Int = src_exp_bias.wrapping_sub(dst_exp_bias).cast();
         let tmp = bias_diff << R::SIG_BITS;
diff --git a/compiler-builtins/src/hexagon.rs b/compiler-builtins/src/hexagon.rs
index 91cf91c31..a5c7b4dfd 100644
--- a/compiler-builtins/src/hexagon.rs
+++ b/compiler-builtins/src/hexagon.rs
@@ -1,5 +1,3 @@
-#![cfg(not(feature = "no-asm"))]
-
 use core::arch::global_asm;
 
 global_asm!(include_str!("hexagon/func_macro.s"), options(raw));
diff --git a/compiler-builtins/src/int/addsub.rs b/compiler-builtins/src/int/addsub.rs
index 1f84e8eb1..b2b21fc2c 100644
--- a/compiler-builtins/src/int/addsub.rs
+++ b/compiler-builtins/src/int/addsub.rs
@@ -22,7 +22,7 @@ impl UAddSub for u128 {}
 
 trait AddSub: Int
 where
-    <Self as MinInt>::UnsignedInt: UAddSub,
+    <Self as MinInt>::Unsigned: UAddSub,
 {
     fn add(self, other: Self) -> Self {
         Self::from_unsigned(self.unsigned().uadd(other.unsigned()))
@@ -37,7 +37,7 @@ impl AddSub for i128 {}
 
 trait Addo: AddSub
 where
-    <Self as MinInt>::UnsignedInt: UAddSub,
+    <Self as MinInt>::Unsigned: UAddSub,
 {
     fn addo(self, other: Self) -> (Self, bool) {
         let sum = AddSub::add(self, other);
@@ -50,7 +50,7 @@ impl Addo for u128 {}
 
 trait Subo: AddSub
 where
-    <Self as MinInt>::UnsignedInt: UAddSub,
+    <Self as MinInt>::Unsigned: UAddSub,
 {
     fn subo(self, other: Self) -> (Self, bool) {
         let sum = AddSub::sub(self, other);
diff --git a/compiler-builtins/src/int/big.rs b/compiler-builtins/src/int/big.rs
index 61f1349d9..8e0600909 100644
--- a/compiler-builtins/src/int/big.rs
+++ b/compiler-builtins/src/int/big.rs
@@ -45,7 +45,7 @@ impl i256 {
 impl MinInt for u256 {
     type OtherSign = i256;
 
-    type UnsignedInt = u256;
+    type Unsigned = u256;
 
     const SIGNED: bool = false;
     const BITS: u32 = 256;
@@ -58,14 +58,14 @@ impl MinInt for u256 {
 impl MinInt for i256 {
     type OtherSign = u256;
 
-    type UnsignedInt = u256;
+    type Unsigned = u256;
 
     const SIGNED: bool = false;
     const BITS: u32 = 256;
     const ZERO: Self = Self([0u64; 4]);
     const ONE: Self = Self([1, 0, 0, 0]);
     const MIN: Self = Self([0, 0, 0, 1 << 63]);
-    const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX << 1]);
+    const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]);
 }
 
 macro_rules! impl_common {
diff --git a/compiler-builtins/src/int/leading_zeros.rs b/compiler-builtins/src/int/leading_zeros.rs
index 112f4d036..aa5cb3993 100644
--- a/compiler-builtins/src/int/leading_zeros.rs
+++ b/compiler-builtins/src/int/leading_zeros.rs
@@ -9,11 +9,14 @@ pub use implementation::{leading_zeros_default, leading_zeros_riscv};
 pub(crate) use implementation::{leading_zeros_default, leading_zeros_riscv};
 
 mod implementation {
-    use crate::int::{CastInto, Int};
+    use crate::int::{CastFrom, Int};
 
     /// Returns the number of leading binary zeros in `x`.
     #[allow(dead_code)]
-    pub fn leading_zeros_default<T: Int + CastInto<usize>>(x: T) -> usize {
+    pub fn leading_zeros_default<I: Int>(x: I) -> usize
+    where
+        usize: CastFrom<I>,
+    {
         // The basic idea is to test if the higher bits of `x` are zero and bisect the number
         // of leading zeros. It is possible for all branches of the bisection to use the same
         // code path by conditionally shifting the higher parts down to let the next bisection
@@ -23,44 +26,48 @@ mod implementation {
         // because it simplifies the final bisection step.
         let mut x = x;
         // the number of potential leading zeros
-        let mut z = T::BITS as usize;
+        let mut z = I::BITS as usize;
         // a temporary
-        let mut t: T;
+        let mut t: I;
 
-        const { assert!(T::BITS <= 64) };
-        if T::BITS >= 64 {
+        const { assert!(I::BITS <= 64) };
+        if I::BITS >= 64 {
             t = x >> 32;
-            if t != T::ZERO {
+            if t != I::ZERO {
                 z -= 32;
                 x = t;
             }
         }
-        if T::BITS >= 32 {
+        if I::BITS >= 32 {
             t = x >> 16;
-            if t != T::ZERO {
+            if t != I::ZERO {
                 z -= 16;
                 x = t;
             }
         }
-        const { assert!(T::BITS >= 16) };
+        const { assert!(I::BITS >= 16) };
         t = x >> 8;
-        if t != T::ZERO {
+        if t != I::ZERO {
             z -= 8;
             x = t;
         }
         t = x >> 4;
-        if t != T::ZERO {
+        if t != I::ZERO {
             z -= 4;
             x = t;
         }
         t = x >> 2;
-        if t != T::ZERO {
+        if t != I::ZERO {
             z -= 2;
             x = t;
         }
         // the last two bisections are combined into one conditional
         t = x >> 1;
-        if t != T::ZERO { z - 2 } else { z - x.cast() }
+        if t != I::ZERO {
+            z - 2
+        } else {
+            z - usize::cast_from(x)
+        }
 
         // We could potentially save a few cycles by using the LUT trick from
         // "https://embeddedgurus.com/state-space/2014/09/
@@ -82,10 +89,13 @@ mod implementation {
 
     /// Returns the number of leading binary zeros in `x`.
     #[allow(dead_code)]
-    pub fn leading_zeros_riscv<T: Int + CastInto<usize>>(x: T) -> usize {
+    pub fn leading_zeros_riscv<I: Int>(x: I) -> usize
+    where
+        usize: CastFrom<I>,
+    {
         let mut x = x;
         // the number of potential leading zeros
-        let mut z = T::BITS;
+        let mut z = I::BITS;
         // a temporary
         let mut t: u32;
 
@@ -97,11 +107,11 @@ mod implementation {
         // right). If we try to save an instruction by using `x < imm` for each bisection, we
         // have to shift `x` left and compare with powers of two approaching `usize::MAX + 1`,
         // but the immediate will never fit into 12 bits and never save an instruction.
-        const { assert!(T::BITS <= 64) };
-        if T::BITS >= 64 {
+        const { assert!(I::BITS <= 64) };
+        if I::BITS >= 64 {
             // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise
             // `t` is set to 0.
-            t = ((x >= (T::ONE << 32)) as u32) << 5;
+            t = ((x >= (I::ONE << 32)) as u32) << 5;
             // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the
             // next step to process.
             x >>= t;
@@ -109,27 +119,27 @@ mod implementation {
             // leading zeros
             z -= t;
         }
-        if T::BITS >= 32 {
-            t = ((x >= (T::ONE << 16)) as u32) << 4;
+        if I::BITS >= 32 {
+            t = ((x >= (I::ONE << 16)) as u32) << 4;
             x >>= t;
             z -= t;
         }
-        const { assert!(T::BITS >= 16) };
-        t = ((x >= (T::ONE << 8)) as u32) << 3;
+        const { assert!(I::BITS >= 16) };
+        t = ((x >= (I::ONE << 8)) as u32) << 3;
         x >>= t;
         z -= t;
-        t = ((x >= (T::ONE << 4)) as u32) << 2;
+        t = ((x >= (I::ONE << 4)) as u32) << 2;
         x >>= t;
         z -= t;
-        t = ((x >= (T::ONE << 2)) as u32) << 1;
+        t = ((x >= (I::ONE << 2)) as u32) << 1;
         x >>= t;
         z -= t;
-        t = (x >= (T::ONE << 1)) as u32;
+        t = (x >= (I::ONE << 1)) as u32;
         x >>= t;
         z -= t;
         // All bits except the LSB are guaranteed to be zero for this final bisection step.
         // If `x != 0` then `x == 1` and subtracts one potential zero from `z`.
-        z as usize - x.cast()
+        z as usize - usize::cast_from(x)
     }
 }
 
diff --git a/compiler-builtins/src/int/specialized_div_rem/mod.rs b/compiler-builtins/src/int/specialized_div_rem/mod.rs
index 43f466e75..7841e4f33 100644
--- a/compiler-builtins/src/int/specialized_div_rem/mod.rs
+++ b/compiler-builtins/src/int/specialized_div_rem/mod.rs
@@ -125,10 +125,10 @@ impl_normalization_shift!(
 /// dependencies.
 #[inline]
 fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
-    if let Some(quo) = duo.checked_div(div) {
-        if let Some(rem) = duo.checked_rem(div) {
-            return (quo, rem);
-        }
+    if let Some(quo) = duo.checked_div(div)
+        && let Some(rem) = duo.checked_rem(div)
+    {
+        return (quo, rem);
     }
     zero_div_fn()
 }
@@ -227,10 +227,10 @@ impl_asymmetric!(
 #[inline]
 #[allow(dead_code)]
 fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
-    if let Some(quo) = duo.checked_div(div) {
-        if let Some(rem) = duo.checked_rem(div) {
-            return (quo, rem);
-        }
+    if let Some(quo) = duo.checked_div(div)
+        && let Some(rem) = duo.checked_rem(div)
+    {
+        return (quo, rem);
     }
     zero_div_fn()
 }
diff --git a/compiler-builtins/src/int/trailing_zeros.rs b/compiler-builtins/src/int/trailing_zeros.rs
index c45d6b1cf..1b0ae5b73 100644
--- a/compiler-builtins/src/int/trailing_zeros.rs
+++ b/compiler-builtins/src/int/trailing_zeros.rs
@@ -4,33 +4,38 @@ pub use implementation::trailing_zeros;
 pub(crate) use implementation::trailing_zeros;
 
 mod implementation {
-    use crate::int::{CastInto, Int};
+    use crate::int::{CastFrom, Int};
 
     /// Returns number of trailing binary zeros in `x`.
     #[allow(dead_code)]
-    pub fn trailing_zeros<T: Int + CastInto<u32> + CastInto<u16> + CastInto<u8>>(x: T) -> usize {
+    pub fn trailing_zeros<I: Int>(x: I) -> usize
+    where
+        u32: CastFrom<I>,
+        u16: CastFrom<I>,
+        u8: CastFrom<I>,
+    {
         let mut x = x;
         let mut r: u32 = 0;
         let mut t: u32;
 
-        const { assert!(T::BITS <= 64) };
-        if T::BITS >= 64 {
-            r += ((CastInto::<u32>::cast(x) == 0) as u32) << 5; // if (x has no 32 small bits) t = 32 else 0
+        const { assert!(I::BITS <= 64) };
+        if I::BITS >= 64 {
+            r += ((u32::cast_from_lossy(x) == 0) as u32) << 5; // if (x has no 32 small bits) t = 32 else 0
             x >>= r; // remove 32 zero bits
         }
 
-        if T::BITS >= 32 {
-            t = ((CastInto::<u16>::cast(x) == 0) as u32) << 4; // if (x has no 16 small bits) t = 16 else 0
+        if I::BITS >= 32 {
+            t = ((u16::cast_from_lossy(x) == 0) as u32) << 4; // if (x has no 16 small bits) t = 16 else 0
             r += t;
             x >>= t; // x = [0 - 0xFFFF] + higher garbage bits
         }
 
-        const { assert!(T::BITS >= 16) };
-        t = ((CastInto::<u8>::cast(x) == 0) as u32) << 3;
+        const { assert!(I::BITS >= 16) };
+        t = ((u8::cast_from_lossy(x) == 0) as u32) << 3;
         x >>= t; // x = [0 - 0xFF] + higher garbage bits
         r += t;
 
-        let mut x: u8 = x.cast();
+        let mut x: u8 = x.cast_lossy();
 
         t = (((x & 0x0F) == 0) as u32) << 2;
         x >>= t; // x = [0 - 0xF] + higher garbage bits
diff --git a/compiler-builtins/src/int/traits.rs b/compiler-builtins/src/int/traits.rs
index 152cb2eee..25b9718ad 100644
--- a/compiler-builtins/src/int/traits.rs
+++ b/compiler-builtins/src/int/traits.rs
@@ -1,275 +1,4 @@
-use core::ops;
-
-/// Minimal integer implementations needed on all integer types, including wide integers.
-#[allow(dead_code)]
-pub trait MinInt:
-    Copy
-    + core::fmt::Debug
-    + ops::BitOr<Output = Self>
-    + ops::Not<Output = Self>
-    + ops::Shl<u32, Output = Self>
-{
-    /// Type with the same width but other signedness
-    type OtherSign: MinInt;
-    /// Unsigned version of Self
-    type UnsignedInt: MinInt;
-
-    /// If `Self` is a signed integer
-    const SIGNED: bool;
-
-    /// The bitwidth of the int type
-    const BITS: u32;
-
-    const ZERO: Self;
-    const ONE: Self;
-    const MIN: Self;
-    const MAX: Self;
-}
-
-/// Trait for some basic operations on integers
-#[allow(dead_code)]
-pub trait Int:
-    MinInt
-    + PartialEq
-    + PartialOrd
-    + ops::AddAssign
-    + ops::SubAssign
-    + ops::BitAndAssign
-    + ops::BitOrAssign
-    + ops::BitXorAssign
-    + ops::ShlAssign<i32>
-    + ops::ShrAssign<u32>
-    + ops::Add<Output = Self>
-    + ops::Sub<Output = Self>
-    + ops::Mul<Output = Self>
-    + ops::Div<Output = Self>
-    + ops::Shr<u32, Output = Self>
-    + ops::BitXor<Output = Self>
-    + ops::BitAnd<Output = Self>
-{
-    /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing
-    /// in `builtins-test`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,
-    /// 111,112,119,120,125,126,127].
-    const FUZZ_LENGTHS: [u8; 20] = make_fuzz_lengths(<Self as MinInt>::BITS);
-
-    /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128.
-    const FUZZ_NUM: usize = {
-        let log2 = (<Self as MinInt>::BITS - 1).count_ones() as usize;
-        if log2 == 3 {
-            // case for u8
-            6
-        } else {
-            // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate
-            // boundaries.
-            8 + (4 * (log2 - 4))
-        }
-    };
-
-    fn unsigned(self) -> Self::UnsignedInt;
-    fn from_unsigned(unsigned: Self::UnsignedInt) -> Self;
-    fn unsigned_abs(self) -> Self::UnsignedInt;
-
-    fn from_bool(b: bool) -> Self;
-
-    /// Prevents the need for excessive conversions between signed and unsigned
-    fn logical_shr(self, other: u32) -> Self;
-
-    /// Absolute difference between two integers.
-    fn abs_diff(self, other: Self) -> Self::UnsignedInt;
-
-    // copied from primitive integers, but put in a trait
-    fn is_zero(self) -> bool;
-    fn wrapping_neg(self) -> Self;
-    fn wrapping_add(self, other: Self) -> Self;
-    fn wrapping_mul(self, other: Self) -> Self;
-    fn wrapping_sub(self, other: Self) -> Self;
-    fn wrapping_shl(self, other: u32) -> Self;
-    fn wrapping_shr(self, other: u32) -> Self;
-    fn rotate_left(self, other: u32) -> Self;
-    fn overflowing_add(self, other: Self) -> (Self, bool);
-    fn leading_zeros(self) -> u32;
-    fn ilog2(self) -> u32;
-}
-
-pub(crate) const fn make_fuzz_lengths(bits: u32) -> [u8; 20] {
-    let mut v = [0u8; 20];
-    v[0] = 0;
-    v[1] = 1;
-    v[2] = 2; // important for parity and the iX::MIN case when reversed
-    let mut i = 3;
-
-    // No need for any more until the byte boundary, because there should be no algorithms
-    // that are sensitive to anything not next to byte boundaries after 2. We also scale
-    // in powers of two, which is important to prevent u128 corner tests from getting too
-    // big.
-    let mut l = 8;
-    loop {
-        if l >= ((bits / 2) as u8) {
-            break;
-        }
-        // get both sides of the byte boundary
-        v[i] = l - 1;
-        i += 1;
-        v[i] = l;
-        i += 1;
-        l *= 2;
-    }
-
-    if bits != 8 {
-        // add the lower side of the middle boundary
-        v[i] = ((bits / 2) - 1) as u8;
-        i += 1;
-    }
-
-    // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS
-    // boundary because of algorithms that split the high part up. We reverse the scaling
-    // as we go to Self::BITS.
-    let mid = i;
-    let mut j = 1;
-    loop {
-        v[i] = (bits as u8) - (v[mid - j]) - 1;
-        if j == mid {
-            break;
-        }
-        i += 1;
-        j += 1;
-    }
-    v
-}
-
-macro_rules! int_impl_common {
-    ($ty:ty) => {
-        fn from_bool(b: bool) -> Self {
-            b as $ty
-        }
-
-        fn logical_shr(self, other: u32) -> Self {
-            Self::from_unsigned(self.unsigned().wrapping_shr(other))
-        }
-
-        fn is_zero(self) -> bool {
-            self == Self::ZERO
-        }
-
-        fn wrapping_neg(self) -> Self {
-            <Self>::wrapping_neg(self)
-        }
-
-        fn wrapping_add(self, other: Self) -> Self {
-            <Self>::wrapping_add(self, other)
-        }
-
-        fn wrapping_mul(self, other: Self) -> Self {
-            <Self>::wrapping_mul(self, other)
-        }
-        fn wrapping_sub(self, other: Self) -> Self {
-            <Self>::wrapping_sub(self, other)
-        }
-
-        fn wrapping_shl(self, other: u32) -> Self {
-            <Self>::wrapping_shl(self, other)
-        }
-
-        fn wrapping_shr(self, other: u32) -> Self {
-            <Self>::wrapping_shr(self, other)
-        }
-
-        fn rotate_left(self, other: u32) -> Self {
-            <Self>::rotate_left(self, other)
-        }
-
-        fn overflowing_add(self, other: Self) -> (Self, bool) {
-            <Self>::overflowing_add(self, other)
-        }
-
-        fn leading_zeros(self) -> u32 {
-            <Self>::leading_zeros(self)
-        }
-
-        fn ilog2(self) -> u32 {
-            <Self>::ilog2(self)
-        }
-    };
-}
-
-macro_rules! int_impl {
-    ($ity:ty, $uty:ty) => {
-        impl MinInt for $uty {
-            type OtherSign = $ity;
-            type UnsignedInt = $uty;
-
-            const BITS: u32 = <Self as MinInt>::ZERO.count_zeros();
-            const SIGNED: bool = Self::MIN != Self::ZERO;
-
-            const ZERO: Self = 0;
-            const ONE: Self = 1;
-            const MIN: Self = <Self>::MIN;
-            const MAX: Self = <Self>::MAX;
-        }
-
-        impl Int for $uty {
-            fn unsigned(self) -> $uty {
-                self
-            }
-
-            // It makes writing macros easier if this is implemented for both signed and unsigned
-            #[allow(clippy::wrong_self_convention)]
-            fn from_unsigned(me: $uty) -> Self {
-                me
-            }
-
-            fn unsigned_abs(self) -> Self {
-                self
-            }
-
-            fn abs_diff(self, other: Self) -> Self {
-                self.abs_diff(other)
-            }
-
-            int_impl_common!($uty);
-        }
-
-        impl MinInt for $ity {
-            type OtherSign = $uty;
-            type UnsignedInt = $uty;
-
-            const BITS: u32 = <Self as MinInt>::ZERO.count_zeros();
-            const SIGNED: bool = Self::MIN != Self::ZERO;
-
-            const ZERO: Self = 0;
-            const ONE: Self = 1;
-            const MIN: Self = <Self>::MIN;
-            const MAX: Self = <Self>::MAX;
-        }
-
-        impl Int for $ity {
-            fn unsigned(self) -> $uty {
-                self as $uty
-            }
-
-            fn from_unsigned(me: $uty) -> Self {
-                me as $ity
-            }
-
-            fn unsigned_abs(self) -> Self::UnsignedInt {
-                self.unsigned_abs()
-            }
-
-            fn abs_diff(self, other: Self) -> $uty {
-                self.abs_diff(other)
-            }
-
-            int_impl_common!($ity);
-        }
-    };
-}
-
-int_impl!(isize, usize);
-int_impl!(i8, u8);
-int_impl!(i16, u16);
-int_impl!(i32, u32);
-int_impl!(i64, u64);
-int_impl!(i128, u128);
+pub use crate::support::{CastFrom, CastInto, Int, MinInt};
 
 /// Trait for integers twice the bit width of another integer. This is implemented for all
 /// primitives except for `u8`, because there is not a smaller primitive.
@@ -368,44 +97,3 @@ impl_h_int!(
     i32 u32 i64,
     i64 u64 i128
 );
-
-/// Trait to express (possibly lossy) casting of integers
-pub trait CastInto<T: Copy>: Copy {
-    fn cast(self) -> T;
-}
-
-pub trait CastFrom<T: Copy>: Copy {
-    fn cast_from(value: T) -> Self;
-}
-
-impl<T: Copy, U: CastInto<T> + Copy> CastFrom<U> for T {
-    fn cast_from(value: U) -> Self {
-        value.cast()
-    }
-}
-
-macro_rules! cast_into {
-    ($ty:ty) => {
-        cast_into!($ty; usize, isize, u8, i8, u16, i16, u32, i32, u64, i64, u128, i128);
-    };
-    ($ty:ty; $($into:ty),*) => {$(
-        impl CastInto<$into> for $ty {
-            fn cast(self) -> $into {
-                self as $into
-            }
-        }
-    )*};
-}
-
-cast_into!(usize);
-cast_into!(isize);
-cast_into!(u8);
-cast_into!(i8);
-cast_into!(u16);
-cast_into!(i16);
-cast_into!(u32);
-cast_into!(i32);
-cast_into!(u64);
-cast_into!(i64);
-cast_into!(u128);
-cast_into!(i128);
diff --git a/compiler-builtins/src/int/udiv.rs b/compiler-builtins/src/int/udiv.rs
index b9dee63c4..017a81ac9 100644
--- a/compiler-builtins/src/int/udiv.rs
+++ b/compiler-builtins/src/int/udiv.rs
@@ -44,7 +44,7 @@ intrinsics! {
     }
 
     #[unsafe(naked)]
-    pub unsafe extern "C" fn __udivmodqi4() {
+    pub unsafe extern "custom" fn __udivmodqi4() {
         // compute unsigned 8-bit `n / d` and `n % d`.
         //
         // Note: GCC implements a [non-standard calling convention](https://gcc.gnu.org/wiki/avr-gcc#Exceptions_to_the_Calling_Convention) for this function.
diff --git a/compiler-builtins/src/lib.rs b/compiler-builtins/src/lib.rs
index 6a6b28067..b111dc0bd 100644
--- a/compiler-builtins/src/lib.rs
+++ b/compiler-builtins/src/lib.rs
@@ -1,23 +1,23 @@
 #![cfg_attr(feature = "compiler-builtins", compiler_builtins)]
 #![cfg_attr(all(target_family = "wasm"), feature(wasm_numeric_instr))]
+#![feature(abi_custom)]
 #![feature(abi_unadjusted)]
 #![feature(asm_experimental_arch)]
 #![feature(cfg_target_has_atomic)]
 #![feature(compiler_builtins)]
 #![feature(core_intrinsics)]
 #![feature(linkage)]
+#![feature(asm_cfg)]
 #![feature(naked_functions)]
 #![feature(repr_simd)]
+#![feature(macro_metavar_expr_concat)]
+#![feature(rustc_attrs)]
 #![cfg_attr(f16_enabled, feature(f16))]
 #![cfg_attr(f128_enabled, feature(f128))]
 #![no_builtins]
 #![no_std]
 #![allow(unused_features)]
 #![allow(internal_features)]
-// We use `u128` in a whole bunch of places which we currently agree with the
-// compiler on ABIs and such, so we should be "good enough" for now and changes
-// to the `u128` ABI will be reflected here.
-#![allow(improper_ctypes, improper_ctypes_definitions)]
 // `mem::swap` cannot be used because it may generate references to memcpy in unoptimized code.
 #![allow(clippy::manual_swap)]
 // Support compiling on both stage0 and stage1 which may differ in supported stable features.
@@ -56,7 +56,7 @@ pub mod arm;
 #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
 pub mod aarch64;
 
-#[cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "no-asm"),))]
+#[cfg(all(target_arch = "aarch64", target_os = "linux"))]
 pub mod aarch64_linux;
 
 #[cfg(all(
diff --git a/compiler-builtins/src/macros.rs b/compiler-builtins/src/macros.rs
index dbf715534..203cd0949 100644
--- a/compiler-builtins/src/macros.rs
+++ b/compiler-builtins/src/macros.rs
@@ -132,7 +132,7 @@ macro_rules! intrinsics {
     ) => (
         #[cfg($name = "optimized-c")]
         pub $(unsafe $($empty)? )? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
-            extern $abi {
+            unsafe extern $abi {
                 fn $name($($argname: $ty),*) $(-> $ret)?;
             }
             unsafe {
@@ -433,21 +433,9 @@ macro_rules! intrinsics {
     ) => (
         // `#[naked]` definitions are referenced by other places, so we can't use `cfg` like the others
         pub mod $name {
-            // FIXME: when bootstrap supports `#[unsafe(naked)]` this duplication can be removed
-            #[cfg(bootstrap)]
-            #[naked]
-            #[allow(unused_unsafe)]
-            $(#[$($attr)*])*
-            #[cfg_attr(not(feature = "mangled-names"), no_mangle)]
-            #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
-            pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
-                unsafe { $($body)* }
-            }
-
-            #[cfg(not(bootstrap))]
             #[unsafe(naked)]
             $(#[$($attr)*])*
-            #[cfg_attr(not(feature = "mangled-names"), no_mangle)]
+            #[cfg_attr(not(feature = "mangled-names"), unsafe(no_mangle))]
             #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
             pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
                 $($body)*
diff --git a/compiler-builtins/src/math/libm_math b/compiler-builtins/src/math/libm_math
deleted file mode 120000
index 4d65313c2..000000000
--- a/compiler-builtins/src/math/libm_math
+++ /dev/null
@@ -1 +0,0 @@
-../../../libm/src/math
\ No newline at end of file
diff --git a/compiler-builtins/src/math/mod.rs b/compiler-builtins/src/math/mod.rs
index 078feb9ff..62d729674 100644
--- a/compiler-builtins/src/math/mod.rs
+++ b/compiler-builtins/src/math/mod.rs
@@ -2,6 +2,7 @@
 #[allow(dead_code)]
 #[allow(unused_imports)]
 #[allow(clippy::all)]
+#[path = "../../../libm/src/math/mod.rs"]
 pub(crate) mod libm_math;
 
 macro_rules! libm_intrinsics {
diff --git a/compiler-builtins/src/mem/impls.rs b/compiler-builtins/src/mem/impls.rs
index 14a478748..da16dee25 100644
--- a/compiler-builtins/src/mem/impls.rs
+++ b/compiler-builtins/src/mem/impls.rs
@@ -15,6 +15,7 @@
 // this use. Of course this is not a guarantee that such use will work, it just means that this
 // crate doing wrapping pointer arithmetic with a method that must not wrap won't be the problem if
 // something does go wrong at runtime.
+use core::ffi::c_int;
 use core::intrinsics::likely;
 
 const WORD_SIZE: usize = core::mem::size_of::<usize>();
@@ -384,13 +385,13 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
 }
 
 #[inline(always)]
-pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
+pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> c_int {
     let mut i = 0;
     while i < n {
         let a = *s1.wrapping_add(i);
         let b = *s2.wrapping_add(i);
         if a != b {
-            return a as i32 - b as i32;
+            return c_int::from(a) - c_int::from(b);
         }
         i += 1;
     }
diff --git a/compiler-builtins/src/mem/mod.rs b/compiler-builtins/src/mem/mod.rs
index 6828f3804..a227f60a2 100644
--- a/compiler-builtins/src/mem/mod.rs
+++ b/compiler-builtins/src/mem/mod.rs
@@ -3,13 +3,6 @@
 // FIXME(e2024): this eventually needs to be removed.
 #![allow(unsafe_op_in_unsafe_fn)]
 
-#[allow(warnings)]
-#[cfg(target_pointer_width = "16")]
-type c_int = i16;
-#[allow(warnings)]
-#[cfg(not(target_pointer_width = "16"))]
-type c_int = i32;
-
 // memcpy/memmove/memset have optimized implementations on some architectures
 #[cfg_attr(
     all(not(feature = "no-asm"), target_arch = "x86_64"),
@@ -38,18 +31,18 @@ intrinsics! {
     }
 
     #[mem_builtin]
-    pub unsafe extern "C" fn memset(s: *mut u8, c: crate::mem::c_int, n: usize) -> *mut u8 {
+    pub unsafe extern "C" fn memset(s: *mut u8, c: core::ffi::c_int, n: usize) -> *mut u8 {
         impls::set_bytes(s, c as u8, n);
         s
     }
 
     #[mem_builtin]
-    pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
+    pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> core::ffi::c_int {
         impls::compare_bytes(s1, s2, n)
     }
 
     #[mem_builtin]
-    pub unsafe extern "C" fn bcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
+    pub unsafe extern "C" fn bcmp(s1: *const u8, s2: *const u8, n: usize) -> core::ffi::c_int {
         memcmp(s1, s2, n)
     }
 
diff --git a/compiler-builtins/src/mem/x86_64.rs b/compiler-builtins/src/mem/x86_64.rs
index 5cbe83ab1..fb29eb11b 100644
--- a/compiler-builtins/src/mem/x86_64.rs
+++ b/compiler-builtins/src/mem/x86_64.rs
@@ -69,7 +69,7 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
         "rep movsb",
         "sub $7, %rsi",
         "sub $7, %rdi",
-        "mov {qword_count}, %rcx",
+        "mov {qword_count:r}, %rcx",
         "rep movsq",
         "test {pre_byte_count:e}, {pre_byte_count:e}",
         "add $7, %rsi",
@@ -212,7 +212,7 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
     let x = {
         let r;
         asm!(
-            "movdqa ({addr}), {dest}",
+            "movdqa ({addr:r}), {dest}",
             addr = in(reg) s,
             dest = out(xmm_reg) r,
             options(att_syntax, nostack),
@@ -232,7 +232,7 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
         let x = {
             let r;
             asm!(
-                "movdqa ({addr}), {dest}",
+                "movdqa ({addr:r}), {dest}",
                 addr = in(reg) s,
                 dest = out(xmm_reg) r,
                 options(att_syntax, nostack),
diff --git a/compiler-builtins/src/probestack.rs b/compiler-builtins/src/probestack.rs
index 5b6abd21a..9a18216da 100644
--- a/compiler-builtins/src/probestack.rs
+++ b/compiler-builtins/src/probestack.rs
@@ -44,214 +44,84 @@
 #![cfg(not(feature = "mangled-names"))]
 // Windows and Cygwin already has builtins to do this.
 #![cfg(not(any(windows, target_os = "cygwin")))]
-// All these builtins require assembly
-#![cfg(not(feature = "no-asm"))]
 // We only define stack probing for these architectures today.
 #![cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 
-extern "C" {
-    pub fn __rust_probestack();
-}
-
-// A wrapper for our implementation of __rust_probestack, which allows us to
-// keep the assembly inline while controlling all CFI directives in the assembly
-// emitted for the function.
-//
-// This is the ELF version.
-#[cfg(not(any(target_vendor = "apple", target_os = "uefi")))]
-macro_rules! define_rust_probestack {
-    ($body: expr) => {
-        concat!(
-            "
-            .pushsection .text.__rust_probestack
-            .globl __rust_probestack
-            .type  __rust_probestack, @function
-            .hidden __rust_probestack
-        __rust_probestack:
-            ",
-            $body,
-            "
-            .size __rust_probestack, . - __rust_probestack
-            .popsection
-            "
-        )
-    };
-}
-
-#[cfg(all(target_os = "uefi", target_arch = "x86_64"))]
-macro_rules! define_rust_probestack {
-    ($body: expr) => {
-        concat!(
-            "
-            .globl __rust_probestack
-        __rust_probestack:
-            ",
-            $body
-        )
-    };
-}
-
-// Same as above, but for Mach-O. Note that the triple underscore
-// is deliberate
-#[cfg(target_vendor = "apple")]
-macro_rules! define_rust_probestack {
-    ($body: expr) => {
-        concat!(
-            "
-            .globl ___rust_probestack
-        ___rust_probestack:
-            ",
-            $body
-        )
-    };
-}
-
-// In UEFI x86 arch, triple underscore is deliberate.
-#[cfg(all(target_os = "uefi", target_arch = "x86"))]
-macro_rules! define_rust_probestack {
-    ($body: expr) => {
-        concat!(
-            "
-            .globl ___rust_probestack
-        ___rust_probestack:
-            ",
-            $body
-        )
-    };
-}
-
 // Our goal here is to touch each page between %rsp+8 and %rsp+8-%rax,
 // ensuring that if any pages are unmapped we'll make a page fault.
 //
 // The ABI here is that the stack frame size is located in `%rax`. Upon
 // return we're not supposed to modify `%rsp` or `%rax`.
-//
-// Any changes to this function should be replicated to the SGX version below.
-#[cfg(all(
-    target_arch = "x86_64",
-    not(all(target_env = "sgx", target_vendor = "fortanix"))
-))]
-core::arch::global_asm!(
-    define_rust_probestack!(
+#[cfg(target_arch = "x86_64")]
+#[unsafe(naked)]
+#[rustc_std_internal_symbol]
+pub unsafe extern "custom" fn __rust_probestack() {
+    core::arch::naked_asm!(
         "
-    .cfi_startproc
-    pushq  %rbp
-    .cfi_adjust_cfa_offset 8
-    .cfi_offset %rbp, -16
-    movq   %rsp, %rbp
-    .cfi_def_cfa_register %rbp
-
-    mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
-
-    // Main loop, taken in one page increments. We're decrementing rsp by
-    // a page each time until there's less than a page remaining. We're
-    // guaranteed that this function isn't called unless there's more than a
-    // page needed.
-    //
-    // Note that we're also testing against `8(%rsp)` to account for the 8
-    // bytes pushed on the stack orginally with our return address. Using
-    // `8(%rsp)` simulates us testing the stack pointer in the caller's
-    // context.
-
-    // It's usually called when %rax >= 0x1000, but that's not always true.
-    // Dynamic stack allocation, which is needed to implement unsized
-    // rvalues, triggers stackprobe even if %rax < 0x1000.
-    // Thus we have to check %r11 first to avoid segfault.
-    cmp    $0x1000,%r11
-    jna    3f
-2:
-    sub    $0x1000,%rsp
-    test   %rsp,8(%rsp)
-    sub    $0x1000,%r11
-    cmp    $0x1000,%r11
-    ja     2b
-
-3:
-    // Finish up the last remaining stack space requested, getting the last
-    // bits out of r11
-    sub    %r11,%rsp
-    test   %rsp,8(%rsp)
-
-    // Restore the stack pointer to what it previously was when entering
-    // this function. The caller will readjust the stack pointer after we
-    // return.
-    add    %rax,%rsp
-
-    leave
-    .cfi_def_cfa_register %rsp
-    .cfi_adjust_cfa_offset -8
-    ret
-    .cfi_endproc
+            .cfi_startproc
+            pushq  %rbp
+            .cfi_adjust_cfa_offset 8
+            .cfi_offset %rbp, -16
+            movq   %rsp, %rbp
+            .cfi_def_cfa_register %rbp
+
+            mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
+
+            // Main loop, taken in one page increments. We're decrementing rsp by
+            // a page each time until there's less than a page remaining. We're
+            // guaranteed that this function isn't called unless there's more than a
+            // page needed.
+            //
+            // Note that we're also testing against `8(%rsp)` to account for the 8
+            // bytes pushed on the stack orginally with our return address. Using
+            // `8(%rsp)` simulates us testing the stack pointer in the caller's
+            // context.
+
+            // It's usually called when %rax >= 0x1000, but that's not always true.
+            // Dynamic stack allocation, which is needed to implement unsized
+            // rvalues, triggers stackprobe even if %rax < 0x1000.
+            // Thus we have to check %r11 first to avoid segfault.
+            cmp    $0x1000,%r11
+            jna    3f
+        2:
+            sub    $0x1000,%rsp
+            test   %rsp,8(%rsp)
+            sub    $0x1000,%r11
+            cmp    $0x1000,%r11
+            ja     2b
+
+        3:
+            // Finish up the last remaining stack space requested, getting the last
+            // bits out of r11
+            sub    %r11,%rsp
+            test   %rsp,8(%rsp)
+
+            // Restore the stack pointer to what it previously was when entering
+            // this function. The caller will readjust the stack pointer after we
+            // return.
+            add    %rax,%rsp
+
+            leave
+            .cfi_def_cfa_register %rsp
+            .cfi_adjust_cfa_offset -8
+    ",
+    #[cfg(not(all(target_env = "sgx", target_vendor = "fortanix")))]
+    "       ret",
+    #[cfg(all(target_env = "sgx", target_vendor = "fortanix"))]
     "
-    ),
-    options(att_syntax)
-);
-
-// This function is the same as above, except that some instructions are
-// [manually patched for LVI].
-//
-// [manually patched for LVI]: https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
-#[cfg(all(
-    target_arch = "x86_64",
-    all(target_env = "sgx", target_vendor = "fortanix")
-))]
-core::arch::global_asm!(
-    define_rust_probestack!(
-        "
-    .cfi_startproc
-    pushq  %rbp
-    .cfi_adjust_cfa_offset 8
-    .cfi_offset %rbp, -16
-    movq   %rsp, %rbp
-    .cfi_def_cfa_register %rbp
-
-    mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
-
-    // Main loop, taken in one page increments. We're decrementing rsp by
-    // a page each time until there's less than a page remaining. We're
-    // guaranteed that this function isn't called unless there's more than a
-    // page needed.
-    //
-    // Note that we're also testing against `8(%rsp)` to account for the 8
-    // bytes pushed on the stack orginally with our return address. Using
-    // `8(%rsp)` simulates us testing the stack pointer in the caller's
-    // context.
-
-    // It's usually called when %rax >= 0x1000, but that's not always true.
-    // Dynamic stack allocation, which is needed to implement unsized
-    // rvalues, triggers stackprobe even if %rax < 0x1000.
-    // Thus we have to check %r11 first to avoid segfault.
-    cmp    $0x1000,%r11
-    jna    3f
-2:
-    sub    $0x1000,%rsp
-    test   %rsp,8(%rsp)
-    sub    $0x1000,%r11
-    cmp    $0x1000,%r11
-    ja     2b
-
-3:
-    // Finish up the last remaining stack space requested, getting the last
-    // bits out of r11
-    sub    %r11,%rsp
-    test   %rsp,8(%rsp)
-
-    // Restore the stack pointer to what it previously was when entering
-    // this function. The caller will readjust the stack pointer after we
-    // return.
-    add    %rax,%rsp
-
-    leave
-    .cfi_def_cfa_register %rsp
-    .cfi_adjust_cfa_offset -8
-    pop %r11
-    lfence
-    jmp *%r11
-    .cfi_endproc
+            // for this target, [manually patch for LVI].
+            //
+            // [manually patch for LVI]: https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
+            pop %r11
+            lfence
+            jmp *%r11
+    ",
     "
-    ),
-    options(att_syntax)
-);
+            .cfi_endproc
+    ",
+        options(att_syntax)
+    )
+}
 
 #[cfg(all(target_arch = "x86", not(target_os = "uefi")))]
 // This is the same as x86_64 above, only translated for 32-bit sizes. Note
@@ -259,42 +129,44 @@ core::arch::global_asm!(
 // function basically can't tamper with anything.
 //
 // The ABI here is the same as x86_64, except everything is 32-bits large.
-core::arch::global_asm!(
-    define_rust_probestack!(
+#[unsafe(naked)]
+#[rustc_std_internal_symbol]
+pub unsafe extern "custom" fn __rust_probestack() {
+    core::arch::naked_asm!(
         "
-    .cfi_startproc
-    push   %ebp
-    .cfi_adjust_cfa_offset 4
-    .cfi_offset %ebp, -8
-    mov    %esp, %ebp
-    .cfi_def_cfa_register %ebp
-    push   %ecx
-    mov    %eax,%ecx
-
-    cmp    $0x1000,%ecx
-    jna    3f
-2:
-    sub    $0x1000,%esp
-    test   %esp,8(%esp)
-    sub    $0x1000,%ecx
-    cmp    $0x1000,%ecx
-    ja     2b
-
-3:
-    sub    %ecx,%esp
-    test   %esp,8(%esp)
-
-    add    %eax,%esp
-    pop    %ecx
-    leave
-    .cfi_def_cfa_register %esp
-    .cfi_adjust_cfa_offset -4
-    ret
-    .cfi_endproc
-    "
-    ),
-    options(att_syntax)
-);
+            .cfi_startproc
+            push   %ebp
+            .cfi_adjust_cfa_offset 4
+            .cfi_offset %ebp, -8
+            mov    %esp, %ebp
+            .cfi_def_cfa_register %ebp
+            push   %ecx
+            mov    %eax,%ecx
+
+            cmp    $0x1000,%ecx
+            jna    3f
+        2:
+            sub    $0x1000,%esp
+            test   %esp,8(%esp)
+            sub    $0x1000,%ecx
+            cmp    $0x1000,%ecx
+            ja     2b
+
+        3:
+            sub    %ecx,%esp
+            test   %esp,8(%esp)
+
+            add    %eax,%esp
+            pop    %ecx
+            leave
+            .cfi_def_cfa_register %esp
+            .cfi_adjust_cfa_offset -4
+            ret
+            .cfi_endproc
+    ",
+        options(att_syntax)
+    )
+}
 
 #[cfg(all(target_arch = "x86", target_os = "uefi"))]
 // UEFI target is windows like target. LLVM will do _chkstk things like windows.
@@ -307,44 +179,46 @@ core::arch::global_asm!(
 //   MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
 //   MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
 //   themselves.
-core::arch::global_asm!(
-    define_rust_probestack!(
+#[unsafe(naked)]
+#[rustc_std_internal_symbol]
+pub unsafe extern "custom" fn __rust_probestack() {
+    core::arch::naked_asm!(
         "
-    .cfi_startproc
-    push   %ebp
-    .cfi_adjust_cfa_offset 4
-    .cfi_offset %ebp, -8
-    mov    %esp, %ebp
-    .cfi_def_cfa_register %ebp
-    push   %ecx
-    push   %edx
-    mov    %eax,%ecx
-
-    cmp    $0x1000,%ecx
-    jna    3f
-2:
-    sub    $0x1000,%esp
-    test   %esp,8(%esp)
-    sub    $0x1000,%ecx
-    cmp    $0x1000,%ecx
-    ja     2b
-
-3:
-    sub    %ecx,%esp
-    test   %esp,8(%esp)
-    mov    4(%ebp),%edx
-    mov    %edx, 12(%esp)
-    add    %eax,%esp
-    pop    %edx
-    pop    %ecx
-    leave
-
-    sub   %eax, %esp
-    .cfi_def_cfa_register %esp
-    .cfi_adjust_cfa_offset -4
-    ret
-    .cfi_endproc
-    "
-    ),
-    options(att_syntax)
-);
+            .cfi_startproc
+            push   %ebp
+            .cfi_adjust_cfa_offset 4
+            .cfi_offset %ebp, -8
+            mov    %esp, %ebp
+            .cfi_def_cfa_register %ebp
+            push   %ecx
+            push   %edx
+            mov    %eax,%ecx
+
+            cmp    $0x1000,%ecx
+            jna    3f
+        2:
+            sub    $0x1000,%esp
+            test   %esp,8(%esp)
+            sub    $0x1000,%ecx
+            cmp    $0x1000,%ecx
+            ja     2b
+
+        3:
+            sub    %ecx,%esp
+            test   %esp,8(%esp)
+            mov    4(%ebp),%edx
+            mov    %edx, 12(%esp)
+            add    %eax,%esp
+            pop    %edx
+            pop    %ecx
+            leave
+
+            sub   %eax, %esp
+            .cfi_def_cfa_register %esp
+            .cfi_adjust_cfa_offset -4
+            ret
+            .cfi_endproc
+    ",
+        options(att_syntax)
+    )
+}
diff --git a/compiler-builtins/src/x86.rs b/compiler-builtins/src/x86.rs
index 01152d9c7..51940b3b3 100644
--- a/compiler-builtins/src/x86.rs
+++ b/compiler-builtins/src/x86.rs
@@ -2,30 +2,24 @@
 
 use core::intrinsics;
 
-// NOTE These functions are implemented using assembly because they using a custom
+// NOTE These functions are implemented using assembly because they use a custom
 // calling convention which can't be implemented using a normal Rust function
 
 // NOTE These functions are never mangled as they are not tested against compiler-rt
 
 intrinsics! {
     #[unsafe(naked)]
-    #[cfg(all(
-        any(all(windows, target_env = "gnu"), target_os = "uefi"),
-        not(feature = "no-asm")
-    ))]
-    pub unsafe extern "C" fn __chkstk() {
+    #[cfg(any(all(windows, target_env = "gnu"), target_os = "uefi"))]
+    pub unsafe extern "custom" fn __chkstk() {
         core::arch::naked_asm!(
-            "jmp __alloca", // Jump to __alloca since fallthrough may be unreliable"
-            options(att_syntax)
+            "jmp {}", // Jump to `_alloca` since fallthrough may be unreliable
+            sym crate::x86::_alloca::_alloca,
         );
     }
 
     #[unsafe(naked)]
-    #[cfg(all(
-        any(all(windows, target_env = "gnu"), target_os = "uefi"),
-        not(feature = "no-asm")
-    ))]
-    pub unsafe extern "C" fn _alloca() {
+    #[cfg(any(all(windows, target_env = "gnu"), target_os = "uefi"))]
+    pub unsafe extern "custom" fn _alloca() {
         // __chkstk and _alloca are the same function
         core::arch::naked_asm!(
             "push   %ecx",
diff --git a/compiler-builtins/src/x86_64.rs b/compiler-builtins/src/x86_64.rs
index fc1190f79..f9ae784d5 100644
--- a/compiler-builtins/src/x86_64.rs
+++ b/compiler-builtins/src/x86_64.rs
@@ -2,22 +2,15 @@
 
 use core::intrinsics;
 
-// NOTE These functions are implemented using assembly because they using a custom
+// NOTE These functions are implemented using assembly because they use a custom
 // calling convention which can't be implemented using a normal Rust function
 
 // NOTE These functions are never mangled as they are not tested against compiler-rt
 
 intrinsics! {
     #[unsafe(naked)]
-    #[cfg(all(
-        any(
-            all(windows, target_env = "gnu"),
-            target_os = "cygwin",
-            target_os = "uefi"
-        ),
-        not(feature = "no-asm")
-    ))]
-    pub unsafe extern "C" fn ___chkstk_ms() {
+    #[cfg(any(all(windows, target_env = "gnu"), target_os = "cygwin", target_os = "uefi"))]
+    pub unsafe extern "custom" fn ___chkstk_ms() {
         core::arch::naked_asm!(
             "push   %rcx",
             "push   %rax",
diff --git a/crates/libm-macros/Cargo.toml b/crates/libm-macros/Cargo.toml
index 3929854f0..100a8d0ec 100644
--- a/crates/libm-macros/Cargo.toml
+++ b/crates/libm-macros/Cargo.toml
@@ -10,9 +10,9 @@ proc-macro = true
 
 [dependencies]
 heck = "0.5.0"
-proc-macro2 = "1.0.94"
+proc-macro2 = "1.0.95"
 quote = "1.0.40"
-syn = { version = "2.0.100", features = ["full", "extra-traits", "visit-mut"] }
+syn = { version = "2.0.104", features = ["full", "extra-traits", "visit-mut"] }
 
 [lints.rust]
 # Values used during testing
diff --git a/crates/libm-macros/src/lib.rs b/crates/libm-macros/src/lib.rs
index e8afe3aad..7efa1488f 100644
--- a/crates/libm-macros/src/lib.rs
+++ b/crates/libm-macros/src/lib.rs
@@ -266,27 +266,27 @@ fn validate(input: &mut StructuredInput) -> syn::Result<Vec<&'static MathOpInfo>
         }
     }
 
-    if let Some(map) = &input.fn_extra {
-        if !map.keys().any(|key| key == "_") {
-            // No default provided; make sure every expected function is covered
-            let mut fns_not_covered = Vec::new();
-            for func in &fn_list {
-                if !map.keys().any(|key| key == func.name) {
-                    // `name` was not mentioned in the `match` statement
-                    fns_not_covered.push(func);
-                }
+    if let Some(map) = &input.fn_extra
+        && !map.keys().any(|key| key == "_")
+    {
+        // No default provided; make sure every expected function is covered
+        let mut fns_not_covered = Vec::new();
+        for func in &fn_list {
+            if !map.keys().any(|key| key == func.name) {
+                // `name` was not mentioned in the `match` statement
+                fns_not_covered.push(func);
             }
+        }
 
-            if !fns_not_covered.is_empty() {
-                let e = syn::Error::new(
-                    input.fn_extra_span.unwrap(),
-                    format!(
-                        "`fn_extra`: no default `_` pattern specified and the following \
-                         patterns are not covered: {fns_not_covered:#?}"
-                    ),
-                );
-                return Err(e);
-            }
+        if !fns_not_covered.is_empty() {
+            let e = syn::Error::new(
+                input.fn_extra_span.unwrap(),
+                format!(
+                    "`fn_extra`: no default `_` pattern specified and the following \
+                     patterns are not covered: {fns_not_covered:#?}"
+                ),
+            );
+            return Err(e);
         }
     };
 
diff --git a/crates/musl-math-sys/Cargo.toml b/crates/musl-math-sys/Cargo.toml
index d3fb147e5..39f6fa906 100644
--- a/crates/musl-math-sys/Cargo.toml
+++ b/crates/musl-math-sys/Cargo.toml
@@ -11,4 +11,4 @@ license = "MIT OR Apache-2.0"
 libm = { path = "../../libm" }
 
 [build-dependencies]
-cc = "1.2.16"
+cc = "1.2.29"
diff --git a/crates/musl-math-sys/build.rs b/crates/musl-math-sys/build.rs
index b00dbc73e..59e42f2d2 100644
--- a/crates/musl-math-sys/build.rs
+++ b/crates/musl-math-sys/build.rs
@@ -120,7 +120,7 @@ fn build_musl_math(cfg: &Config) {
     let arch_dir = musl_dir.join("arch").join(&cfg.musl_arch);
     assert!(
         math.exists(),
-        "musl source not found. Is the submodule up to date?"
+        "musl source not found. You may need to run `./ci/update-musl.sh`."
     );
 
     let source_map = find_math_source(&math, cfg);
diff --git a/crates/musl-math-sys/musl b/crates/musl-math-sys/musl
deleted file mode 160000
index c47ad25ea..000000000
--- a/crates/musl-math-sys/musl
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit c47ad25ea3b484e10326f933e927c0bc8cded3da
diff --git a/crates/musl-math-sys/src/lib.rs b/crates/musl-math-sys/src/lib.rs
index 6a4bf4859..9cab8deef 100644
--- a/crates/musl-math-sys/src/lib.rs
+++ b/crates/musl-math-sys/src/lib.rs
@@ -40,8 +40,6 @@ macro_rules! functions {
     ) => {
         // Run a simple check to ensure we can link and call the function without crashing.
         #[test]
-        // FIXME(#309): LE PPC crashes calling some musl functions
-        #[cfg_attr(all(target_arch = "powerpc64", target_endian = "little"), ignore)]
         fn $name() {
             <fn($($aty),+) -> $rty>::check(super::$name);
         }
diff --git a/crates/panic-handler/src/lib.rs b/crates/panic-handler/src/lib.rs
index 673e00522..f4d7c8397 100644
--- a/crates/panic-handler/src/lib.rs
+++ b/crates/panic-handler/src/lib.rs
@@ -1,11 +1,8 @@
 //! This is needed for tests on targets that require a `#[panic_handler]` function
 
-#![feature(no_core)]
-#![no_core]
-
-extern crate core;
+#![no_std]
 
 #[panic_handler]
-fn panic(_: &core::panic::PanicInfo) -> ! {
+fn panic(_: &core::panic::PanicInfo<'_>) -> ! {
     loop {}
 }
diff --git a/crates/symbol-check/Cargo.toml b/crates/symbol-check/Cargo.toml
new file mode 100644
index 000000000..e2218b491
--- /dev/null
+++ b/crates/symbol-check/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "symbol-check"
+version = "0.1.0"
+edition = "2024"
+publish = false
+
+[dependencies]
+object = "0.37.1"
+serde_json = "1.0.140"
+
+[features]
+wasm = ["object/wasm"]
diff --git a/crates/symbol-check/src/main.rs b/crates/symbol-check/src/main.rs
new file mode 100644
index 000000000..4e9455233
--- /dev/null
+++ b/crates/symbol-check/src/main.rs
@@ -0,0 +1,327 @@
+//! Tool used by CI to inspect compiler-builtins archives and help ensure we won't run into any
+//! linking errors.
+
+use std::collections::{BTreeMap, BTreeSet};
+use std::fs;
+use std::io::{BufRead, BufReader};
+use std::path::{Path, PathBuf};
+use std::process::{Command, Stdio};
+
+use object::read::archive::{ArchiveFile, ArchiveMember};
+use object::{
+    File as ObjFile, Object, ObjectSection, ObjectSymbol, Symbol, SymbolKind, SymbolScope,
+};
+use serde_json::Value;
+
+const CHECK_LIBRARIES: &[&str] = &["compiler_builtins", "builtins_test_intrinsics"];
+const CHECK_EXTENSIONS: &[Option<&str>] = &[Some("rlib"), Some("a"), Some("exe"), None];
+
+const USAGE: &str = "Usage:
+
+    symbol-check build-and-check [TARGET] -- CARGO_BUILD_ARGS ...
+
+Cargo will get invoked with `CARGO_BUILD_ARGS` and the specified target. All output
+`compiler_builtins*.rlib` files will be checked.
+
+If TARGET is not specified, the host target is used.
+";
+
+fn main() {
+    // Borrow each argument as a `&str` so the slice patterns below can match string literals.
+    let args = std::env::args().collect::<Vec<_>>();
+    let args_ref = args.iter().map(String::as_str).collect::<Vec<_>>();
+
+    match &args_ref[1..] {
+        ["build-and-check", target, "--", args @ ..] if !args.is_empty() => {
+            check_cargo_args(args);
+            run_build_and_check(target, args);
+        }
+        ["build-and-check", "--", args @ ..] if !args.is_empty() => {
+            check_cargo_args(args);
+            run_build_and_check(&host_target(), args); // no TARGET given: use the host
+        }
+        _ => {
+            println!("{USAGE}");
+            std::process::exit(1); // unrecognized invocation
+        }
+    }
+}
+
+/// Make sure `--target` isn't passed to avoid confusion (since it should be provided only once,
+/// positionally). Other flags that merely share the prefix, e.g. `--target-dir`, are allowed.
+fn check_cargo_args(args: &[&str]) {
+    for arg in args {
+        assert!(
+            *arg != "--target" && !arg.starts_with("--target="),
+            "target must be passed positionally. {USAGE}"
+        );
+    }
+}
+
+/// Build with Cargo for `target`, then run every symbol check on each collected artifact.
+fn run_build_and_check(target: &str, args: &[&str]) {
+    for artifact in exec_cargo_with_args(target, args) {
+        println!("Checking {}", artifact.display());
+        let archive = Archive::from_path(&artifact);
+
+        verify_no_duplicates(&archive);
+        verify_core_symbols(&archive);
+    }
+}
+
+/// Ask `rustc --version --verbose` for the host target triple (its `host: ` line).
+fn host_target() -> String {
+    let rustc = Command::new("rustc")
+        .args(["--version", "--verbose"])
+        .output()
+        .unwrap();
+    assert!(rustc.status.success());
+    let text = String::from_utf8(rustc.stdout).unwrap();
+    text.lines()
+        .find_map(|line| line.strip_prefix("host: "))
+        .unwrap()
+        .to_owned()
+}
+
+/// Run `cargo build` with the provided additional arguments, collecting the list of created
+/// libraries. Panics if Cargo fails or if nothing matching `CHECK_LIBRARIES` was produced.
+fn exec_cargo_with_args(target: &str, args: &[&str]) -> Vec<PathBuf> {
+    let mut cmd = Command::new("cargo");
+    cmd.args([
+        "build",
+        "--target",
+        target,
+        "--message-format=json-diagnostic-rendered-ansi",
+    ])
+    .args(args)
+    .stdout(Stdio::piped());
+
+    println!("running: {cmd:?}");
+    let mut child = cmd.spawn().expect("failed to launch Cargo");
+
+    let stdout = child.stdout.take().unwrap();
+    let reader = BufReader::new(stdout);
+    let mut check_files = Vec::new();
+
+    for line in reader.lines() {
+        let line = line.expect("failed to read line");
+        let j: Value = serde_json::from_str(&line).expect("failed to deserialize");
+        let reason = &j["reason"];
+
+        // Forward output that is meant to be user-facing
+        if reason == "compiler-message" {
+            println!("{}", j["message"]["rendered"].as_str().unwrap());
+        } else if reason == "build-finished" {
+            println!("build finished. success: {}", j["success"]);
+        } else if reason == "build-script-executed" {
+            let pretty = serde_json::to_string_pretty(&j).unwrap();
+            println!("build script output: {pretty}");
+        }
+
+        // Only interested in the artifact list now
+        if reason != "compiler-artifact" {
+            continue;
+        }
+
+        // Find rlibs in the created file list that match our expected library names and
+        // extensions.
+        for fpath in j["filenames"].as_array().expect("filenames not an array") {
+            let path = fpath.as_str().expect("file name not a string");
+            let path = PathBuf::from(path);
+
+            if CHECK_EXTENSIONS.contains(&path.extension().map(|ex| ex.to_str().unwrap())) {
+                let fname = path.file_name().unwrap().to_str().unwrap();
+
+                if CHECK_LIBRARIES.iter().any(|lib| fname.contains(lib)) {
+                    check_files.push(path);
+                }
+            }
+        }
+    }
+
+    assert!(child.wait().expect("failed to wait on Cargo").success());
+
+    assert!(!check_files.is_empty(), "no compiler_builtins rlibs found");
+    println!("Collected the following rlibs to check: {check_files:#?}");
+
+    check_files
+}
+
+/// One symbol's properties as read via `object`, plus the archive member that contains it.
+#[expect(unused)] // only for printing
+#[derive(Clone, Debug)]
+struct SymInfo {
+    name: String,
+    kind: SymbolKind,
+    scope: SymbolScope,
+    section: String,
+    is_undefined: bool,
+    is_global: bool,
+    is_local: bool,
+    is_weak: bool,
+    is_common: bool,
+    address: u64,
+    object: String,
+}
+
+impl SymInfo {
+    fn new(sym: &Symbol, obj: &ObjFile, member: &ArchiveMember) -> Self {
+        // Use the section's name when it resolves, else the `Debug` output of `sym.section()`.
+        let section = sym.section();
+        let section_name = sym
+            .section()
+            .index()
+            .and_then(|idx| obj.section_by_index(idx).ok())
+            .and_then(|sec| sec.name().ok())
+            .map(ToString::to_string)
+            .unwrap_or_else(|| format!("{section:?}"));
+
+        Self {
+            name: sym.name().expect("missing name").to_owned(),
+            kind: sym.kind(),
+            scope: sym.scope(),
+            section: section_name,
+            is_undefined: sym.is_undefined(),
+            is_global: sym.is_global(),
+            is_local: sym.is_local(),
+            is_weak: sym.is_weak(),
+            is_common: sym.is_common(),
+            address: sym.address(),
+            object: String::from_utf8_lossy(member.name()).into_owned(),
+        }
+    }
+}
+
+/// Ensure that the same global symbol isn't defined in multiple object files within an archive.
+///
+/// Note that this will also locate cases where a symbol is weakly defined in more than one place.
+/// Technically there are no linker errors that will come from this, but it keeps our binary more
+/// straightforward and saves some distribution size.
+fn verify_no_duplicates(archive: &Archive) {
+    let mut syms = BTreeMap::<String, SymInfo>::new();
+    let mut dups = Vec::new();
+    let mut found_any = false;
+
+    archive.for_each_symbol(|symbol, obj, member| {
+        // Only check defined globals
+        if !symbol.is_global() || symbol.is_undefined() {
+            return;
+        }
+
+        let sym = SymInfo::new(&symbol, obj, member);
+
+        // x86-32 includes multiple copies of thunk symbols
+        if sym.name.starts_with("__x86.get_pc_thunk") {
+            return;
+        }
+
+        // GDB pretty printing symbols may show up more than once but are weak.
+        if sym.section == ".debug_gdb_scripts" && sym.is_weak {
+            return;
+        }
+
+        // Windows has symbols for literal numeric constants, string literals, and MinGW
+        // pseudo-relocations. These are allowed to have repeated definitions.
+        let win_allowed_dup_pfx = ["__real@", "__xmm@", "__ymm@", "??_C@_", ".refptr"];
+        if win_allowed_dup_pfx
+            .iter()
+            .any(|pfx| sym.name.starts_with(pfx))
+        {
+            return;
+        }
+
+        match syms.get(&sym.name) {
+            Some(existing) => {
+                dups.push(sym);
+                dups.push(existing.clone());
+            }
+            None => {
+                syms.insert(sym.name.clone(), sym);
+            }
+        }
+        // Reached only by defined globals that were not skipped by one of the filters above.
+        found_any = true;
+    });
+
+    assert!(found_any, "no symbols found");
+
+    if !dups.is_empty() {
+        dups.sort_unstable_by(|a, b| a.name.cmp(&b.name));
+        panic!("found duplicate symbols: {dups:#?}");
+    }
+
+    println!("    success: no duplicate symbols found");
+}
+
+/// Ensure that there are no references to symbols from `core` that aren't also (somehow) defined.
+fn verify_core_symbols(archive: &Archive) {
+    let mut defined = BTreeSet::new();
+    let mut undefined = Vec::new();
+    let mut has_symbols = false;
+
+    archive.for_each_symbol(|symbol, obj, member| {
+        has_symbols = true; // set for every symbol, including ones that are not from `core`
+
+        // Keep `core` symbols only. NOTE(review): `_ZN4core` presumably misses v0-mangled names.
+        if !symbol.name().unwrap().contains("_ZN4core") {
+            return;
+        }
+
+        let sym = SymInfo::new(&symbol, obj, member);
+        if sym.is_undefined {
+            undefined.push(sym);
+        } else {
+            defined.insert(sym.name);
+        }
+    });
+
+    assert!(has_symbols, "no symbols found");
+
+    // Discard any symbols that are defined somewhere in the archive
+    undefined.retain(|sym| !defined.contains(&sym.name));
+
+    if !undefined.is_empty() {
+        undefined.sort_unstable_by(|a, b| a.name.cmp(&b.name));
+        panic!("found undefined symbols from core: {undefined:#?}");
+    }
+
+    println!("    success: no undefined references to core found");
+}
+
+/// Owns the bytes of an archive file; `object` parses borrowed views out of them.
+struct Archive {
+    data: Vec<u8>,
+}
+
+impl Archive {
+    fn from_path(path: &Path) -> Self {
+        let data = fs::read(path).expect("reading file failed");
+        Self { data }
+    }
+
+    /// Parse the owned bytes as an archive.
+    fn file(&self) -> ArchiveFile<'_> {
+        ArchiveFile::parse(self.data.as_slice()).expect("archive parse failed")
+    }
+
+    /// Parse every archive member as an object file and hand it to `f`.
+    fn for_each_object(&self, mut f: impl FnMut(ObjFile, &ArchiveMember)) {
+        for entry in self.file().members() {
+            let member = entry.expect("failed to access member");
+            let bytes = member
+                .data(self.data.as_slice())
+                .expect("failed to access object");
+            let obj = ObjFile::parse(bytes).expect("failed to parse object");
+            f(obj, &member);
+        }
+    }
+
+    /// Call `f` for each symbol of each object file in the archive.
+    fn for_each_symbol(&self, mut f: impl FnMut(Symbol, &ObjFile, &ArchiveMember)) {
+        self.for_each_object(|obj, member| {
+            for sym in obj.symbols() {
+                f(sym, &obj, member);
+            }
+        });
+    }
+}
diff --git a/etc/thumbv7em-none-eabi-renamed.json b/etc/thumbv7em-none-eabi-renamed.json
new file mode 100644
index 000000000..81273d44e
--- /dev/null
+++ b/etc/thumbv7em-none-eabi-renamed.json
@@ -0,0 +1,23 @@
+{
+  "abi": "eabi",
+  "arch": "arm",
+  "c-enum-min-bits": 8,
+  "crt-objects-fallback": "false",
+  "data-layout": "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64",
+  "emit-debug-gdb-scripts": false,
+  "frame-pointer": "always",
+  "linker": "rust-lld",
+  "linker-flavor": "gnu-lld",
+  "llvm-floatabi": "soft",
+  "llvm-target": "thumbv7em-none-eabi",
+  "max-atomic-width": 32,
+  "metadata": {
+    "description": "Bare ARMv7E-M",
+    "host_tools": false,
+    "std": false,
+    "tier": 2
+  },
+  "panic-strategy": "abort",
+  "relocation-model": "static",
+  "target-pointer-width": "32"
+}
diff --git a/josh-sync.toml b/josh-sync.toml
new file mode 100644
index 000000000..599a12af8
--- /dev/null
+++ b/josh-sync.toml
@@ -0,0 +1,3 @@
+org = "rust-lang"
+repo = "compiler-builtins"
+path = "library/compiler-builtins"
diff --git a/libm-test/Cargo.toml b/libm-test/Cargo.toml
index 7a306e735..0af6b0c1d 100644
--- a/libm-test/Cargo.toml
+++ b/libm-test/Cargo.toml
@@ -6,7 +6,7 @@ publish = false
 license = "MIT OR Apache-2.0"
 
 [features]
-default = ["build-mpfr", "build-musl", "unstable-float"]
+default = ["build-mpfr", "unstable-float"]
 
 # Propagated from libm because this affects which functions we test.
 unstable-float = ["libm/unstable-float", "rug?/nightly-float"]
@@ -28,28 +28,28 @@ icount = ["dep:iai-callgrind"]
 short-benchmarks = []
 
 [dependencies]
-anyhow = "1.0.97"
+anyhow = "1.0.98"
 # This is not directly used but is required so we can enable `gmp-mpfr-sys/force-cross`.
-gmp-mpfr-sys = { version = "1.6.4", optional = true, default-features = false }
-iai-callgrind = { version = "0.14.0", optional = true }
-indicatif = { version = "0.17.11", default-features = false }
+gmp-mpfr-sys = { version = "1.6.5", optional = true, default-features = false }
+iai-callgrind = { version = "0.15.2", optional = true }
+indicatif = { version = "0.18.0", default-features = false }
 libm = { path = "../libm", features = ["unstable-public-internals"] }
 libm-macros = { path = "../crates/libm-macros" }
 musl-math-sys = { path = "../crates/musl-math-sys", optional = true }
 paste = "1.0.15"
-rand = "0.9.0"
+rand = "0.9.1"
 rand_chacha = "0.9.0"
 rayon = "1.10.0"
 rug = { version = "1.27.0", optional = true, default-features = false, features = ["float", "integer", "std"] }
 
 [target.'cfg(target_family = "wasm")'.dependencies]
-getrandom = { version = "0.3.2", features = ["wasm_js"] }
+getrandom = { version = "0.3.3", features = ["wasm_js"] }
 
 [build-dependencies]
-rand = { version = "0.9.0", optional = true }
+rand = { version = "0.9.1", optional = true }
 
 [dev-dependencies]
-criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] }
+criterion = { version = "0.6.0", default-features = false, features = ["cargo_bench_support"] }
 libtest-mimic = "0.8.1"
 
 [[bench]]
diff --git a/libm-test/benches/icount.rs b/libm-test/benches/icount.rs
index da8c6bfd1..02ee13f80 100644
--- a/libm-test/benches/icount.rs
+++ b/libm-test/benches/icount.rs
@@ -1,9 +1,11 @@
 //! Benchmarks that use `iai-cachegrind` to be reasonably CI-stable.
+#![feature(f16)]
+#![feature(f128)]
 
 use std::hint::black_box;
 
 use iai_callgrind::{library_benchmark, library_benchmark_group, main};
-use libm::support::{HInt, u256};
+use libm::support::{HInt, Hexf, hf16, hf32, hf64, hf128, u256};
 use libm_test::generate::spaced;
 use libm_test::{CheckBasis, CheckCtx, GeneratorKind, MathOp, OpRustArgs, TupleCall, op};
 
@@ -21,7 +23,7 @@ macro_rules! icount_benches {
                 let mut ctx = CheckCtx::new(
                     Op::IDENTIFIER,
                     CheckBasis::None,
-                    GeneratorKind::QuickSpaced
+                    GeneratorKind::Spaced
                 );
                 ctx.override_iterations(BENCH_ITER_ITEMS);
                 let ret = spaced::get_test_cases::<Op>(&ctx).0.collect::<Vec<_>>();
@@ -109,11 +111,6 @@ fn icount_bench_u128_widen_mul(cases: Vec<(u128, u128)>) {
     }
 }
 
-library_benchmark_group!(
-    name = icount_bench_u128_widen_mul_group;
-    benchmarks = icount_bench_u128_widen_mul
-);
-
 #[library_benchmark]
 #[bench::linspace(setup_u256_add())]
 fn icount_bench_u256_add(cases: Vec<(u256, u256)>) {
@@ -122,10 +119,21 @@ fn icount_bench_u256_add(cases: Vec<(u256, u256)>) {
     }
 }
 
-library_benchmark_group!(
-    name = icount_bench_u256_add_group;
-    benchmarks = icount_bench_u256_add
-);
+#[library_benchmark]
+#[bench::linspace(setup_u256_add())]
+fn icount_bench_u256_sub(cases: Vec<(u256, u256)>) {
+    for (x, y) in cases.iter().copied() {
+        black_box(black_box(x) - black_box(y));
+    }
+}
+
+#[library_benchmark]
+#[bench::linspace(setup_u256_shift())]
+fn icount_bench_u256_shl(cases: Vec<(u256, u32)>) {
+    for (x, y) in cases.iter().copied() {
+        black_box(black_box(x) << black_box(y));
+    }
+}
 
 #[library_benchmark]
 #[bench::linspace(setup_u256_shift())]
@@ -136,16 +144,90 @@ fn icount_bench_u256_shr(cases: Vec<(u256, u32)>) {
 }
 
 library_benchmark_group!(
-    name = icount_bench_u256_shr_group;
-    benchmarks = icount_bench_u256_shr
+    name = icount_bench_u128_group;
+    benchmarks = icount_bench_u128_widen_mul, icount_bench_u256_add, icount_bench_u256_sub, icount_bench_u256_shl, icount_bench_u256_shr
+);
+
+#[library_benchmark]
+#[bench::short("0x12.34p+8")]
+#[bench::max("0x1.ffcp+15")]
+fn icount_bench_hf16(s: &str) -> f16 {
+    black_box(hf16(s))
+}
+
+#[library_benchmark]
+#[bench::short("0x12.34p+8")]
+#[bench::max("0x1.fffffep+127")]
+fn icount_bench_hf32(s: &str) -> f32 {
+    black_box(hf32(s))
+}
+
+#[library_benchmark]
+#[bench::short("0x12.34p+8")]
+#[bench::max("0x1.fffffffffffffp+1023")]
+fn icount_bench_hf64(s: &str) -> f64 {
+    black_box(hf64(s))
+}
+
+#[library_benchmark]
+#[bench::short("0x12.34p+8")]
+#[bench::max("0x1.ffffffffffffffffffffffffffffp+16383")]
+fn icount_bench_hf128(s: &str) -> f128 {
+    black_box(hf128(s))
+}
+
+library_benchmark_group!(
+    name = icount_bench_hf_parse_group;
+    benchmarks =
+    icount_bench_hf16,
+    icount_bench_hf32,
+    icount_bench_hf64,
+    icount_bench_hf128
+);
+
+#[library_benchmark]
+#[bench::short(1.015625)]
+#[bench::max(f16::MAX)]
+fn icount_bench_print_hf16(x: f16) -> String {
+    black_box(Hexf(x).to_string())
+}
+
+#[library_benchmark]
+#[bench::short(1.015625)]
+#[bench::max(f32::MAX)]
+fn icount_bench_print_hf32(x: f32) -> String {
+    black_box(Hexf(x).to_string())
+}
+
+#[library_benchmark]
+#[bench::short(1.015625)]
+#[bench::max(f64::MAX)]
+fn icount_bench_print_hf64(x: f64) -> String {
+    black_box(Hexf(x).to_string())
+}
+
+#[library_benchmark]
+#[bench::short(1.015625)]
+#[bench::max(f128::MAX)]
+fn icount_bench_print_hf128(x: f128) -> String {
+    black_box(Hexf(x).to_string())
+}
+
+library_benchmark_group!(
+    name = icount_bench_hf_print_group;
+    benchmarks =
+    icount_bench_print_hf16,
+    icount_bench_print_hf32,
+    icount_bench_print_hf64,
+    icount_bench_print_hf128
 );
 
 main!(
     library_benchmark_groups =
-    // u256-related benchmarks
-    icount_bench_u128_widen_mul_group,
-    icount_bench_u256_add_group,
-    icount_bench_u256_shr_group,
+    // Benchmarks not related to public libm math
+    icount_bench_u128_group,
+    icount_bench_hf_parse_group,
+    icount_bench_hf_print_group,
     // verify-apilist-start
     // verify-sorted-start
     icount_bench_acos_group,
diff --git a/libm-test/examples/plot_domains.rs b/libm-test/examples/plot_domains.rs
index 3563103b8..7331d454f 100644
--- a/libm-test/examples/plot_domains.rs
+++ b/libm-test/examples/plot_domains.rs
@@ -55,7 +55,7 @@ where
     Op: MathOp<FTy = f32, RustArgs = (f32,)>,
     Op::RustArgs: SpacedInput<Op>,
 {
-    let mut ctx = CheckCtx::new(Op::IDENTIFIER, CheckBasis::Mpfr, GeneratorKind::QuickSpaced);
+    let mut ctx = CheckCtx::new(Op::IDENTIFIER, CheckBasis::Mpfr, GeneratorKind::Spaced);
     plot_one_generator(
         out_dir,
         &ctx,
diff --git a/libm-test/src/generate/edge_cases.rs b/libm-test/src/generate/edge_cases.rs
index 2fb074638..4e4a782a1 100644
--- a/libm-test/src/generate/edge_cases.rs
+++ b/libm-test/src/generate/edge_cases.rs
@@ -51,6 +51,7 @@ where
 
     // Check some special values that aren't included in the above ranges
     values.push(Op::FTy::NAN);
+    values.push(Op::FTy::NEG_NAN);
     values.extend(Op::FTy::consts().iter());
 
     // Check around the maximum subnormal value
diff --git a/libm-test/src/precision.rs b/libm-test/src/precision.rs
index f5fb5f670..3fb8c1b37 100644
--- a/libm-test/src/precision.rs
+++ b/libm-test/src/precision.rs
@@ -271,18 +271,6 @@ impl MaybeOverride<(f32,)> for SpecialCase {
 
 impl MaybeOverride<(f64,)> for SpecialCase {
     fn check_float<F: Float>(input: (f64,), actual: F, expected: F, ctx: &CheckCtx) -> CheckAction {
-        if cfg!(x86_no_sse)
-            && ctx.base_name == BaseName::Ceil
-            && ctx.basis == CheckBasis::Musl
-            && input.0 < 0.0
-            && input.0 > -1.0
-            && expected == F::ZERO
-            && actual == F::ZERO
-        {
-            // musl returns -0.0, we return +0.0
-            return XFAIL("i586 ceil signed zero");
-        }
-
         if cfg!(x86_no_sse)
             && (ctx.base_name == BaseName::Rint || ctx.base_name == BaseName::Roundeven)
             && (expected - actual).abs() <= F::ONE
@@ -292,16 +280,6 @@ impl MaybeOverride<(f64,)> for SpecialCase {
             return XFAIL("i586 rint rounding mode");
         }
 
-        if cfg!(x86_no_sse)
-            && (ctx.fn_ident == Identifier::Ceil || ctx.fn_ident == Identifier::Floor)
-            && expected.eq_repr(F::NEG_ZERO)
-            && actual.eq_repr(F::ZERO)
-        {
-            // FIXME: the x87 implementations do not keep the distinction between -0.0 and 0.0.
-            // See https://github.com/rust-lang/libm/pull/404#issuecomment-2572399955
-            return XFAIL("i586 ceil/floor signed zero");
-        }
-
         if cfg!(x86_no_sse)
             && (ctx.fn_ident == Identifier::Exp10 || ctx.fn_ident == Identifier::Exp2)
         {
@@ -381,7 +359,7 @@ fn unop_common<F1: Float, F2: Float>(
         }
 
         // abs and copysign require signaling NaNs to be propagated, so verify bit equality.
-        if actual.to_bits() == expected.to_bits() {
+        if actual.biteq(expected) {
             return CheckAction::Custom(Ok(()));
         } else {
             return CheckAction::Custom(Err(anyhow::anyhow!("NaNs have different bitpatterns")));
@@ -444,13 +422,18 @@ fn binop_common<F1: Float, F2: Float>(
     expected: F2,
     ctx: &CheckCtx,
 ) -> CheckAction {
-    // MPFR only has one NaN bitpattern; allow the default `.is_nan()` checks to validate. Skip if
-    // the first input (magnitude source) is NaN and the output is also a NaN, or if the second
-    // input (sign source) is NaN.
-    if ctx.basis == CheckBasis::Mpfr
+    // MPFR only has one NaN bitpattern; skip tests in cases where the first argument would take
+    // the sign of a NaN second argument. The default NaN checks cover other cases.
+    if ctx.base_name == BaseName::Copysign && ctx.basis == CheckBasis::Mpfr && input.1.is_nan() {
+        return SKIP;
+    }
+
+    // FIXME(#939): this should not be skipped, there is a bug in our implementation.
+    if ctx.base_name == BaseName::FmaximumNum
+        && ctx.basis == CheckBasis::Mpfr
         && ((input.0.is_nan() && actual.is_nan() && expected.is_nan()) || input.1.is_nan())
     {
-        return SKIP;
+        return XFAIL_NOCHECK;
     }
 
     /* FIXME(#439): our fmin and fmax do not compare signed zeros */
diff --git a/libm-test/src/run_cfg.rs b/libm-test/src/run_cfg.rs
index 3345a01d2..90f81195c 100644
--- a/libm-test/src/run_cfg.rs
+++ b/libm-test/src/run_cfg.rs
@@ -22,13 +22,38 @@ static EXTENSIVE_ITER_OVERRIDE: LazyLock<Option<u64>> = LazyLock::new(|| {
 
 /// Specific tests that need to have a reduced amount of iterations to complete in a reasonable
 /// amount of time.
-///
-/// Contains the itentifier+generator combo to match on, plus the factor to reduce by.
-const EXTEMELY_SLOW_TESTS: &[(Identifier, GeneratorKind, u64)] = &[
-    (Identifier::Fmodf128, GeneratorKind::QuickSpaced, 50),
-    (Identifier::Fmodf128, GeneratorKind::Extensive, 50),
+const EXTREMELY_SLOW_TESTS: &[SlowTest] = &[
+    SlowTest {
+        ident: Identifier::Fmodf128,
+        gen_kind: GeneratorKind::Spaced,
+        extensive: false,
+        reduce_factor: 50,
+    },
+    SlowTest {
+        ident: Identifier::Fmodf128,
+        gen_kind: GeneratorKind::Spaced,
+        extensive: true,
+        reduce_factor: 50,
+    },
 ];
 
+/// A pattern to match a `CheckCtx`, plus a factor to reduce by.
+struct SlowTest {
+    ident: Identifier,
+    gen_kind: GeneratorKind,
+    extensive: bool,
+    reduce_factor: u64,
+}
+
+impl SlowTest {
+    /// True if the test in `CheckCtx` should be reduced by `reduce_factor`.
+    fn matches_ctx(&self, ctx: &CheckCtx) -> bool {
+        self.ident == ctx.fn_ident
+            && self.gen_kind == ctx.gen_kind
+            && self.extensive == ctx.extensive
+    }
+}
+
 /// Maximum number of iterations to run for a single routine.
 ///
 /// The default value of one greater than `u32::MAX` allows testing single-argument `f32` routines
@@ -54,6 +79,7 @@ pub struct CheckCtx {
     /// Source of truth for tests.
     pub basis: CheckBasis,
     pub gen_kind: GeneratorKind,
+    pub extensive: bool,
     /// If specified, this value will override the value returned by [`iteration_count`].
     pub override_iterations: Option<u64>,
 }
@@ -69,12 +95,19 @@ impl CheckCtx {
             base_name_str: fn_ident.base_name().as_str(),
             basis,
             gen_kind,
+            extensive: false,
             override_iterations: None,
         };
         ret.ulp = crate::default_ulp(&ret);
         ret
     }
 
+    /// Configure whether this is an extensive test.
+    pub fn extensive(mut self, extensive: bool) -> Self {
+        self.extensive = extensive;
+        self
+    }
+
     /// The number of input arguments for this function.
     pub fn input_count(&self) -> usize {
         self.fn_ident.math_op().rust_sig.args.len()
@@ -100,14 +133,17 @@ pub enum CheckBasis {
 /// and quantity.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum GeneratorKind {
+    /// Extremes, zeros, nonstandard numbers, etc.
     EdgeCases,
-    Extensive,
-    QuickSpaced,
+    /// Inputs spaced logarithmically (floats) or linearly (integers).
+    Spaced,
+    /// Test inputs from an RNG.
     Random,
+    /// A provided test case list.
     List,
 }
 
-/// A list of all functions that should get extensive tests.
+/// A list of all functions that should get extensive tests, as configured by environment variable.
 ///
 /// This also supports the special test name `all` to run all tests, as well as `all_f16`,
 /// `all_f32`, `all_f64`, and `all_f128` to run all tests for a specific float type.
@@ -216,17 +252,17 @@ pub fn iteration_count(ctx: &CheckCtx, argnum: usize) -> u64 {
     let random_iter_count = domain_iter_count / 100;
 
     let mut total_iterations = match ctx.gen_kind {
-        GeneratorKind::QuickSpaced => domain_iter_count,
+        GeneratorKind::Spaced if ctx.extensive => extensive_max_iterations(),
+        GeneratorKind::Spaced => domain_iter_count,
         GeneratorKind::Random => random_iter_count,
-        GeneratorKind::Extensive => extensive_max_iterations(),
         GeneratorKind::EdgeCases | GeneratorKind::List => {
             unimplemented!("shoudn't need `iteration_count` for {:?}", ctx.gen_kind)
         }
     };
 
     // Larger float types get more iterations.
-    if t_env.large_float_ty && ctx.gen_kind != GeneratorKind::Extensive {
-        if ctx.gen_kind == GeneratorKind::Extensive {
+    if t_env.large_float_ty {
+        if ctx.extensive {
             // Extensive already has a pretty high test count.
             total_iterations *= 2;
         } else {
@@ -244,13 +280,13 @@ pub fn iteration_count(ctx: &CheckCtx, argnum: usize) -> u64 {
     }
 
     // Some tests are significantly slower than others and need to be further reduced.
-    if let Some((_id, _gen, scale)) = EXTEMELY_SLOW_TESTS
+    if let Some(slow) = EXTREMELY_SLOW_TESTS
         .iter()
-        .find(|(id, generator, _scale)| *id == ctx.fn_ident && *generator == ctx.gen_kind)
+        .find(|slow| slow.matches_ctx(ctx))
     {
         // However, do not override if the extensive iteration count has been manually set.
-        if !(ctx.gen_kind == GeneratorKind::Extensive && EXTENSIVE_ITER_OVERRIDE.is_some()) {
-            total_iterations /= scale;
+        if !(ctx.extensive && EXTENSIVE_ITER_OVERRIDE.is_some()) {
+            total_iterations /= slow.reduce_factor;
         }
     }
 
@@ -279,7 +315,7 @@ pub fn iteration_count(ctx: &CheckCtx, argnum: usize) -> u64 {
     let total = ntests.pow(t_env.input_count.try_into().unwrap());
 
     let seed_msg = match ctx.gen_kind {
-        GeneratorKind::QuickSpaced | GeneratorKind::Extensive => String::new(),
+        GeneratorKind::Spaced => String::new(),
         GeneratorKind::Random => {
             format!(
                 " using `{SEED_ENV}={}`",
@@ -327,8 +363,8 @@ pub fn int_range(ctx: &CheckCtx, argnum: usize) -> RangeInclusive<i32> {
     let extensive_range = (-0xfff)..=0xfffff;
 
     match ctx.gen_kind {
-        GeneratorKind::Extensive => extensive_range,
-        GeneratorKind::QuickSpaced | GeneratorKind::Random => non_extensive_range,
+        _ if ctx.extensive => extensive_range,
+        GeneratorKind::Spaced | GeneratorKind::Random => non_extensive_range,
         GeneratorKind::EdgeCases => extensive_range,
         GeneratorKind::List => unimplemented!("shoudn't need range for {:?}", ctx.gen_kind),
     }
diff --git a/libm-test/src/test_traits.rs b/libm-test/src/test_traits.rs
index dbb970161..278274d91 100644
--- a/libm-test/src/test_traits.rs
+++ b/libm-test/src/test_traits.rs
@@ -312,12 +312,9 @@ where
     let mut inner = || -> TestResult {
         let mut allowed_ulp = ctx.ulp;
 
-        // Forbid overrides if the items came from an explicit list, as long as we are checking
-        // against either MPFR or the result itself.
-        let require_biteq = ctx.gen_kind == GeneratorKind::List && ctx.basis != CheckBasis::Musl;
-
         match SpecialCase::check_float(input, actual, expected, ctx) {
-            _ if require_biteq => (),
+            // Forbid overrides if the items came from an explicit list
+            _ if ctx.gen_kind == GeneratorKind::List => (),
             CheckAction::AssertSuccess => (),
             CheckAction::AssertFailure(msg) => assert_failure_msg = Some(msg),
             CheckAction::Custom(res) => return res,
@@ -327,12 +324,20 @@ where
 
         // Check when both are NaNs
         if actual.is_nan() && expected.is_nan() {
-            if require_biteq && ctx.basis == CheckBasis::None {
-                ensure!(
-                    actual.to_bits() == expected.to_bits(),
-                    "mismatched NaN bitpatterns"
-                );
+            // Don't assert NaN bitwise equality if:
+            //
+            // * Testing against MPFR (there is a single NaN representation)
+            // * Testing against Musl except for explicit tests (Musl does some NaN quieting)
+            //
+            // In these cases, simply checking that actual and expected are both NaNs is
+            // sufficient.
+            let skip_nan_biteq = ctx.basis == CheckBasis::Mpfr
+                || (ctx.basis == CheckBasis::Musl && ctx.gen_kind != GeneratorKind::List);
+
+            if !skip_nan_biteq {
+                ensure!(actual.biteq(expected), "mismatched NaN bitpatterns");
             }
+
             // By default, NaNs have nothing special to check.
             return Ok(());
         } else if actual.is_nan() || expected.is_nan() {
diff --git a/libm-test/tests/compare_built_musl.rs b/libm-test/tests/compare_built_musl.rs
index 6ccbb6f4c..86f3b8b71 100644
--- a/libm-test/tests/compare_built_musl.rs
+++ b/libm-test/tests/compare_built_musl.rs
@@ -65,7 +65,7 @@ macro_rules! musl_tests {
             $(#[$attr])*
             fn [< musl_quickspace_ $fn_name >]() {
                 type Op = libm_test::op::$fn_name::Routine;
-                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::QuickSpaced);
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::Spaced);
                 let cases = spaced::get_test_cases::<Op>(&ctx).0;
                 musl_runner::<Op>(&ctx, cases, musl_math_sys::$fn_name);
             }
diff --git a/libm-test/tests/multiprecision.rs b/libm-test/tests/multiprecision.rs
index 80b2c7868..60175ae61 100644
--- a/libm-test/tests/multiprecision.rs
+++ b/libm-test/tests/multiprecision.rs
@@ -55,7 +55,7 @@ macro_rules! mp_tests {
             $(#[$attr])*
             fn [< mp_quickspace_ $fn_name >]() {
                 type Op = libm_test::op::$fn_name::Routine;
-                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::QuickSpaced);
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::Spaced);
                 let cases = spaced::get_test_cases::<Op>(&ctx).0;
                 mp_runner::<Op>(&ctx, cases);
             }
diff --git a/libm-test/tests/u256.rs b/libm-test/tests/u256.rs
index 8cbb3ad22..d1c5cfbcc 100644
--- a/libm-test/tests/u256.rs
+++ b/libm-test/tests/u256.rs
@@ -111,12 +111,54 @@ fn mp_u256_add() {
         let y = random_u256(&mut rng);
         assign_bigint(&mut bx, x);
         assign_bigint(&mut by, y);
-        let actual = x + y;
+        let actual = if u256::MAX - x >= y {
+            x + y
+        } else {
+            // otherwise (u256::MAX - x) < y, so the wrapped result is
+            // (x + y) - (u256::MAX + 1) == y - (u256::MAX - x) - 1
+            y - (u256::MAX - x) - 1_u128.widen()
+        };
         bx += &by;
         check_one(|| hexu(x), || Some(hexu(y)), actual, &mut bx);
     }
 }
 
+#[test]
+fn mp_u256_sub() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+    let mut by = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let y = random_u256(&mut rng);
+        assign_bigint(&mut bx, x);
+        assign_bigint(&mut by, y);
+
+        // since the operators (may) panic on overflow,
+        // we should test something that doesn't
+        let actual = if x >= y { x - y } else { y - x };
+        bx -= &by;
+        bx.abs_mut();
+        check_one(|| hexu(x), || Some(hexu(y)), actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_shl() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let shift: u32 = rng.random_range(0..256);
+        assign_bigint(&mut bx, x);
+        let actual = x << shift;
+        bx <<= shift;
+        check_one(|| hexu(x), || Some(shift.to_string()), actual, &mut bx);
+    }
+}
+
 #[test]
 fn mp_u256_shr() {
     let mut rng = ChaCha8Rng::from_seed(*SEED);
@@ -124,7 +166,7 @@ fn mp_u256_shr() {
 
     for _ in 0..bigint_fuzz_iteration_count() {
         let x = random_u256(&mut rng);
-        let shift: u32 = rng.random_range(0..255);
+        let shift: u32 = rng.random_range(0..256);
         assign_bigint(&mut bx, x);
         let actual = x >> shift;
         bx >>= shift;
diff --git a/libm-test/tests/z_extensive/run.rs b/libm-test/tests/z_extensive/run.rs
index 59c806ce7..e04e00c6d 100644
--- a/libm-test/tests/z_extensive/run.rs
+++ b/libm-test/tests/z_extensive/run.rs
@@ -17,7 +17,6 @@ use rayon::prelude::*;
 use spaced::SpacedInput;
 
 const BASIS: CheckBasis = CheckBasis::Mpfr;
-const GEN_KIND: GeneratorKind = GeneratorKind::Extensive;
 
 /// Run the extensive test suite.
 pub fn run() {
@@ -77,7 +76,7 @@ where
     Op::RustArgs: SpacedInput<Op> + Send,
 {
     let test_name = format!("mp_extensive_{}", Op::NAME);
-    let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GEN_KIND);
+    let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::Spaced).extensive(true);
     let skip = skip_extensive_test(&ctx);
 
     let runner = move || {
@@ -198,15 +197,15 @@ impl Progress {
 
     fn update(&self, completed: u64, input: impl fmt::Debug) {
         // Infrequently update the progress bar.
-        if completed % 20_000 == 0 {
+        if completed.is_multiple_of(20_000) {
             self.pb.set_position(completed);
         }
 
-        if completed % 500_000 == 0 {
+        if completed.is_multiple_of(500_000) {
             self.pb.set_message(format!("input: {input:<24?}"));
         }
 
-        if !self.is_tty && completed % 5_000_000 == 0 {
+        if !self.is_tty && completed.is_multiple_of(5_000_000) {
             let len = self.pb.length().unwrap_or_default();
             eprintln!(
                 "[{elapsed:3?}s {percent:3.0}%] {name} \
diff --git a/libm/Cargo.toml b/libm/Cargo.toml
index b6fb5efcf..63b4d3c27 100644
--- a/libm/Cargo.toml
+++ b/libm/Cargo.toml
@@ -1,14 +1,12 @@
 [package]
+name = "libm"
+version = "0.2.15"
 authors = ["Jorge Aparicio <jorge@japaric.io>"]
-categories = ["no-std"]
 description = "libm in pure Rust"
-documentation = "https://docs.rs/libm"
+categories = ["no-std"]
 keywords = ["libm", "math"]
-license = "MIT"
-name = "libm"
-readme = "README.md"
 repository = "https://github.com/rust-lang/compiler-builtins"
-version = "0.2.15"
+license = "MIT"
 edition = "2021"
 rust-version = "1.63"
 
diff --git a/libm/README.md b/libm/README.md
index 349e892df..77608db3d 100644
--- a/libm/README.md
+++ b/libm/README.md
@@ -34,7 +34,7 @@ Usage is under the MIT license, available at
 ### Contribution
 
 Contributions are licensed under both the MIT license and the Apache License,
-Version 2.0, available at <htps://www.apache.org/licenses/LICENSE-2.0>. Unless
+Version 2.0, available at <https://www.apache.org/licenses/LICENSE-2.0>. Unless
 you explicitly state otherwise, any contribution intentionally submitted for
 inclusion in the work by you, as defined in the Apache-2.0 license, shall be
 dual licensed as mentioned, without any additional terms or conditions.
diff --git a/libm/configure.rs b/libm/configure.rs
index 2a497c7b1..76186e636 100644
--- a/libm/configure.rs
+++ b/libm/configure.rs
@@ -3,12 +3,14 @@
 use std::env;
 use std::path::PathBuf;
 
+#[derive(Debug)]
 #[allow(dead_code)]
 pub struct Config {
     pub manifest_dir: PathBuf,
     pub out_dir: PathBuf,
     pub opt_level: String,
     pub cargo_features: Vec<String>,
+    pub target_triple: String,
     pub target_arch: String,
     pub target_env: String,
     pub target_family: Option<String>,
@@ -16,10 +18,13 @@ pub struct Config {
     pub target_string: String,
     pub target_vendor: String,
     pub target_features: Vec<String>,
+    pub reliable_f128: bool,
+    pub reliable_f16: bool,
 }
 
 impl Config {
     pub fn from_env() -> Self {
+        let target_triple = env::var("TARGET").unwrap();
         let target_features = env::var("CARGO_CFG_TARGET_FEATURE")
             .map(|feats| feats.split(',').map(ToOwned::to_owned).collect())
             .unwrap_or_default();
@@ -29,6 +34,7 @@ impl Config {
             .collect();
 
         Self {
+            target_triple,
             manifest_dir: PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()),
             out_dir: PathBuf::from(env::var("OUT_DIR").unwrap()),
             opt_level: env::var("OPT_LEVEL").unwrap(),
@@ -40,6 +46,10 @@ impl Config {
             target_string: env::var("TARGET").unwrap(),
             target_vendor: env::var("CARGO_CFG_TARGET_VENDOR").unwrap(),
             target_features,
+            // Note that these are unstable options, so only show up with the nightly compiler or
+            // with `RUSTC_BOOTSTRAP=1` (which is required to use the types anyway).
+            reliable_f128: env::var_os("CARGO_CFG_TARGET_HAS_RELIABLE_F128").is_some(),
+            reliable_f16: env::var_os("CARGO_CFG_TARGET_HAS_RELIABLE_F16").is_some(),
         }
     }
 }
@@ -128,62 +138,18 @@ fn emit_f16_f128_cfg(cfg: &Config) {
         return;
     }
 
-    // Set whether or not `f16` and `f128` are supported at a basic level by LLVM. This only means
-    // that the backend will not crash when using these types and generates code that can be called
-    // without crashing (no infinite recursion). This does not mean that the platform doesn't have
-    // ABI or other bugs.
-    //
-    // We do this here rather than in `rust-lang/rust` because configuring via cargo features is
-    // not straightforward.
-    //
-    // Original source of this list:
-    // <https://github.com/rust-lang/compiler-builtins/pull/652#issuecomment-2266151350>
-    let f16_enabled = match cfg.target_arch.as_str() {
-        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
-        "arm64ec" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/50374>
-        "s390x" => false,
-        // Infinite recursion <https://github.com/llvm/llvm-project/issues/97981>
-        // FIXME(llvm): loongarch fixed by <https://github.com/llvm/llvm-project/pull/107791>
-        "csky" => false,
-        "hexagon" => false,
-        "loongarch64" => false,
-        "mips" | "mips64" | "mips32r6" | "mips64r6" => false,
-        "powerpc" | "powerpc64" => false,
-        "sparc" | "sparc64" => false,
-        "wasm32" | "wasm64" => false,
-        // Most everything else works as of LLVM 19
-        _ => true,
-    };
-
-    let f128_enabled = match cfg.target_arch.as_str() {
-        // Unsupported (libcall is not supported) <https://github.com/llvm/llvm-project/issues/121122>
-        "amdgpu" => false,
-        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
-        "arm64ec" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/96432>
-        "mips64" | "mips64r6" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/95471>
-        "nvptx64" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/101545>
-        "powerpc64" if &cfg.target_os == "aix" => false,
-        // Selection failure <https://github.com/llvm/llvm-project/issues/41838>
-        "sparc" => false,
-        // Most everything else works as of LLVM 19
-        _ => true,
-    };
-
-    // If the feature is set, disable these types.
-    let disable_both = env::var_os("CARGO_FEATURE_NO_F16_F128").is_some();
+    /* See the compiler-builtins configure file for info about the meaning of these options */
 
-    println!("cargo:rustc-check-cfg=cfg(f16_enabled)");
-    println!("cargo:rustc-check-cfg=cfg(f128_enabled)");
+    // If the feature is set, disable both of these types.
+    let no_f16_f128 = cfg.cargo_features.iter().any(|s| s == "no-f16-f128");
 
-    if f16_enabled && !disable_both {
+    println!("cargo:rustc-check-cfg=cfg(f16_enabled)");
+    if cfg.reliable_f16 && !no_f16_f128 {
         println!("cargo:rustc-cfg=f16_enabled");
     }
 
-    if f128_enabled && !disable_both {
+    println!("cargo:rustc-check-cfg=cfg(f128_enabled)");
+    if cfg.reliable_f128 && !no_f16_f128 {
         println!("cargo:rustc-cfg=f128_enabled");
     }
 }
diff --git a/libm/src/math/acos.rs b/libm/src/math/acos.rs
index 23b13251e..89b2e7c5f 100644
--- a/libm/src/math/acos.rs
+++ b/libm/src/math/acos.rs
@@ -59,7 +59,7 @@ fn r(z: f64) -> f64 {
 /// Computes the inverse cosine (arc cosine) of the input value.
 /// Arguments must be in the range -1 to 1.
 /// Returns values in radians, in the range of 0 to pi.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn acos(x: f64) -> f64 {
     let x1p_120f = f64::from_bits(0x3870000000000000); // 0x1p-120 === 2 ^ -120
     let z: f64;
diff --git a/libm/src/math/acosf.rs b/libm/src/math/acosf.rs
index dd88eea5b..d263b3f2c 100644
--- a/libm/src/math/acosf.rs
+++ b/libm/src/math/acosf.rs
@@ -33,7 +33,7 @@ fn r(z: f32) -> f32 {
 /// Computes the inverse cosine (arc cosine) of the input value.
 /// Arguments must be in the range -1 to 1.
 /// Returns values in radians, in the range of 0 to pi.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn acosf(x: f32) -> f32 {
     let x1p_120 = f32::from_bits(0x03800000); // 0x1p-120 === 2 ^ (-120)
 
diff --git a/libm/src/math/acosh.rs b/libm/src/math/acosh.rs
index d1f5b9fa9..8737bad01 100644
--- a/libm/src/math/acosh.rs
+++ b/libm/src/math/acosh.rs
@@ -7,7 +7,7 @@ const LN2: f64 = 0.693147180559945309417232121458176568; /* 0x3fe62e42,  0xfefa3
 /// Calculates the inverse hyperbolic cosine of `x`.
 /// Is defined as `log(x + sqrt(x*x-1))`.
 /// `x` must be a number greater than or equal to 1.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn acosh(x: f64) -> f64 {
     let u = x.to_bits();
     let e = ((u >> 52) as usize) & 0x7ff;
diff --git a/libm/src/math/acoshf.rs b/libm/src/math/acoshf.rs
index ad3455fdd..432fa03f1 100644
--- a/libm/src/math/acoshf.rs
+++ b/libm/src/math/acoshf.rs
@@ -7,7 +7,7 @@ const LN2: f32 = 0.693147180559945309417232121458176568;
 /// Calculates the inverse hyperbolic cosine of `x`.
 /// Is defined as `log(x + sqrt(x*x-1))`.
 /// `x` must be a number greater than or equal to 1.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn acoshf(x: f32) -> f32 {
     let u = x.to_bits();
     let a = u & 0x7fffffff;
diff --git a/libm/src/math/arch/aarch64.rs b/libm/src/math/arch/aarch64.rs
index 020bb731c..8896804b5 100644
--- a/libm/src/math/arch/aarch64.rs
+++ b/libm/src/math/arch/aarch64.rs
@@ -30,6 +30,12 @@ pub fn fmaf(mut x: f32, y: f32, z: f32) -> f32 {
     x
 }
 
+// NB: `frintx` is technically the correct instruction for C's `rint`. However, in Rust (and LLVM
+// by default), `rint` is identical to `roundeven` (no fpenv interaction) so we use the
+// side-effect-free `frintn`.
+//
+// In general, C code that calls Rust's libm should assume that fpenv is ignored.
+
 pub fn rint(mut x: f64) -> f64 {
     // SAFETY: `frintn` is available with neon and has no side effects.
     //
diff --git a/libm/src/math/arch/i586.rs b/libm/src/math/arch/i586.rs
index f92b9a2af..b9a667620 100644
--- a/libm/src/math/arch/i586.rs
+++ b/libm/src/math/arch/i586.rs
@@ -1,37 +1,62 @@
 //! Architecture-specific support for x86-32 without SSE2
+//!
+//! We use an alternative implementation on x86, because the
+//! main implementation fails with the x87 FPU used by
+//! debian i386, probably due to excess precision issues.
+//!
+//! See <https://github.com/rust-lang/compiler-builtins/pull/976> for discussion on why these
+//! functions are implemented in this way.
 
-use super::super::fabs;
-
-/// Use an alternative implementation on x86, because the
-/// main implementation fails with the x87 FPU used by
-/// debian i386, probably due to excess precision issues.
-/// Basic implementation taken from https://github.com/rust-lang/libm/issues/219.
-pub fn ceil(x: f64) -> f64 {
-    if fabs(x).to_bits() < 4503599627370496.0_f64.to_bits() {
-        let truncated = x as i64 as f64;
-        if truncated < x {
-            return truncated + 1.0;
-        } else {
-            return truncated;
-        }
-    } else {
-        return x;
+pub fn ceil(mut x: f64) -> f64 {
+    unsafe {
+        core::arch::asm!(
+            "fld qword ptr [{x}]",
+            // Save the FPU control word, using `x` as scratch space.
+            "fstcw [{x}]",
+            // Set rounding control to 0b10 (+∞).
+            "mov word ptr [{x} + 2], 0x0b7f",
+            "fldcw [{x} + 2]",
+            // Round.
+            "frndint",
+            // Restore FPU control word.
+            "fldcw [{x}]",
+            // Save rounded value to memory.
+            "fstp qword ptr [{x}]",
+            x = in(reg) &mut x,
+            // The x87 FPU stack is used, so all x87 registers must be marked as clobbered
+            out("st(0)") _, out("st(1)") _,
+            out("st(2)") _, out("st(3)") _,
+            out("st(4)") _, out("st(5)") _,
+            out("st(6)") _, out("st(7)") _,
+            options(nostack),
+        );
     }
+    x
 }
 
-/// Use an alternative implementation on x86, because the
-/// main implementation fails with the x87 FPU used by
-/// debian i386, probably due to excess precision issues.
-/// Basic implementation taken from https://github.com/rust-lang/libm/issues/219.
-pub fn floor(x: f64) -> f64 {
-    if fabs(x).to_bits() < 4503599627370496.0_f64.to_bits() {
-        let truncated = x as i64 as f64;
-        if truncated > x {
-            return truncated - 1.0;
-        } else {
-            return truncated;
-        }
-    } else {
-        return x;
+pub fn floor(mut x: f64) -> f64 {
+    unsafe {
+        core::arch::asm!(
+            "fld qword ptr [{x}]",
+            // Save the FPU control word, using `x` as scratch space.
+            "fstcw [{x}]",
+            // Set rounding control to 0b01 (-∞).
+            "mov word ptr [{x} + 2], 0x077f",
+            "fldcw [{x} + 2]",
+            // Round.
+            "frndint",
+            // Restore FPU control word.
+            "fldcw [{x}]",
+            // Save rounded value to memory.
+            "fstp qword ptr [{x}]",
+            x = in(reg) &mut x,
+            // The x87 FPU stack is used, so all x87 registers must be marked as clobbered
+            out("st(0)") _, out("st(1)") _,
+            out("st(2)") _, out("st(3)") _,
+            out("st(4)") _, out("st(5)") _,
+            out("st(6)") _, out("st(7)") _,
+            options(nostack),
+        );
     }
+    x
 }
diff --git a/libm/src/math/asin.rs b/libm/src/math/asin.rs
index 12d0cd35f..9554a3eac 100644
--- a/libm/src/math/asin.rs
+++ b/libm/src/math/asin.rs
@@ -66,7 +66,7 @@ fn comp_r(z: f64) -> f64 {
 /// Computes the inverse sine (arc sine) of the argument `x`.
 /// Arguments to asin must be in the range -1 to 1.
 /// Returns values in radians, in the range of -pi/2 to pi/2.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn asin(mut x: f64) -> f64 {
     let z: f64;
     let r: f64;
diff --git a/libm/src/math/asinf.rs b/libm/src/math/asinf.rs
index ed6855567..2dfe2a6d4 100644
--- a/libm/src/math/asinf.rs
+++ b/libm/src/math/asinf.rs
@@ -35,7 +35,7 @@ fn r(z: f32) -> f32 {
 /// Computes the inverse sine (arc sine) of the argument `x`.
 /// Arguments to asin must be in the range -1 to 1.
 /// Returns values in radians, in the range of -pi/2 to pi/2.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn asinf(mut x: f32) -> f32 {
     let x1p_120 = f64::from_bits(0x3870000000000000); // 0x1p-120 === 2 ^ (-120)
 
diff --git a/libm/src/math/asinh.rs b/libm/src/math/asinh.rs
index 75d3c3ad4..d63bc0aa9 100644
--- a/libm/src/math/asinh.rs
+++ b/libm/src/math/asinh.rs
@@ -7,7 +7,7 @@ const LN2: f64 = 0.693147180559945309417232121458176568; /* 0x3fe62e42,  0xfefa3
 ///
 /// Calculates the inverse hyperbolic sine of `x`.
 /// Is defined as `sgn(x)*log(|x|+sqrt(x*x+1))`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn asinh(mut x: f64) -> f64 {
     let mut u = x.to_bits();
     let e = ((u >> 52) as usize) & 0x7ff;
diff --git a/libm/src/math/asinhf.rs b/libm/src/math/asinhf.rs
index 27ed9dd37..3ca2d4489 100644
--- a/libm/src/math/asinhf.rs
+++ b/libm/src/math/asinhf.rs
@@ -7,7 +7,7 @@ const LN2: f32 = 0.693147180559945309417232121458176568;
 ///
 /// Calculates the inverse hyperbolic sine of `x`.
 /// Is defined as `sgn(x)*log(|x|+sqrt(x*x+1))`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn asinhf(mut x: f32) -> f32 {
     let u = x.to_bits();
     let i = u & 0x7fffffff;
diff --git a/libm/src/math/atan.rs b/libm/src/math/atan.rs
index 4ca5cc91a..0590ba87c 100644
--- a/libm/src/math/atan.rs
+++ b/libm/src/math/atan.rs
@@ -65,7 +65,7 @@ const AT: [f64; 11] = [
 ///
 /// Computes the inverse tangent (arc tangent) of the input value.
 /// Returns a value in radians, in the range of -pi/2 to pi/2.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn atan(x: f64) -> f64 {
     let mut x = x;
     let mut ix = (x.to_bits() >> 32) as u32;
diff --git a/libm/src/math/atan2.rs b/libm/src/math/atan2.rs
index c668731cf..51456e409 100644
--- a/libm/src/math/atan2.rs
+++ b/libm/src/math/atan2.rs
@@ -47,7 +47,7 @@ const PI_LO: f64 = 1.2246467991473531772E-16; /* 0x3CA1A626, 0x33145C07 */
 /// Computes the inverse tangent (arc tangent) of `y/x`.
 /// Produces the correct result even for angles near pi/2 or -pi/2 (that is, when `x` is near 0).
 /// Returns a value in radians, in the range of -pi to pi.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn atan2(y: f64, x: f64) -> f64 {
     if x.is_nan() || y.is_nan() {
         return x + y;
diff --git a/libm/src/math/atan2f.rs b/libm/src/math/atan2f.rs
index 95b466fff..0f46c9f39 100644
--- a/libm/src/math/atan2f.rs
+++ b/libm/src/math/atan2f.rs
@@ -23,7 +23,7 @@ const PI_LO: f32 = -8.7422776573e-08; /* 0xb3bbbd2e */
 /// Computes the inverse tangent (arc tangent) of `y/x`.
 /// Produces the correct result even for angles near pi/2 or -pi/2 (that is, when `x` is near 0).
 /// Returns a value in radians, in the range of -pi to pi.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn atan2f(y: f32, x: f32) -> f32 {
     if x.is_nan() || y.is_nan() {
         return x + y;
diff --git a/libm/src/math/atanf.rs b/libm/src/math/atanf.rs
index da8daa41a..58568d9a8 100644
--- a/libm/src/math/atanf.rs
+++ b/libm/src/math/atanf.rs
@@ -41,7 +41,7 @@ const A_T: [f32; 5] = [
 ///
 /// Computes the inverse tangent (arc tangent) of the input value.
 /// Returns a value in radians, in the range of -pi/2 to pi/2.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn atanf(mut x: f32) -> f32 {
     let x1p_120 = f32::from_bits(0x03800000); // 0x1p-120 === 2 ^ (-120)
 
diff --git a/libm/src/math/atanh.rs b/libm/src/math/atanh.rs
index 9dc826f56..883ff150f 100644
--- a/libm/src/math/atanh.rs
+++ b/libm/src/math/atanh.rs
@@ -5,7 +5,7 @@ use super::log1p;
 ///
 /// Calculates the inverse hyperbolic tangent of `x`.
 /// Is defined as `log((1+x)/(1-x))/2 = log1p(2x/(1-x))/2`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn atanh(x: f64) -> f64 {
     let u = x.to_bits();
     let e = ((u >> 52) as usize) & 0x7ff;
diff --git a/libm/src/math/atanhf.rs b/libm/src/math/atanhf.rs
index 80ccec1f6..e4e356d18 100644
--- a/libm/src/math/atanhf.rs
+++ b/libm/src/math/atanhf.rs
@@ -5,7 +5,7 @@ use super::log1pf;
 ///
 /// Calculates the inverse hyperbolic tangent of `x`.
 /// Is defined as `log((1+x)/(1-x))/2 = log1p(2x/(1-x))/2`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn atanhf(mut x: f32) -> f32 {
     let mut u = x.to_bits();
     let sign = (u >> 31) != 0;
diff --git a/libm/src/math/cbrt.rs b/libm/src/math/cbrt.rs
index cf56f7a97..e905e15f1 100644
--- a/libm/src/math/cbrt.rs
+++ b/libm/src/math/cbrt.rs
@@ -8,7 +8,7 @@ use super::Float;
 use super::support::{FpResult, Round, cold_path};
 
 /// Compute the cube root of the argument.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn cbrt(x: f64) -> f64 {
     cbrt_round(x, Round::Nearest).val
 }
diff --git a/libm/src/math/cbrtf.rs b/libm/src/math/cbrtf.rs
index 9d70305c6..9d6958483 100644
--- a/libm/src/math/cbrtf.rs
+++ b/libm/src/math/cbrtf.rs
@@ -25,7 +25,7 @@ const B2: u32 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
 /// Cube root (f32)
 ///
 /// Computes the cube root of the argument.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn cbrtf(x: f32) -> f32 {
     let x1p24 = f32::from_bits(0x4b800000); // 0x1p24f === 2 ^ 24
 
diff --git a/libm/src/math/ceil.rs b/libm/src/math/ceil.rs
index 4e1035457..2cac49f29 100644
--- a/libm/src/math/ceil.rs
+++ b/libm/src/math/ceil.rs
@@ -2,7 +2,7 @@
 ///
 /// Finds the nearest integer greater than or equal to `x`.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ceilf16(x: f16) -> f16 {
     super::generic::ceil(x)
 }
@@ -10,7 +10,7 @@ pub fn ceilf16(x: f16) -> f16 {
 /// Ceil (f32)
 ///
 /// Finds the nearest integer greater than or equal to `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ceilf(x: f32) -> f32 {
     select_implementation! {
         name: ceilf,
@@ -24,7 +24,7 @@ pub fn ceilf(x: f32) -> f32 {
 /// Ceil (f64)
 ///
 /// Finds the nearest integer greater than or equal to `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ceil(x: f64) -> f64 {
     select_implementation! {
         name: ceil,
@@ -40,7 +40,7 @@ pub fn ceil(x: f64) -> f64 {
 ///
 /// Finds the nearest integer greater than or equal to `x`.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ceilf128(x: f128) -> f128 {
     super::generic::ceil(x)
 }
diff --git a/libm/src/math/copysign.rs b/libm/src/math/copysign.rs
index d2a86e7fd..591a87a94 100644
--- a/libm/src/math/copysign.rs
+++ b/libm/src/math/copysign.rs
@@ -3,7 +3,7 @@
 /// Constructs a number with the magnitude (absolute value) of its
 /// first argument, `x`, and the sign of its second argument, `y`.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn copysignf16(x: f16, y: f16) -> f16 {
     super::generic::copysign(x, y)
 }
@@ -12,7 +12,7 @@ pub fn copysignf16(x: f16, y: f16) -> f16 {
 ///
 /// Constructs a number with the magnitude (absolute value) of its
 /// first argument, `x`, and the sign of its second argument, `y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn copysignf(x: f32, y: f32) -> f32 {
     super::generic::copysign(x, y)
 }
@@ -21,7 +21,7 @@ pub fn copysignf(x: f32, y: f32) -> f32 {
 ///
 /// Constructs a number with the magnitude (absolute value) of its
 /// first argument, `x`, and the sign of its second argument, `y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn copysign(x: f64, y: f64) -> f64 {
     super::generic::copysign(x, y)
 }
@@ -31,7 +31,7 @@ pub fn copysign(x: f64, y: f64) -> f64 {
 /// Constructs a number with the magnitude (absolute value) of its
 /// first argument, `x`, and the sign of its second argument, `y`.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn copysignf128(x: f128, y: f128) -> f128 {
     super::generic::copysign(x, y)
 }
@@ -59,9 +59,17 @@ mod tests {
 
         // Not required but we expect it
         assert_biteq!(f(F::NAN, F::NAN), F::NAN);
-        assert_biteq!(f(F::NEG_NAN, F::NAN), F::NAN);
+        assert_biteq!(f(F::NAN, F::ONE), F::NAN);
+        assert_biteq!(f(F::NAN, F::NEG_ONE), F::NEG_NAN);
         assert_biteq!(f(F::NAN, F::NEG_NAN), F::NEG_NAN);
+        assert_biteq!(f(F::NEG_NAN, F::NAN), F::NAN);
+        assert_biteq!(f(F::NEG_NAN, F::ONE), F::NAN);
+        assert_biteq!(f(F::NEG_NAN, F::NEG_ONE), F::NEG_NAN);
         assert_biteq!(f(F::NEG_NAN, F::NEG_NAN), F::NEG_NAN);
+        assert_biteq!(f(F::ONE, F::NAN), F::ONE);
+        assert_biteq!(f(F::ONE, F::NEG_NAN), F::NEG_ONE);
+        assert_biteq!(f(F::NEG_ONE, F::NAN), F::ONE);
+        assert_biteq!(f(F::NEG_ONE, F::NEG_NAN), F::NEG_ONE);
     }
 
     #[test]
diff --git a/libm/src/math/copysignf.rs b/libm/src/math/copysignf.rs
deleted file mode 100644
index 8b9bed4c0..000000000
--- a/libm/src/math/copysignf.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-/// Sign of Y, magnitude of X (f32)
-///
-/// Constructs a number with the magnitude (absolute value) of its
-/// first argument, `x`, and the sign of its second argument, `y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn copysignf(x: f32, y: f32) -> f32 {
-    super::generic::copysign(x, y)
-}
diff --git a/libm/src/math/copysignf128.rs b/libm/src/math/copysignf128.rs
deleted file mode 100644
index 7bd81d42b..000000000
--- a/libm/src/math/copysignf128.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-/// Sign of Y, magnitude of X (f128)
-///
-/// Constructs a number with the magnitude (absolute value) of its
-/// first argument, `x`, and the sign of its second argument, `y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn copysignf128(x: f128, y: f128) -> f128 {
-    super::generic::copysign(x, y)
-}
diff --git a/libm/src/math/copysignf16.rs b/libm/src/math/copysignf16.rs
deleted file mode 100644
index 820658686..000000000
--- a/libm/src/math/copysignf16.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-/// Sign of Y, magnitude of X (f16)
-///
-/// Constructs a number with the magnitude (absolute value) of its
-/// first argument, `x`, and the sign of its second argument, `y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn copysignf16(x: f16, y: f16) -> f16 {
-    super::generic::copysign(x, y)
-}
diff --git a/libm/src/math/cos.rs b/libm/src/math/cos.rs
index de99cd4c5..b2f786323 100644
--- a/libm/src/math/cos.rs
+++ b/libm/src/math/cos.rs
@@ -45,7 +45,7 @@ use super::{k_cos, k_sin, rem_pio2};
 /// The cosine of `x` (f64).
 ///
 /// `x` is specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn cos(x: f64) -> f64 {
     let ix = (f64::to_bits(x) >> 32) as u32 & 0x7fffffff;
 
diff --git a/libm/src/math/cosf.rs b/libm/src/math/cosf.rs
index 27c2fc3b9..bf5cb9196 100644
--- a/libm/src/math/cosf.rs
+++ b/libm/src/math/cosf.rs
@@ -27,7 +27,7 @@ const C4_PIO2: f64 = 4. * FRAC_PI_2; /* 0x401921FB, 0x54442D18 */
 /// The cosine of `x` (f32).
 ///
 /// `x` is specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn cosf(x: f32) -> f32 {
     let x64 = x as f64;
 
diff --git a/libm/src/math/cosh.rs b/libm/src/math/cosh.rs
index d2e43fd6c..01081cfc7 100644
--- a/libm/src/math/cosh.rs
+++ b/libm/src/math/cosh.rs
@@ -5,7 +5,7 @@ use super::{exp, expm1, k_expo2};
 /// Computes the hyperbolic cosine of the argument x.
 /// Is defined as `(exp(x) + exp(-x))/2`
 /// Angles are specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn cosh(mut x: f64) -> f64 {
     /* |x| */
     let mut ix = x.to_bits();
diff --git a/libm/src/math/coshf.rs b/libm/src/math/coshf.rs
index 567a24410..dc039a311 100644
--- a/libm/src/math/coshf.rs
+++ b/libm/src/math/coshf.rs
@@ -5,7 +5,7 @@ use super::{expf, expm1f, k_expo2f};
 /// Computes the hyperbolic cosine of the argument x.
 /// Is defined as `(exp(x) + exp(-x))/2`
 /// Angles are specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn coshf(mut x: f32) -> f32 {
     let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
 
diff --git a/libm/src/math/erf.rs b/libm/src/math/erf.rs
index 5d82228a0..6c78440af 100644
--- a/libm/src/math/erf.rs
+++ b/libm/src/math/erf.rs
@@ -219,7 +219,7 @@ fn erfc2(ix: u32, mut x: f64) -> f64 {
 /// Calculates an approximation to the “error function”, which estimates
 /// the probability that an observation will fall within x standard
 /// deviations of the mean (assuming a normal distribution).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn erf(x: f64) -> f64 {
     let r: f64;
     let s: f64;
diff --git a/libm/src/math/erff.rs b/libm/src/math/erff.rs
index fe15f0108..2a7680275 100644
--- a/libm/src/math/erff.rs
+++ b/libm/src/math/erff.rs
@@ -130,7 +130,7 @@ fn erfc2(mut ix: u32, mut x: f32) -> f32 {
 /// Calculates an approximation to the “error function”, which estimates
 /// the probability that an observation will fall within x standard
 /// deviations of the mean (assuming a normal distribution).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn erff(x: f32) -> f32 {
     let r: f32;
     let s: f32;
diff --git a/libm/src/math/exp.rs b/libm/src/math/exp.rs
index 782042b62..78ce5dd13 100644
--- a/libm/src/math/exp.rs
+++ b/libm/src/math/exp.rs
@@ -81,7 +81,7 @@ const P5: f64 = 4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */
 ///
 /// Calculate the exponential of `x`, that is, *e* raised to the power `x`
 /// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp(mut x: f64) -> f64 {
     let x1p1023 = f64::from_bits(0x7fe0000000000000); // 0x1p1023 === 2 ^ 1023
     let x1p_149 = f64::from_bits(0x36a0000000000000); // 0x1p-149 === 2 ^ -149
diff --git a/libm/src/math/exp10.rs b/libm/src/math/exp10.rs
index 7c33c92b6..1f49f5e96 100644
--- a/libm/src/math/exp10.rs
+++ b/libm/src/math/exp10.rs
@@ -7,7 +7,7 @@ const P10: &[f64] = &[
 ];
 
 /// Calculates 10 raised to the power of `x` (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp10(x: f64) -> f64 {
     let (mut y, n) = modf(x);
     let u: u64 = n.to_bits();
diff --git a/libm/src/math/exp10f.rs b/libm/src/math/exp10f.rs
index 303045b33..22a264211 100644
--- a/libm/src/math/exp10f.rs
+++ b/libm/src/math/exp10f.rs
@@ -7,7 +7,7 @@ const P10: &[f32] = &[
 ];
 
 /// Calculates 10 raised to the power of `x` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp10f(x: f32) -> f32 {
     let (mut y, n) = modff(x);
     let u = n.to_bits();
diff --git a/libm/src/math/exp2.rs b/libm/src/math/exp2.rs
index 6e98d066c..6e4cbc29d 100644
--- a/libm/src/math/exp2.rs
+++ b/libm/src/math/exp2.rs
@@ -322,7 +322,7 @@ static TBL: [u64; TBLSIZE * 2] = [
 /// Exponential, base 2 (f64)
 ///
 /// Calculate `2^x`, that is, 2 raised to the power `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp2(mut x: f64) -> f64 {
     let redux = f64::from_bits(0x4338000000000000) / TBLSIZE as f64;
     let p1 = f64::from_bits(0x3fe62e42fefa39ef);
diff --git a/libm/src/math/exp2f.rs b/libm/src/math/exp2f.rs
index f452b6a20..733d2f1a8 100644
--- a/libm/src/math/exp2f.rs
+++ b/libm/src/math/exp2f.rs
@@ -73,7 +73,7 @@ static EXP2FT: [u64; TBLSIZE] = [
 /// Exponential, base 2 (f32)
 ///
 /// Calculate `2^x`, that is, 2 raised to the power `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp2f(mut x: f32) -> f32 {
     let redux = f32::from_bits(0x4b400000) / TBLSIZE as f32;
     let p1 = f32::from_bits(0x3f317218);
diff --git a/libm/src/math/expf.rs b/libm/src/math/expf.rs
index 8dc067ab0..dbbfdbba9 100644
--- a/libm/src/math/expf.rs
+++ b/libm/src/math/expf.rs
@@ -30,7 +30,7 @@ const P2: f32 = -2.7667332906e-3; /* -0xb55215.0p-32 */
 ///
 /// Calculate the exponential of `x`, that is, *e* raised to the power `x`
 /// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn expf(mut x: f32) -> f32 {
     let x1p127 = f32::from_bits(0x7f000000); // 0x1p127f === 2 ^ 127
     let x1p_126 = f32::from_bits(0x800000); // 0x1p-126f === 2 ^ -126  /*original 0x1p-149f    ??????????? */
diff --git a/libm/src/math/expm1.rs b/libm/src/math/expm1.rs
index f25153f32..3714bf3af 100644
--- a/libm/src/math/expm1.rs
+++ b/libm/src/math/expm1.rs
@@ -30,7 +30,7 @@ const Q5: f64 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
 /// system of logarithms, approximately 2.71828).
 /// The result is accurate even for small values of `x`,
 /// where using `exp(x)-1` would lose many significant digits.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn expm1(mut x: f64) -> f64 {
     let hi: f64;
     let lo: f64;
diff --git a/libm/src/math/expm1f.rs b/libm/src/math/expm1f.rs
index 63dc86e37..f77515a4b 100644
--- a/libm/src/math/expm1f.rs
+++ b/libm/src/math/expm1f.rs
@@ -32,7 +32,7 @@ const Q2: f32 = 1.5807170421e-3; /*  0xcf3010.0p-33 */
 /// system of logarithms, approximately 2.71828).
 /// The result is accurate even for small values of `x`,
 /// where using `exp(x)-1` would lose many significant digits.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn expm1f(mut x: f32) -> f32 {
     let x1p127 = f32::from_bits(0x7f000000); // 0x1p127f === 2 ^ 127
 
diff --git a/libm/src/math/expo2.rs b/libm/src/math/expo2.rs
index 82e9b360a..ce90858ec 100644
--- a/libm/src/math/expo2.rs
+++ b/libm/src/math/expo2.rs
@@ -1,7 +1,7 @@
 use super::{combine_words, exp};
 
 /* exp(x)/2 for x >= log(DBL_MAX), slightly better than 0.5*exp(x/2)*exp(x/2) */
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn expo2(x: f64) -> f64 {
     /* k is such that k*ln2 has minimal relative error and x - kln2 > log(DBL_MIN) */
     const K: i32 = 2043;
diff --git a/libm/src/math/fabs.rs b/libm/src/math/fabs.rs
index 0050a309f..7344e21a1 100644
--- a/libm/src/math/fabs.rs
+++ b/libm/src/math/fabs.rs
@@ -3,7 +3,7 @@
 /// Calculates the absolute value (magnitude) of the argument `x`,
 /// by direct manipulation of the bit representation of `x`.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fabsf16(x: f16) -> f16 {
     super::generic::fabs(x)
 }
@@ -12,7 +12,7 @@ pub fn fabsf16(x: f16) -> f16 {
 ///
 /// Calculates the absolute value (magnitude) of the argument `x`,
 /// by direct manipulation of the bit representation of `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fabsf(x: f32) -> f32 {
     select_implementation! {
         name: fabsf,
@@ -27,7 +27,7 @@ pub fn fabsf(x: f32) -> f32 {
 ///
 /// Calculates the absolute value (magnitude) of the argument `x`,
 /// by direct manipulation of the bit representation of `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fabs(x: f64) -> f64 {
     select_implementation! {
         name: fabs,
@@ -43,7 +43,7 @@ pub fn fabs(x: f64) -> f64 {
 /// Calculates the absolute value (magnitude) of the argument `x`,
 /// by direct manipulation of the bit representation of `x`.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fabsf128(x: f128) -> f128 {
     super::generic::fabs(x)
 }
diff --git a/libm/src/math/fabsf.rs b/libm/src/math/fabsf.rs
deleted file mode 100644
index e5820a26c..000000000
--- a/libm/src/math/fabsf.rs
+++ /dev/null
@@ -1,39 +0,0 @@
-/// Absolute value (magnitude) (f32)
-///
-/// Calculates the absolute value (magnitude) of the argument `x`,
-/// by direct manipulation of the bit representation of `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fabsf(x: f32) -> f32 {
-    select_implementation! {
-        name: fabsf,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
-        args: x,
-    }
-
-    super::generic::fabs(x)
-}
-
-// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
-#[cfg(not(target_arch = "powerpc64"))]
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn sanity_check() {
-        assert_eq!(fabsf(-1.0), 1.0);
-        assert_eq!(fabsf(2.8), 2.8);
-    }
-
-    /// The spec: https://en.cppreference.com/w/cpp/numeric/math/fabs
-    #[test]
-    fn spec_tests() {
-        assert!(fabsf(f32::NAN).is_nan());
-        for f in [0.0, -0.0].iter().copied() {
-            assert_eq!(fabsf(f), 0.0);
-        }
-        for f in [f32::INFINITY, f32::NEG_INFINITY].iter().copied() {
-            assert_eq!(fabsf(f), f32::INFINITY);
-        }
-    }
-}
diff --git a/libm/src/math/fabsf128.rs b/libm/src/math/fabsf128.rs
deleted file mode 100644
index 46429ca49..000000000
--- a/libm/src/math/fabsf128.rs
+++ /dev/null
@@ -1,31 +0,0 @@
-/// Absolute value (magnitude) (f128)
-///
-/// Calculates the absolute value (magnitude) of the argument `x`,
-/// by direct manipulation of the bit representation of `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fabsf128(x: f128) -> f128 {
-    super::generic::fabs(x)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn sanity_check() {
-        assert_eq!(fabsf128(-1.0), 1.0);
-        assert_eq!(fabsf128(2.8), 2.8);
-    }
-
-    /// The spec: https://en.cppreference.com/w/cpp/numeric/math/fabs
-    #[test]
-    fn spec_tests() {
-        assert!(fabsf128(f128::NAN).is_nan());
-        for f in [0.0, -0.0].iter().copied() {
-            assert_eq!(fabsf128(f), 0.0);
-        }
-        for f in [f128::INFINITY, f128::NEG_INFINITY].iter().copied() {
-            assert_eq!(fabsf128(f), f128::INFINITY);
-        }
-    }
-}
diff --git a/libm/src/math/fabsf16.rs b/libm/src/math/fabsf16.rs
deleted file mode 100644
index eee42ac6a..000000000
--- a/libm/src/math/fabsf16.rs
+++ /dev/null
@@ -1,31 +0,0 @@
-/// Absolute value (magnitude) (f16)
-///
-/// Calculates the absolute value (magnitude) of the argument `x`,
-/// by direct manipulation of the bit representation of `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fabsf16(x: f16) -> f16 {
-    super::generic::fabs(x)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn sanity_check() {
-        assert_eq!(fabsf16(-1.0), 1.0);
-        assert_eq!(fabsf16(2.8), 2.8);
-    }
-
-    /// The spec: https://en.cppreference.com/w/cpp/numeric/math/fabs
-    #[test]
-    fn spec_tests() {
-        assert!(fabsf16(f16::NAN).is_nan());
-        for f in [0.0, -0.0].iter().copied() {
-            assert_eq!(fabsf16(f), 0.0);
-        }
-        for f in [f16::INFINITY, f16::NEG_INFINITY].iter().copied() {
-            assert_eq!(fabsf16(f), f16::INFINITY);
-        }
-    }
-}
diff --git a/libm/src/math/fdim.rs b/libm/src/math/fdim.rs
index 082c5478b..dac409e86 100644
--- a/libm/src/math/fdim.rs
+++ b/libm/src/math/fdim.rs
@@ -7,7 +7,7 @@
 ///
 /// A range error may occur.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fdimf16(x: f16, y: f16) -> f16 {
     super::generic::fdim(x, y)
 }
@@ -20,7 +20,7 @@ pub fn fdimf16(x: f16, y: f16) -> f16 {
 /// * NAN   if either argument is NAN.
 ///
 /// A range error may occur.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fdimf(x: f32, y: f32) -> f32 {
     super::generic::fdim(x, y)
 }
@@ -33,7 +33,7 @@ pub fn fdimf(x: f32, y: f32) -> f32 {
 /// * NAN   if either argument is NAN.
 ///
 /// A range error may occur.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fdim(x: f64, y: f64) -> f64 {
     super::generic::fdim(x, y)
 }
@@ -47,7 +47,7 @@ pub fn fdim(x: f64, y: f64) -> f64 {
 ///
 /// A range error may occur.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fdimf128(x: f128, y: f128) -> f128 {
     super::generic::fdim(x, y)
 }
diff --git a/libm/src/math/fdimf.rs b/libm/src/math/fdimf.rs
deleted file mode 100644
index 367ef517c..000000000
--- a/libm/src/math/fdimf.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-/// Positive difference (f32)
-///
-/// Determines the positive difference between arguments, returning:
-/// * x - y if x > y, or
-/// * +0    if x <= y, or
-/// * NAN   if either argument is NAN.
-///
-/// A range error may occur.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fdimf(x: f32, y: f32) -> f32 {
-    super::generic::fdim(x, y)
-}
diff --git a/libm/src/math/fdimf128.rs b/libm/src/math/fdimf128.rs
deleted file mode 100644
index 6f3d1d0ff..000000000
--- a/libm/src/math/fdimf128.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-/// Positive difference (f128)
-///
-/// Determines the positive difference between arguments, returning:
-/// * x - y if x > y, or
-/// * +0    if x <= y, or
-/// * NAN   if either argument is NAN.
-///
-/// A range error may occur.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fdimf128(x: f128, y: f128) -> f128 {
-    super::generic::fdim(x, y)
-}
diff --git a/libm/src/math/fdimf16.rs b/libm/src/math/fdimf16.rs
deleted file mode 100644
index 37bd68858..000000000
--- a/libm/src/math/fdimf16.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-/// Positive difference (f16)
-///
-/// Determines the positive difference between arguments, returning:
-/// * x - y if x > y, or
-/// * +0    if x <= y, or
-/// * NAN   if either argument is NAN.
-///
-/// A range error may occur.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fdimf16(x: f16, y: f16) -> f16 {
-    super::generic::fdim(x, y)
-}
diff --git a/libm/src/math/floor.rs b/libm/src/math/floor.rs
index 3c5eab101..7241c427f 100644
--- a/libm/src/math/floor.rs
+++ b/libm/src/math/floor.rs
@@ -2,7 +2,7 @@
 ///
 /// Finds the nearest integer less than or equal to `x`.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn floorf16(x: f16) -> f16 {
     return super::generic::floor(x);
 }
@@ -10,7 +10,7 @@ pub fn floorf16(x: f16) -> f16 {
 /// Floor (f64)
 ///
 /// Finds the nearest integer less than or equal to `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn floor(x: f64) -> f64 {
     select_implementation! {
         name: floor,
@@ -25,7 +25,7 @@ pub fn floor(x: f64) -> f64 {
 /// Floor (f32)
 ///
 /// Finds the nearest integer less than or equal to `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn floorf(x: f32) -> f32 {
     select_implementation! {
         name: floorf,
@@ -40,7 +40,7 @@ pub fn floorf(x: f32) -> f32 {
 ///
 /// Finds the nearest integer less than or equal to `x`.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn floorf128(x: f128) -> f128 {
     return super::generic::floor(x);
 }
diff --git a/libm/src/math/floorf.rs b/libm/src/math/floorf.rs
deleted file mode 100644
index 16957b7f3..000000000
--- a/libm/src/math/floorf.rs
+++ /dev/null
@@ -1,13 +0,0 @@
-/// Floor (f32)
-///
-/// Finds the nearest integer less than or equal to `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn floorf(x: f32) -> f32 {
-    select_implementation! {
-        name: floorf,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
-        args: x,
-    }
-
-    return super::generic::floor(x);
-}
diff --git a/libm/src/math/floorf128.rs b/libm/src/math/floorf128.rs
deleted file mode 100644
index 9a9fe4151..000000000
--- a/libm/src/math/floorf128.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-/// Floor (f128)
-///
-/// Finds the nearest integer less than or equal to `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn floorf128(x: f128) -> f128 {
-    return super::generic::floor(x);
-}
diff --git a/libm/src/math/floorf16.rs b/libm/src/math/floorf16.rs
deleted file mode 100644
index f9b868e04..000000000
--- a/libm/src/math/floorf16.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-/// Floor (f16)
-///
-/// Finds the nearest integer less than or equal to `x`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn floorf16(x: f16) -> f16 {
-    return super::generic::floor(x);
-}
diff --git a/libm/src/math/fma.rs b/libm/src/math/fma.rs
index 5bf473cfe..70e6de768 100644
--- a/libm/src/math/fma.rs
+++ b/libm/src/math/fma.rs
@@ -7,7 +7,7 @@ use crate::support::Round;
 // Placeholder so we can have `fmaf16` in the `Float` trait.
 #[allow(unused)]
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 {
     unimplemented!()
 }
@@ -15,7 +15,7 @@ pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 {
 /// Floating multiply add (f32)
 ///
 /// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
     select_implementation! {
         name: fmaf,
@@ -32,7 +32,7 @@ pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
 /// Fused multiply add (f64)
 ///
 /// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fma(x: f64, y: f64, z: f64) -> f64 {
     select_implementation! {
         name: fma,
@@ -50,7 +50,7 @@ pub fn fma(x: f64, y: f64, z: f64) -> f64 {
 ///
 /// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 {
     generic::fma_round(x, y, z, Round::Nearest).val
 }
diff --git a/libm/src/math/fmin_fmax.rs b/libm/src/math/fmin_fmax.rs
index 2947b783e..c4c1b0435 100644
--- a/libm/src/math/fmin_fmax.rs
+++ b/libm/src/math/fmin_fmax.rs
@@ -3,7 +3,7 @@
 /// This coincides with IEEE 754-2011 `minNum`. The result disregards signed zero (meaning if
 /// the inputs are -0.0 and +0.0, either may be returned).
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminf16(x: f16, y: f16) -> f16 {
     super::generic::fmin(x, y)
 }
@@ -12,7 +12,7 @@ pub fn fminf16(x: f16, y: f16) -> f16 {
 ///
 /// This coincides with IEEE 754-2011 `minNum`. The result disregards signed zero (meaning if
 /// the inputs are -0.0 and +0.0, either may be returned).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminf(x: f32, y: f32) -> f32 {
     super::generic::fmin(x, y)
 }
@@ -21,7 +21,7 @@ pub fn fminf(x: f32, y: f32) -> f32 {
 ///
 /// This coincides with IEEE 754-2011 `minNum`. The result disregards signed zero (meaning if
 /// the inputs are -0.0 and +0.0, either may be returned).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmin(x: f64, y: f64) -> f64 {
     super::generic::fmin(x, y)
 }
@@ -31,7 +31,7 @@ pub fn fmin(x: f64, y: f64) -> f64 {
 /// This coincides with IEEE 754-2011 `minNum`. The result disregards signed zero (meaning if
 /// the inputs are -0.0 and +0.0, either may be returned).
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminf128(x: f128, y: f128) -> f128 {
     super::generic::fmin(x, y)
 }
@@ -41,7 +41,7 @@ pub fn fminf128(x: f128, y: f128) -> f128 {
 /// This coincides with IEEE 754-2011 `maxNum`. The result disregards signed zero (meaning if
 /// the inputs are -0.0 and +0.0, either may be returned).
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaxf16(x: f16, y: f16) -> f16 {
     super::generic::fmax(x, y)
 }
@@ -50,7 +50,7 @@ pub fn fmaxf16(x: f16, y: f16) -> f16 {
 ///
 /// This coincides with IEEE 754-2011 `maxNum`. The result disregards signed zero (meaning if
 /// the inputs are -0.0 and +0.0, either may be returned).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaxf(x: f32, y: f32) -> f32 {
     super::generic::fmax(x, y)
 }
@@ -59,7 +59,7 @@ pub fn fmaxf(x: f32, y: f32) -> f32 {
 ///
 /// This coincides with IEEE 754-2011 `maxNum`. The result disregards signed zero (meaning if
 /// the inputs are -0.0 and +0.0, either may be returned).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmax(x: f64, y: f64) -> f64 {
     super::generic::fmax(x, y)
 }
@@ -69,7 +69,7 @@ pub fn fmax(x: f64, y: f64) -> f64 {
 /// This coincides with IEEE 754-2011 `maxNum`. The result disregards signed zero (meaning if
 /// the inputs are -0.0 and +0.0, either may be returned).
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaxf128(x: f128, y: f128) -> f128 {
     super::generic::fmax(x, y)
 }
@@ -82,22 +82,77 @@ mod tests {
     fn fmin_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
         let cases = [
             (F::ZERO, F::ZERO, F::ZERO),
-            (F::ONE, F::ONE, F::ONE),
             (F::ZERO, F::ONE, F::ZERO),
-            (F::ONE, F::ZERO, F::ZERO),
             (F::ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::ZERO, F::INFINITY, F::ZERO),
+            (F::ZERO, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::ZERO, F::NAN, F::ZERO),
+            (F::ZERO, F::NEG_NAN, F::ZERO),
+            (F::NEG_ZERO, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::ONE, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ZERO, F::INFINITY, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_ZERO, F::NAN, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_NAN, F::NEG_ZERO),
+            (F::ONE, F::ZERO, F::ZERO),
+            (F::ONE, F::NEG_ZERO, F::NEG_ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::ONE, F::INFINITY, F::ONE),
+            (F::ONE, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::ONE, F::NAN, F::ONE),
+            (F::ONE, F::NEG_NAN, F::ONE),
             (F::NEG_ONE, F::ZERO, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_ZERO, F::NEG_ONE),
+            (F::NEG_ONE, F::ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::INFINITY, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_ONE, F::NAN, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_NAN, F::NEG_ONE),
             (F::INFINITY, F::ZERO, F::ZERO),
+            (F::INFINITY, F::NEG_ZERO, F::NEG_ZERO),
+            (F::INFINITY, F::ONE, F::ONE),
+            (F::INFINITY, F::NEG_ONE, F::NEG_ONE),
+            (F::INFINITY, F::INFINITY, F::INFINITY),
+            (F::INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::INFINITY, F::NAN, F::INFINITY),
+            (F::INFINITY, F::NEG_NAN, F::INFINITY),
             (F::NEG_INFINITY, F::ZERO, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_ZERO, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::ONE, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_ONE, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NAN, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_NAN, F::NEG_INFINITY),
             (F::NAN, F::ZERO, F::ZERO),
-            (F::ZERO, F::NAN, F::ZERO),
+            (F::NAN, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NAN, F::ONE, F::ONE),
+            (F::NAN, F::NEG_ONE, F::NEG_ONE),
+            (F::NAN, F::INFINITY, F::INFINITY),
+            (F::NAN, F::NEG_INFINITY, F::NEG_INFINITY),
             (F::NAN, F::NAN, F::NAN),
+            (F::NEG_NAN, F::ZERO, F::ZERO),
+            (F::NEG_NAN, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_NAN, F::ONE, F::ONE),
+            (F::NEG_NAN, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_NAN, F::INFINITY, F::INFINITY),
+            (F::NEG_NAN, F::NEG_INFINITY, F::NEG_INFINITY),
         ];
 
         for (x, y, res) in cases {
             let val = f(x, y);
             assert_biteq!(val, res, "fmin({}, {})", Hexf(x), Hexf(y));
         }
+
+        // Ordering between zeros and NaNs does not matter
+        assert_eq!(f(F::ZERO, F::NEG_ZERO), F::ZERO);
+        assert_eq!(f(F::NEG_ZERO, F::ZERO), F::ZERO);
+        assert!(f(F::NAN, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_NAN).is_nan());
     }
 
     #[test]
@@ -125,22 +180,77 @@ mod tests {
     fn fmax_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
         let cases = [
             (F::ZERO, F::ZERO, F::ZERO),
-            (F::ONE, F::ONE, F::ONE),
             (F::ZERO, F::ONE, F::ONE),
-            (F::ONE, F::ZERO, F::ONE),
             (F::ZERO, F::NEG_ONE, F::ZERO),
+            (F::ZERO, F::INFINITY, F::INFINITY),
+            (F::ZERO, F::NEG_INFINITY, F::ZERO),
+            (F::ZERO, F::NAN, F::ZERO),
+            (F::ZERO, F::NEG_NAN, F::ZERO),
+            (F::NEG_ZERO, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::ONE, F::ONE),
+            (F::NEG_ZERO, F::NEG_ONE, F::NEG_ZERO),
+            (F::NEG_ZERO, F::INFINITY, F::INFINITY),
+            (F::NEG_ZERO, F::NEG_INFINITY, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NAN, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_NAN, F::NEG_ZERO),
+            (F::ONE, F::ZERO, F::ONE),
+            (F::ONE, F::NEG_ZERO, F::ONE),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ONE, F::NEG_ONE, F::ONE),
+            (F::ONE, F::INFINITY, F::INFINITY),
+            (F::ONE, F::NEG_INFINITY, F::ONE),
+            (F::ONE, F::NAN, F::ONE),
+            (F::ONE, F::NEG_NAN, F::ONE),
             (F::NEG_ONE, F::ZERO, F::ZERO),
+            (F::NEG_ONE, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ONE, F::ONE, F::ONE),
+            (F::NEG_ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::INFINITY, F::INFINITY),
+            (F::NEG_ONE, F::NEG_INFINITY, F::NEG_ONE),
+            (F::NEG_ONE, F::NAN, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_NAN, F::NEG_ONE),
             (F::INFINITY, F::ZERO, F::INFINITY),
+            (F::INFINITY, F::NEG_ZERO, F::INFINITY),
+            (F::INFINITY, F::ONE, F::INFINITY),
+            (F::INFINITY, F::NEG_ONE, F::INFINITY),
+            (F::INFINITY, F::INFINITY, F::INFINITY),
+            (F::INFINITY, F::NEG_INFINITY, F::INFINITY),
+            (F::INFINITY, F::NAN, F::INFINITY),
+            (F::INFINITY, F::NEG_NAN, F::INFINITY),
             (F::NEG_INFINITY, F::ZERO, F::ZERO),
+            (F::NEG_INFINITY, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_INFINITY, F::ONE, F::ONE),
+            (F::NEG_INFINITY, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_INFINITY, F::INFINITY, F::INFINITY),
+            (F::NEG_INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NAN, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_NAN, F::NEG_INFINITY),
             (F::NAN, F::ZERO, F::ZERO),
-            (F::ZERO, F::NAN, F::ZERO),
+            (F::NAN, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NAN, F::ONE, F::ONE),
+            (F::NAN, F::NEG_ONE, F::NEG_ONE),
+            (F::NAN, F::INFINITY, F::INFINITY),
+            (F::NAN, F::NEG_INFINITY, F::NEG_INFINITY),
             (F::NAN, F::NAN, F::NAN),
+            (F::NEG_NAN, F::ZERO, F::ZERO),
+            (F::NEG_NAN, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_NAN, F::ONE, F::ONE),
+            (F::NEG_NAN, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_NAN, F::INFINITY, F::INFINITY),
+            (F::NEG_NAN, F::NEG_INFINITY, F::NEG_INFINITY),
         ];
 
         for (x, y, res) in cases {
             let val = f(x, y);
             assert_biteq!(val, res, "fmax({}, {})", Hexf(x), Hexf(y));
         }
+
+        // Ordering between zeros and NaNs does not matter
+        assert_eq!(f(F::ZERO, F::NEG_ZERO), F::ZERO);
+        assert_eq!(f(F::NEG_ZERO, F::ZERO), F::ZERO);
+        assert!(f(F::NAN, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_NAN).is_nan());
     }
 
     #[test]
diff --git a/libm/src/math/fminimum_fmaximum.rs b/libm/src/math/fminimum_fmaximum.rs
index b7999e273..a3c9c9c39 100644
--- a/libm/src/math/fminimum_fmaximum.rs
+++ b/libm/src/math/fminimum_fmaximum.rs
@@ -2,7 +2,7 @@
 ///
 /// This coincides with IEEE 754-2019 `minimum`. The result orders -0.0 < 0.0.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminimumf16(x: f16, y: f16) -> f16 {
     super::generic::fminimum(x, y)
 }
@@ -10,7 +10,7 @@ pub fn fminimumf16(x: f16, y: f16) -> f16 {
 /// Return the lesser of two arguments or, if either argument is NaN, the other argument.
 ///
 /// This coincides with IEEE 754-2019 `minimum`. The result orders -0.0 < 0.0.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminimum(x: f64, y: f64) -> f64 {
     super::generic::fminimum(x, y)
 }
@@ -18,7 +18,7 @@ pub fn fminimum(x: f64, y: f64) -> f64 {
 /// Return the lesser of two arguments or, if either argument is NaN, the other argument.
 ///
 /// This coincides with IEEE 754-2019 `minimum`. The result orders -0.0 < 0.0.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminimumf(x: f32, y: f32) -> f32 {
     super::generic::fminimum(x, y)
 }
@@ -27,7 +27,7 @@ pub fn fminimumf(x: f32, y: f32) -> f32 {
 ///
 /// This coincides with IEEE 754-2019 `minimum`. The result orders -0.0 < 0.0.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminimumf128(x: f128, y: f128) -> f128 {
     super::generic::fminimum(x, y)
 }
@@ -36,7 +36,7 @@ pub fn fminimumf128(x: f128, y: f128) -> f128 {
 ///
 /// This coincides with IEEE 754-2019 `maximum`. The result orders -0.0 < 0.0.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaximumf16(x: f16, y: f16) -> f16 {
     super::generic::fmaximum(x, y)
 }
@@ -44,7 +44,7 @@ pub fn fmaximumf16(x: f16, y: f16) -> f16 {
 /// Return the greater of two arguments or, if either argument is NaN, the other argument.
 ///
 /// This coincides with IEEE 754-2019 `maximum`. The result orders -0.0 < 0.0.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaximumf(x: f32, y: f32) -> f32 {
     super::generic::fmaximum(x, y)
 }
@@ -52,7 +52,7 @@ pub fn fmaximumf(x: f32, y: f32) -> f32 {
 /// Return the greater of two arguments or, if either argument is NaN, the other argument.
 ///
 /// This coincides with IEEE 754-2019 `maximum`. The result orders -0.0 < 0.0.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaximum(x: f64, y: f64) -> f64 {
     super::generic::fmaximum(x, y)
 }
@@ -61,7 +61,7 @@ pub fn fmaximum(x: f64, y: f64) -> f64 {
 ///
 /// This coincides with IEEE 754-2019 `maximum`. The result orders -0.0 < 0.0.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaximumf128(x: f128, y: f128) -> f128 {
     super::generic::fmaximum(x, y)
 }
@@ -74,24 +74,77 @@ mod tests {
     fn fminimum_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
         let cases = [
             (F::ZERO, F::ZERO, F::ZERO),
-            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::NEG_ZERO, F::NEG_ZERO),
             (F::ZERO, F::ONE, F::ZERO),
-            (F::ONE, F::ZERO, F::ZERO),
             (F::ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::ZERO, F::INFINITY, F::ZERO),
+            (F::ZERO, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::ZERO, F::NAN, F::NAN),
+            (F::NEG_ZERO, F::ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::ONE, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ZERO, F::INFINITY, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_ZERO, F::NAN, F::NAN),
+            (F::ONE, F::ZERO, F::ZERO),
+            (F::ONE, F::NEG_ZERO, F::NEG_ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::ONE, F::INFINITY, F::ONE),
+            (F::ONE, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::ONE, F::NAN, F::NAN),
             (F::NEG_ONE, F::ZERO, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_ZERO, F::NEG_ONE),
+            (F::NEG_ONE, F::ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::INFINITY, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_ONE, F::NAN, F::NAN),
             (F::INFINITY, F::ZERO, F::ZERO),
+            (F::INFINITY, F::NEG_ZERO, F::NEG_ZERO),
+            (F::INFINITY, F::ONE, F::ONE),
+            (F::INFINITY, F::NEG_ONE, F::NEG_ONE),
+            (F::INFINITY, F::INFINITY, F::INFINITY),
+            (F::INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::INFINITY, F::NAN, F::NAN),
             (F::NEG_INFINITY, F::ZERO, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_ZERO, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::ONE, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_ONE, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NAN, F::NAN),
             (F::NAN, F::ZERO, F::NAN),
-            (F::ZERO, F::NAN, F::NAN),
+            (F::NAN, F::NEG_ZERO, F::NAN),
+            (F::NAN, F::ONE, F::NAN),
+            (F::NAN, F::NEG_ONE, F::NAN),
+            (F::NAN, F::INFINITY, F::NAN),
+            (F::NAN, F::NEG_INFINITY, F::NAN),
             (F::NAN, F::NAN, F::NAN),
-            (F::ZERO, F::NEG_ZERO, F::NEG_ZERO),
-            (F::NEG_ZERO, F::ZERO, F::NEG_ZERO),
         ];
 
         for (x, y, res) in cases {
             let val = f(x, y);
             assert_biteq!(val, res, "fminimum({}, {})", Hexf(x), Hexf(y));
         }
+
+        // Ordering between NaNs does not matter
+        assert!(f(F::NAN, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NAN).is_nan());
+        assert!(f(F::ZERO, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_ZERO, F::NEG_NAN).is_nan());
+        assert!(f(F::ONE, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_ONE, F::NEG_NAN).is_nan());
+        assert!(f(F::INFINITY, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_INFINITY, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::ZERO).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_ZERO).is_nan());
+        assert!(f(F::NEG_NAN, F::ONE).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_ONE).is_nan());
+        assert!(f(F::NEG_NAN, F::INFINITY).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_INFINITY).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_NAN).is_nan());
     }
 
     #[test]
@@ -119,24 +172,77 @@ mod tests {
     fn fmaximum_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
         let cases = [
             (F::ZERO, F::ZERO, F::ZERO),
-            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::NEG_ZERO, F::ZERO),
             (F::ZERO, F::ONE, F::ONE),
-            (F::ONE, F::ZERO, F::ONE),
             (F::ZERO, F::NEG_ONE, F::ZERO),
+            (F::ZERO, F::INFINITY, F::INFINITY),
+            (F::ZERO, F::NEG_INFINITY, F::ZERO),
+            (F::ZERO, F::NAN, F::NAN),
+            (F::NEG_ZERO, F::ZERO, F::ZERO),
+            (F::NEG_ZERO, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::ONE, F::ONE),
+            (F::NEG_ZERO, F::NEG_ONE, F::NEG_ZERO),
+            (F::NEG_ZERO, F::INFINITY, F::INFINITY),
+            (F::NEG_ZERO, F::NEG_INFINITY, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NAN, F::NAN),
+            (F::ONE, F::ZERO, F::ONE),
+            (F::ONE, F::NEG_ZERO, F::ONE),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ONE, F::NEG_ONE, F::ONE),
+            (F::ONE, F::INFINITY, F::INFINITY),
+            (F::ONE, F::NEG_INFINITY, F::ONE),
+            (F::ONE, F::NAN, F::NAN),
             (F::NEG_ONE, F::ZERO, F::ZERO),
+            (F::NEG_ONE, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ONE, F::ONE, F::ONE),
+            (F::NEG_ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::INFINITY, F::INFINITY),
+            (F::NEG_ONE, F::NEG_INFINITY, F::NEG_ONE),
+            (F::NEG_ONE, F::NAN, F::NAN),
             (F::INFINITY, F::ZERO, F::INFINITY),
+            (F::INFINITY, F::NEG_ZERO, F::INFINITY),
+            (F::INFINITY, F::ONE, F::INFINITY),
+            (F::INFINITY, F::NEG_ONE, F::INFINITY),
+            (F::INFINITY, F::INFINITY, F::INFINITY),
+            (F::INFINITY, F::NEG_INFINITY, F::INFINITY),
+            (F::INFINITY, F::NAN, F::NAN),
             (F::NEG_INFINITY, F::ZERO, F::ZERO),
+            (F::NEG_INFINITY, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_INFINITY, F::ONE, F::ONE),
+            (F::NEG_INFINITY, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_INFINITY, F::INFINITY, F::INFINITY),
+            (F::NEG_INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NAN, F::NAN),
             (F::NAN, F::ZERO, F::NAN),
-            (F::ZERO, F::NAN, F::NAN),
+            (F::NAN, F::NEG_ZERO, F::NAN),
+            (F::NAN, F::ONE, F::NAN),
+            (F::NAN, F::NEG_ONE, F::NAN),
+            (F::NAN, F::INFINITY, F::NAN),
+            (F::NAN, F::NEG_INFINITY, F::NAN),
             (F::NAN, F::NAN, F::NAN),
-            (F::ZERO, F::NEG_ZERO, F::ZERO),
-            (F::NEG_ZERO, F::ZERO, F::ZERO),
         ];
 
         for (x, y, res) in cases {
             let val = f(x, y);
             assert_biteq!(val, res, "fmaximum({}, {})", Hexf(x), Hexf(y));
         }
+
+        // Ordering between NaNs does not matter
+        assert!(f(F::NAN, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NAN).is_nan());
+        assert!(f(F::ZERO, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_ZERO, F::NEG_NAN).is_nan());
+        assert!(f(F::ONE, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_ONE, F::NEG_NAN).is_nan());
+        assert!(f(F::INFINITY, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_INFINITY, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::ZERO).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_ZERO).is_nan());
+        assert!(f(F::NEG_NAN, F::ONE).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_ONE).is_nan());
+        assert!(f(F::NEG_NAN, F::INFINITY).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_INFINITY).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_NAN).is_nan());
     }
 
     #[test]
diff --git a/libm/src/math/fminimum_fmaximum_num.rs b/libm/src/math/fminimum_fmaximum_num.rs
index 180d21f72..612cefe75 100644
--- a/libm/src/math/fminimum_fmaximum_num.rs
+++ b/libm/src/math/fminimum_fmaximum_num.rs
@@ -2,7 +2,7 @@
 ///
 /// This coincides with IEEE 754-2019 `minimumNumber`. The result orders -0.0 < 0.0.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminimum_numf16(x: f16, y: f16) -> f16 {
     super::generic::fminimum_num(x, y)
 }
@@ -10,7 +10,7 @@ pub fn fminimum_numf16(x: f16, y: f16) -> f16 {
 /// Return the lesser of two arguments or, if either argument is NaN, NaN.
 ///
 /// This coincides with IEEE 754-2019 `minimumNumber`. The result orders -0.0 < 0.0.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminimum_numf(x: f32, y: f32) -> f32 {
     super::generic::fminimum_num(x, y)
 }
@@ -18,7 +18,7 @@ pub fn fminimum_numf(x: f32, y: f32) -> f32 {
 /// Return the lesser of two arguments or, if either argument is NaN, NaN.
 ///
 /// This coincides with IEEE 754-2019 `minimumNumber`. The result orders -0.0 < 0.0.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminimum_num(x: f64, y: f64) -> f64 {
     super::generic::fminimum_num(x, y)
 }
@@ -27,7 +27,7 @@ pub fn fminimum_num(x: f64, y: f64) -> f64 {
 ///
 /// This coincides with IEEE 754-2019 `minimumNumber`. The result orders -0.0 < 0.0.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fminimum_numf128(x: f128, y: f128) -> f128 {
     super::generic::fminimum_num(x, y)
 }
@@ -36,7 +36,7 @@ pub fn fminimum_numf128(x: f128, y: f128) -> f128 {
 ///
 /// This coincides with IEEE 754-2019 `maximumNumber`. The result orders -0.0 < 0.0.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaximum_numf16(x: f16, y: f16) -> f16 {
     super::generic::fmaximum_num(x, y)
 }
@@ -44,7 +44,7 @@ pub fn fmaximum_numf16(x: f16, y: f16) -> f16 {
 /// Return the greater of two arguments or, if either argument is NaN, NaN.
 ///
 /// This coincides with IEEE 754-2019 `maximumNumber`. The result orders -0.0 < 0.0.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaximum_numf(x: f32, y: f32) -> f32 {
     super::generic::fmaximum_num(x, y)
 }
@@ -52,7 +52,7 @@ pub fn fmaximum_numf(x: f32, y: f32) -> f32 {
 /// Return the greater of two arguments or, if either argument is NaN, NaN.
 ///
 /// This coincides with IEEE 754-2019 `maximumNumber`. The result orders -0.0 < 0.0.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaximum_num(x: f64, y: f64) -> f64 {
     super::generic::fmaximum_num(x, y)
 }
@@ -61,7 +61,7 @@ pub fn fmaximum_num(x: f64, y: f64) -> f64 {
 ///
 /// This coincides with IEEE 754-2019 `maximumNumber`. The result orders -0.0 < 0.0.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmaximum_numf128(x: f128, y: f128) -> f128 {
     super::generic::fmaximum_num(x, y)
 }
@@ -74,24 +74,77 @@ mod tests {
     fn fminimum_num_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
         let cases = [
             (F::ZERO, F::ZERO, F::ZERO),
-            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::NEG_ZERO, F::NEG_ZERO),
             (F::ZERO, F::ONE, F::ZERO),
-            (F::ONE, F::ZERO, F::ZERO),
             (F::ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::ZERO, F::INFINITY, F::ZERO),
+            (F::ZERO, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::ZERO, F::NAN, F::ZERO),
+            (F::ZERO, F::NEG_NAN, F::ZERO),
+            (F::NEG_ZERO, F::ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::ONE, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ZERO, F::INFINITY, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_ZERO, F::NAN, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_NAN, F::NEG_ZERO),
+            (F::ONE, F::ZERO, F::ZERO),
+            (F::ONE, F::NEG_ZERO, F::NEG_ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::ONE, F::INFINITY, F::ONE),
+            (F::ONE, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::ONE, F::NAN, F::ONE),
+            (F::ONE, F::NEG_NAN, F::ONE),
             (F::NEG_ONE, F::ZERO, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_ZERO, F::NEG_ONE),
+            (F::NEG_ONE, F::ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::INFINITY, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_ONE, F::NAN, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_NAN, F::NEG_ONE),
             (F::INFINITY, F::ZERO, F::ZERO),
+            (F::INFINITY, F::NEG_ZERO, F::NEG_ZERO),
+            (F::INFINITY, F::ONE, F::ONE),
+            (F::INFINITY, F::NEG_ONE, F::NEG_ONE),
+            (F::INFINITY, F::INFINITY, F::INFINITY),
+            (F::INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::INFINITY, F::NAN, F::INFINITY),
+            (F::INFINITY, F::NEG_NAN, F::INFINITY),
             (F::NEG_INFINITY, F::ZERO, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_ZERO, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::ONE, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_ONE, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NAN, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_NAN, F::NEG_INFINITY),
             (F::NAN, F::ZERO, F::ZERO),
-            (F::ZERO, F::NAN, F::ZERO),
+            (F::NAN, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NAN, F::ONE, F::ONE),
+            (F::NAN, F::NEG_ONE, F::NEG_ONE),
+            (F::NAN, F::INFINITY, F::INFINITY),
+            (F::NAN, F::NEG_INFINITY, F::NEG_INFINITY),
             (F::NAN, F::NAN, F::NAN),
-            (F::ZERO, F::NEG_ZERO, F::NEG_ZERO),
-            (F::NEG_ZERO, F::ZERO, F::NEG_ZERO),
+            (F::NEG_NAN, F::ZERO, F::ZERO),
+            (F::NEG_NAN, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_NAN, F::ONE, F::ONE),
+            (F::NEG_NAN, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_NAN, F::INFINITY, F::INFINITY),
+            (F::NEG_NAN, F::NEG_INFINITY, F::NEG_INFINITY),
         ];
 
-        for (x, y, res) in cases {
-            let val = f(x, y);
-            assert_biteq!(val, res, "fminimum_num({}, {})", Hexf(x), Hexf(y));
+        for (x, y, expected) in cases {
+            let actual = f(x, y);
+            assert_biteq!(actual, expected, "fminimum_num({}, {})", Hexf(x), Hexf(y));
         }
+
+        // Which NaN is returned does not matter when both operands are NaN
+        assert!(f(F::NAN, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_NAN).is_nan());
     }
 
     #[test]
@@ -119,24 +172,77 @@ mod tests {
     fn fmaximum_num_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
         let cases = [
             (F::ZERO, F::ZERO, F::ZERO),
-            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::NEG_ZERO, F::ZERO),
             (F::ZERO, F::ONE, F::ONE),
-            (F::ONE, F::ZERO, F::ONE),
             (F::ZERO, F::NEG_ONE, F::ZERO),
+            (F::ZERO, F::INFINITY, F::INFINITY),
+            (F::ZERO, F::NEG_INFINITY, F::ZERO),
+            (F::ZERO, F::NAN, F::ZERO),
+            (F::ZERO, F::NEG_NAN, F::ZERO),
+            (F::NEG_ZERO, F::ZERO, F::ZERO),
+            (F::NEG_ZERO, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::ONE, F::ONE),
+            (F::NEG_ZERO, F::NEG_ONE, F::NEG_ZERO),
+            (F::NEG_ZERO, F::INFINITY, F::INFINITY),
+            (F::NEG_ZERO, F::NEG_INFINITY, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NAN, F::NEG_ZERO),
+            (F::NEG_ZERO, F::NEG_NAN, F::NEG_ZERO),
+            (F::ONE, F::ZERO, F::ONE),
+            (F::ONE, F::NEG_ZERO, F::ONE),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ONE, F::NEG_ONE, F::ONE),
+            (F::ONE, F::INFINITY, F::INFINITY),
+            (F::ONE, F::NEG_INFINITY, F::ONE),
+            (F::ONE, F::NAN, F::ONE),
+            (F::ONE, F::NEG_NAN, F::ONE),
             (F::NEG_ONE, F::ZERO, F::ZERO),
+            (F::NEG_ONE, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ONE, F::ONE, F::ONE),
+            (F::NEG_ONE, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::INFINITY, F::INFINITY),
+            (F::NEG_ONE, F::NEG_INFINITY, F::NEG_ONE),
+            (F::NEG_ONE, F::NAN, F::NEG_ONE),
+            (F::NEG_ONE, F::NEG_NAN, F::NEG_ONE),
             (F::INFINITY, F::ZERO, F::INFINITY),
+            (F::INFINITY, F::NEG_ZERO, F::INFINITY),
+            (F::INFINITY, F::ONE, F::INFINITY),
+            (F::INFINITY, F::NEG_ONE, F::INFINITY),
+            (F::INFINITY, F::INFINITY, F::INFINITY),
+            (F::INFINITY, F::NEG_INFINITY, F::INFINITY),
+            (F::INFINITY, F::NAN, F::INFINITY),
+            (F::INFINITY, F::NEG_NAN, F::INFINITY),
             (F::NEG_INFINITY, F::ZERO, F::ZERO),
+            (F::NEG_INFINITY, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_INFINITY, F::ONE, F::ONE),
+            (F::NEG_INFINITY, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_INFINITY, F::INFINITY, F::INFINITY),
+            (F::NEG_INFINITY, F::NEG_INFINITY, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NAN, F::NEG_INFINITY),
+            (F::NEG_INFINITY, F::NEG_NAN, F::NEG_INFINITY),
             (F::NAN, F::ZERO, F::ZERO),
-            (F::ZERO, F::NAN, F::ZERO),
+            (F::NAN, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NAN, F::ONE, F::ONE),
+            (F::NAN, F::NEG_ONE, F::NEG_ONE),
+            (F::NAN, F::INFINITY, F::INFINITY),
+            (F::NAN, F::NEG_INFINITY, F::NEG_INFINITY),
             (F::NAN, F::NAN, F::NAN),
-            (F::ZERO, F::NEG_ZERO, F::ZERO),
-            (F::NEG_ZERO, F::ZERO, F::ZERO),
+            (F::NEG_NAN, F::ZERO, F::ZERO),
+            (F::NEG_NAN, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_NAN, F::ONE, F::ONE),
+            (F::NEG_NAN, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_NAN, F::INFINITY, F::INFINITY),
+            (F::NEG_NAN, F::NEG_INFINITY, F::NEG_INFINITY),
         ];
 
-        for (x, y, res) in cases {
-            let val = f(x, y);
-            assert_biteq!(val, res, "fmaximum_num({}, {})", Hexf(x), Hexf(y));
+        for (x, y, expected) in cases {
+            let actual = f(x, y);
+            assert_biteq!(actual, expected, "fmaximum_num({}, {})", Hexf(x), Hexf(y));
         }
+
+        // Which NaN is returned does not matter when both operands are NaN
+        assert!(f(F::NAN, F::NEG_NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NAN).is_nan());
+        assert!(f(F::NEG_NAN, F::NEG_NAN).is_nan());
     }
 
     #[test]
diff --git a/libm/src/math/fmod.rs b/libm/src/math/fmod.rs
index c4752b925..6ae1be560 100644
--- a/libm/src/math/fmod.rs
+++ b/libm/src/math/fmod.rs
@@ -1,25 +1,25 @@
 /// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmodf16(x: f16, y: f16) -> f16 {
     super::generic::fmod(x, y)
 }
 
 /// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmodf(x: f32, y: f32) -> f32 {
     super::generic::fmod(x, y)
 }
 
 /// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmod(x: f64, y: f64) -> f64 {
     super::generic::fmod(x, y)
 }
 
 /// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn fmodf128(x: f128, y: f128) -> f128 {
     super::generic::fmod(x, y)
 }
diff --git a/libm/src/math/fmodf.rs b/libm/src/math/fmodf.rs
deleted file mode 100644
index 4e95696e2..000000000
--- a/libm/src/math/fmodf.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fmodf(x: f32, y: f32) -> f32 {
-    super::generic::fmod(x, y)
-}
diff --git a/libm/src/math/fmodf128.rs b/libm/src/math/fmodf128.rs
deleted file mode 100644
index ff0e0493e..000000000
--- a/libm/src/math/fmodf128.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fmodf128(x: f128, y: f128) -> f128 {
-    super::generic::fmod(x, y)
-}
diff --git a/libm/src/math/fmodf16.rs b/libm/src/math/fmodf16.rs
deleted file mode 100644
index 11972a7de..000000000
--- a/libm/src/math/fmodf16.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn fmodf16(x: f16, y: f16) -> f16 {
-    super::generic::fmod(x, y)
-}
diff --git a/libm/src/math/frexp.rs b/libm/src/math/frexp.rs
index de7a64fda..932111eeb 100644
--- a/libm/src/math/frexp.rs
+++ b/libm/src/math/frexp.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn frexp(x: f64) -> (f64, i32) {
     let mut y = x.to_bits();
     let ee = ((y >> 52) & 0x7ff) as i32;
diff --git a/libm/src/math/frexpf.rs b/libm/src/math/frexpf.rs
index 0ec91c2d3..904bf14f7 100644
--- a/libm/src/math/frexpf.rs
+++ b/libm/src/math/frexpf.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn frexpf(x: f32) -> (f32, i32) {
     let mut y = x.to_bits();
     let ee: i32 = ((y >> 23) & 0xff) as i32;
diff --git a/libm/src/math/generic/fmax.rs b/libm/src/math/generic/fmax.rs
index 54207e4b3..b05804704 100644
--- a/libm/src/math/generic/fmax.rs
+++ b/libm/src/math/generic/fmax.rs
@@ -19,6 +19,5 @@ use crate::support::Float;
 #[inline]
 pub fn fmax<F: Float>(x: F, y: F) -> F {
     let res = if x.is_nan() || x < y { y } else { x };
-    // Canonicalize
-    res * F::ONE
+    res.canonicalize()
 }
diff --git a/libm/src/math/generic/fmaximum.rs b/libm/src/math/generic/fmaximum.rs
index 4b6295bc0..55a031e18 100644
--- a/libm/src/math/generic/fmaximum.rs
+++ b/libm/src/math/generic/fmaximum.rs
@@ -4,8 +4,8 @@
 //! Per the spec, returns the canonicalized result of:
 //! - `x` if `x > y`
 //! - `y` if `y > x`
+//! - +0.0 if x and y are zero with opposite signs
 //! - qNaN if either operation is NaN
-//! - Logic following +0.0 > -0.0
 //!
 //! Excluded from our implementation is sNaN handling.
 
@@ -17,12 +17,11 @@ pub fn fmaximum<F: Float>(x: F, y: F) -> F {
         x
     } else if y.is_nan() {
         y
-    } else if x > y || (y.to_bits() == F::NEG_ZERO.to_bits() && x.is_sign_positive()) {
+    } else if x > y || (y.biteq(F::NEG_ZERO) && x.is_sign_positive()) {
         x
     } else {
         y
     };
 
-    // Canonicalize
-    res * F::ONE
+    res.canonicalize()
 }
diff --git a/libm/src/math/generic/fmaximum_num.rs b/libm/src/math/generic/fmaximum_num.rs
index 2e97ff6d3..2dc60b2d2 100644
--- a/libm/src/math/generic/fmaximum_num.rs
+++ b/libm/src/math/generic/fmaximum_num.rs
@@ -4,10 +4,10 @@
 //! Per the spec, returns:
 //! - `x` if `x > y`
 //! - `y` if `y > x`
-//! - Non-NaN if one operand is NaN
-//! - Logic following +0.0 > -0.0
+//! - +0.0 if x and y are zero with opposite signs
 //! - Either `x` or `y` if `x == y` and the signs are the same
-//! - qNaN if either operand is a NaN
+//! - Non-NaN if one operand is NaN
+//! - qNaN if both operands are NaNs
 //!
 //! Excluded from our implementation is sNaN handling.
 
@@ -15,13 +15,15 @@ use crate::support::Float;
 
 #[inline]
 pub fn fmaximum_num<F: Float>(x: F, y: F) -> F {
-    let res =
-        if x.is_nan() || x < y || (x.to_bits() == F::NEG_ZERO.to_bits() && y.is_sign_positive()) {
-            y
-        } else {
-            x
-        };
+    let res = if x > y || y.is_nan() {
+        x
+    } else if y > x || x.is_nan() {
+        y
+    } else if x.is_sign_positive() {
+        x
+    } else {
+        y
+    };
 
-    // Canonicalize
-    res * F::ONE
+    res.canonicalize()
 }
diff --git a/libm/src/math/generic/fmin.rs b/libm/src/math/generic/fmin.rs
index 0f86364d2..e2245bf9e 100644
--- a/libm/src/math/generic/fmin.rs
+++ b/libm/src/math/generic/fmin.rs
@@ -19,6 +19,5 @@ use crate::support::Float;
 #[inline]
 pub fn fmin<F: Float>(x: F, y: F) -> F {
     let res = if y.is_nan() || x < y { x } else { y };
-    // Canonicalize
-    res * F::ONE
+    res.canonicalize()
 }
diff --git a/libm/src/math/generic/fminimum.rs b/libm/src/math/generic/fminimum.rs
index 9dc0b64be..aa68b1291 100644
--- a/libm/src/math/generic/fminimum.rs
+++ b/libm/src/math/generic/fminimum.rs
@@ -4,8 +4,8 @@
 //! Per the spec, returns the canonicalized result of:
 //! - `x` if `x < y`
 //! - `y` if `y < x`
+//! - -0.0 if x and y are zero with opposite signs
 //! - qNaN if either operation is NaN
-//! - Logic following +0.0 > -0.0
 //!
 //! Excluded from our implementation is sNaN handling.
 
@@ -17,12 +17,11 @@ pub fn fminimum<F: Float>(x: F, y: F) -> F {
         x
     } else if y.is_nan() {
         y
-    } else if x < y || (x.to_bits() == F::NEG_ZERO.to_bits() && y.is_sign_positive()) {
+    } else if x < y || (x.biteq(F::NEG_ZERO) && y.is_sign_positive()) {
         x
     } else {
         y
     };
 
-    // Canonicalize
-    res * F::ONE
+    res.canonicalize()
 }
diff --git a/libm/src/math/generic/fminimum_num.rs b/libm/src/math/generic/fminimum_num.rs
index 40db8b189..265bd4605 100644
--- a/libm/src/math/generic/fminimum_num.rs
+++ b/libm/src/math/generic/fminimum_num.rs
@@ -4,10 +4,10 @@
 //! Per the spec, returns:
 //! - `x` if `x < y`
 //! - `y` if `y < x`
-//! - Non-NaN if one operand is NaN
-//! - Logic following +0.0 > -0.0
+//! - -0.0 if x and y are zero with opposite signs
 //! - Either `x` or `y` if `x == y` and the signs are the same
-//! - qNaN if either operand is a NaN
+//! - Non-NaN if one operand is NaN
+//! - qNaN if both operands are NaNs
 //!
 //! Excluded from our implementation is sNaN handling.
 
@@ -15,13 +15,15 @@ use crate::support::Float;
 
 #[inline]
 pub fn fminimum_num<F: Float>(x: F, y: F) -> F {
-    let res =
-        if y.is_nan() || x < y || (x.to_bits() == F::NEG_ZERO.to_bits() && y.is_sign_positive()) {
-            x
-        } else {
-            y
-        };
+    let res = if x > y || x.is_nan() {
+        y
+    } else if y > x || y.is_nan() {
+        x
+    } else if x.is_sign_positive() {
+        y
+    } else {
+        x
+    };
 
-    // Canonicalize
-    res * F::ONE
+    res.canonicalize()
 }
diff --git a/libm/src/math/hypot.rs b/libm/src/math/hypot.rs
index da458ea1d..b92ee18ca 100644
--- a/libm/src/math/hypot.rs
+++ b/libm/src/math/hypot.rs
@@ -17,7 +17,7 @@ fn sq(x: f64) -> (f64, f64) {
     (hi, lo)
 }
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn hypot(mut x: f64, mut y: f64) -> f64 {
     let x1p700 = f64::from_bits(0x6bb0000000000000); // 0x1p700 === 2 ^ 700
     let x1p_700 = f64::from_bits(0x1430000000000000); // 0x1p-700 === 2 ^ -700
diff --git a/libm/src/math/hypotf.rs b/libm/src/math/hypotf.rs
index 576eebb33..e7635ffc9 100644
--- a/libm/src/math/hypotf.rs
+++ b/libm/src/math/hypotf.rs
@@ -2,7 +2,7 @@ use core::f32;
 
 use super::sqrtf;
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn hypotf(mut x: f32, mut y: f32) -> f32 {
     let x1p90 = f32::from_bits(0x6c800000); // 0x1p90f === 2 ^ 90
     let x1p_90 = f32::from_bits(0x12800000); // 0x1p-90f === 2 ^ -90
diff --git a/libm/src/math/ilogb.rs b/libm/src/math/ilogb.rs
index 5b41f7b1d..ef774f6ad 100644
--- a/libm/src/math/ilogb.rs
+++ b/libm/src/math/ilogb.rs
@@ -1,7 +1,7 @@
 const FP_ILOGBNAN: i32 = -1 - 0x7fffffff;
 const FP_ILOGB0: i32 = FP_ILOGBNAN;
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ilogb(x: f64) -> i32 {
     let mut i: u64 = x.to_bits();
     let e = ((i >> 52) & 0x7ff) as i32;
diff --git a/libm/src/math/ilogbf.rs b/libm/src/math/ilogbf.rs
index 3585d6d36..5b0cb46ec 100644
--- a/libm/src/math/ilogbf.rs
+++ b/libm/src/math/ilogbf.rs
@@ -1,7 +1,7 @@
 const FP_ILOGBNAN: i32 = -1 - 0x7fffffff;
 const FP_ILOGB0: i32 = FP_ILOGBNAN;
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ilogbf(x: f32) -> i32 {
     let mut i = x.to_bits();
     let e = ((i >> 23) & 0xff) as i32;
diff --git a/libm/src/math/j0.rs b/libm/src/math/j0.rs
index 99d656f0d..7b0800477 100644
--- a/libm/src/math/j0.rs
+++ b/libm/src/math/j0.rs
@@ -110,7 +110,7 @@ const S03: f64 = 5.13546550207318111446e-07; /* 0x3EA13B54, 0xCE84D5A9 */
 const S04: f64 = 1.16614003333790000205e-09; /* 0x3E1408BC, 0xF4745D8F */
 
 /// Zeroth order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn j0(mut x: f64) -> f64 {
     let z: f64;
     let r: f64;
@@ -165,7 +165,7 @@ const V03: f64 = 2.59150851840457805467e-07; /* 0x3E91642D, 0x7FF202FD */
 const V04: f64 = 4.41110311332675467403e-10; /* 0x3DFE5018, 0x3BD6D9EF */
 
 /// Zeroth order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn y0(x: f64) -> f64 {
     let z: f64;
     let u: f64;
diff --git a/libm/src/math/j0f.rs b/libm/src/math/j0f.rs
index 25e5b325c..1c6a7c344 100644
--- a/libm/src/math/j0f.rs
+++ b/libm/src/math/j0f.rs
@@ -63,7 +63,7 @@ const S03: f32 = 5.1354652442e-07; /* 0x3509daa6 */
 const S04: f32 = 1.1661400734e-09; /* 0x30a045e8 */
 
 /// Zeroth order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn j0f(mut x: f32) -> f32 {
     let z: f32;
     let r: f32;
@@ -110,7 +110,7 @@ const V03: f32 = 2.5915085189e-07; /* 0x348b216c */
 const V04: f32 = 4.4111031494e-10; /* 0x2ff280c2 */
 
 /// Zeroth order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn y0f(x: f32) -> f32 {
     let z: f32;
     let u: f32;
diff --git a/libm/src/math/j1.rs b/libm/src/math/j1.rs
index 9b604d9e4..7d304ba10 100644
--- a/libm/src/math/j1.rs
+++ b/libm/src/math/j1.rs
@@ -114,7 +114,7 @@ const S04: f64 = 5.04636257076217042715e-09; /* 0x3E35AC88, 0xC97DFF2C */
 const S05: f64 = 1.23542274426137913908e-11; /* 0x3DAB2ACF, 0xCFB97ED8 */
 
 /// First order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn j1(x: f64) -> f64 {
     let mut z: f64;
     let r: f64;
@@ -161,7 +161,7 @@ const V0: [f64; 5] = [
 ];
 
 /// First order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn y1(x: f64) -> f64 {
     let z: f64;
     let u: f64;
diff --git a/libm/src/math/j1f.rs b/libm/src/math/j1f.rs
index a47472401..cd829c1aa 100644
--- a/libm/src/math/j1f.rs
+++ b/libm/src/math/j1f.rs
@@ -64,7 +64,7 @@ const S04: f32 = 5.0463624390e-09; /* 0x31ad6446 */
 const S05: f32 = 1.2354227016e-11; /* 0x2d59567e */
 
 /// First order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn j1f(x: f32) -> f32 {
     let mut z: f32;
     let r: f32;
@@ -110,7 +110,7 @@ const V0: [f32; 5] = [
 ];
 
 /// First order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn y1f(x: f32) -> f32 {
     let z: f32;
     let u: f32;
@@ -361,8 +361,6 @@ fn qonef(x: f32) -> f32 {
     return (0.375 + r / s) / x;
 }
 
-// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
-#[cfg(not(target_arch = "powerpc64"))]
 #[cfg(test)]
 mod tests {
     use super::{j1f, y1f};
@@ -371,6 +369,7 @@ mod tests {
         // 0x401F3E49
         assert_eq!(j1f(2.4881766_f32), 0.49999475_f32);
     }
+
     #[test]
     fn test_y1f_2002() {
         //allow slightly different result on x87
diff --git a/libm/src/math/jn.rs b/libm/src/math/jn.rs
index 31f8d9c53..b87aeaf1c 100644
--- a/libm/src/math/jn.rs
+++ b/libm/src/math/jn.rs
@@ -39,7 +39,7 @@ use super::{cos, fabs, get_high_word, get_low_word, j0, j1, log, sin, sqrt, y0,
 const INVSQRTPI: f64 = 5.64189583547756279280e-01; /* 0x3FE20DD7, 0x50429B6D */
 
 /// Integer order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn jn(n: i32, mut x: f64) -> f64 {
     let mut ix: u32;
     let lx: u32;
@@ -249,7 +249,7 @@ pub fn jn(n: i32, mut x: f64) -> f64 {
 }
 
 /// Integer order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn yn(n: i32, x: f64) -> f64 {
     let mut ix: u32;
     let lx: u32;
diff --git a/libm/src/math/jnf.rs b/libm/src/math/jnf.rs
index 52cf7d8a8..34fdc5112 100644
--- a/libm/src/math/jnf.rs
+++ b/libm/src/math/jnf.rs
@@ -16,7 +16,7 @@
 use super::{fabsf, j0f, j1f, logf, y0f, y1f};
 
 /// Integer order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn jnf(n: i32, mut x: f32) -> f32 {
     let mut ix: u32;
     let mut nm1: i32;
@@ -192,7 +192,7 @@ pub fn jnf(n: i32, mut x: f32) -> f32 {
 }
 
 /// Integer order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ynf(n: i32, x: f32) -> f32 {
     let mut ix: u32;
     let mut ib: u32;
diff --git a/libm/src/math/k_cos.rs b/libm/src/math/k_cos.rs
index 49b2fc64d..1a2ebabe3 100644
--- a/libm/src/math/k_cos.rs
+++ b/libm/src/math/k_cos.rs
@@ -51,7 +51,7 @@ const C6: f64 = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
 //         expression for cos().  Retention happens in all cases tested
 //         under FreeBSD, so don't pessimize things by forcibly clipping
 //         any extra precision in w.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn k_cos(x: f64, y: f64) -> f64 {
     let z = x * x;
     let w = z * z;
diff --git a/libm/src/math/k_cosf.rs b/libm/src/math/k_cosf.rs
index e99f2348c..68f568c24 100644
--- a/libm/src/math/k_cosf.rs
+++ b/libm/src/math/k_cosf.rs
@@ -20,7 +20,7 @@ const C1: f64 = 0.0416666233237390631894; /*  0x155553e1053a42.0p-57 */
 const C2: f64 = -0.00138867637746099294692; /* -0x16c087e80f1e27.0p-62 */
 const C3: f64 = 0.0000243904487962774090654; /*  0x199342e0ee5069.0p-68 */
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn k_cosf(x: f64) -> f32 {
     let z = x * x;
     let w = z * z;
diff --git a/libm/src/math/k_expo2.rs b/libm/src/math/k_expo2.rs
index 7345075f3..7b63952d2 100644
--- a/libm/src/math/k_expo2.rs
+++ b/libm/src/math/k_expo2.rs
@@ -4,7 +4,7 @@ use super::exp;
 const K: i32 = 2043;
 
 /* expf(x)/2 for x >= log(FLT_MAX), slightly better than 0.5f*expf(x/2)*expf(x/2) */
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn k_expo2(x: f64) -> f64 {
     let k_ln2 = f64::from_bits(0x40962066151add8b);
     /* note that k is odd and scale*scale overflows */
diff --git a/libm/src/math/k_expo2f.rs b/libm/src/math/k_expo2f.rs
index fbd7b27d5..02213cec4 100644
--- a/libm/src/math/k_expo2f.rs
+++ b/libm/src/math/k_expo2f.rs
@@ -4,7 +4,7 @@ use super::expf;
 const K: i32 = 235;
 
 /* expf(x)/2 for x >= log(FLT_MAX), slightly better than 0.5f*expf(x/2)*expf(x/2) */
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn k_expo2f(x: f32) -> f32 {
     let k_ln2 = f32::from_bits(0x4322e3bc);
     /* note that k is odd and scale*scale overflows */
diff --git a/libm/src/math/k_sin.rs b/libm/src/math/k_sin.rs
index 9dd96c944..2f8542945 100644
--- a/libm/src/math/k_sin.rs
+++ b/libm/src/math/k_sin.rs
@@ -43,7 +43,7 @@ const S6: f64 = 1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
 //              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
 //         then                   3    2
 //              sin(x) = x + (S1*x + (x *(r-y/2)+y))
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn k_sin(x: f64, y: f64, iy: i32) -> f64 {
     let z = x * x;
     let w = z * z;
diff --git a/libm/src/math/k_sinf.rs b/libm/src/math/k_sinf.rs
index 88d10caba..297d88bbb 100644
--- a/libm/src/math/k_sinf.rs
+++ b/libm/src/math/k_sinf.rs
@@ -20,7 +20,7 @@ const S2: f64 = 0.0083333293858894631756; /*  0x111110896efbb2.0p-59 */
 const S3: f64 = -0.000198393348360966317347; /* -0x1a00f9e2cae774.0p-65 */
 const S4: f64 = 0.0000027183114939898219064; /*  0x16cd878c3b46a7.0p-71 */
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn k_sinf(x: f64) -> f32 {
     let z = x * x;
     let w = z * z;
diff --git a/libm/src/math/k_tan.rs b/libm/src/math/k_tan.rs
index d177010bb..ac48d661f 100644
--- a/libm/src/math/k_tan.rs
+++ b/libm/src/math/k_tan.rs
@@ -58,7 +58,7 @@ static T: [f64; 13] = [
 const PIO4: f64 = 7.85398163397448278999e-01; /* 3FE921FB, 54442D18 */
 const PIO4_LO: f64 = 3.06161699786838301793e-17; /* 3C81A626, 33145C07 */
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn k_tan(mut x: f64, mut y: f64, odd: i32) -> f64 {
     let hx = (f64::to_bits(x) >> 32) as u32;
     let big = (hx & 0x7fffffff) >= 0x3FE59428; /* |x| >= 0.6744 */
diff --git a/libm/src/math/k_tanf.rs b/libm/src/math/k_tanf.rs
index af8db539d..79382f57b 100644
--- a/libm/src/math/k_tanf.rs
+++ b/libm/src/math/k_tanf.rs
@@ -19,7 +19,7 @@ const T: [f64; 6] = [
     0.00946564784943673166728, /* 0x1362b9bf971bcd.0p-59 */
 ];
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn k_tanf(x: f64, odd: bool) -> f32 {
     let z = x * x;
     /*
diff --git a/libm/src/math/ldexp.rs b/libm/src/math/ldexp.rs
index 24899ba30..b32b8d524 100644
--- a/libm/src/math/ldexp.rs
+++ b/libm/src/math/ldexp.rs
@@ -1,21 +1,21 @@
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ldexpf16(x: f16, n: i32) -> f16 {
     super::scalbnf16(x, n)
 }
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ldexpf(x: f32, n: i32) -> f32 {
     super::scalbnf(x, n)
 }
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ldexp(x: f64, n: i32) -> f64 {
     super::scalbn(x, n)
 }
 
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn ldexpf128(x: f128, n: i32) -> f128 {
     super::scalbnf128(x, n)
 }
diff --git a/libm/src/math/ldexpf.rs b/libm/src/math/ldexpf.rs
deleted file mode 100644
index 95b27fc49..000000000
--- a/libm/src/math/ldexpf.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn ldexpf(x: f32, n: i32) -> f32 {
-    super::scalbnf(x, n)
-}
diff --git a/libm/src/math/ldexpf128.rs b/libm/src/math/ldexpf128.rs
deleted file mode 100644
index b35277d15..000000000
--- a/libm/src/math/ldexpf128.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn ldexpf128(x: f128, n: i32) -> f128 {
-    super::scalbnf128(x, n)
-}
diff --git a/libm/src/math/ldexpf16.rs b/libm/src/math/ldexpf16.rs
deleted file mode 100644
index 8de6cffd6..000000000
--- a/libm/src/math/ldexpf16.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn ldexpf16(x: f16, n: i32) -> f16 {
-    super::scalbnf16(x, n)
-}
diff --git a/libm/src/math/lgamma.rs b/libm/src/math/lgamma.rs
index 8312dc186..da7ce5c98 100644
--- a/libm/src/math/lgamma.rs
+++ b/libm/src/math/lgamma.rs
@@ -2,7 +2,7 @@ use super::lgamma_r;
 
 /// The natural logarithm of the
 /// [Gamma function](https://en.wikipedia.org/wiki/Gamma_function) (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn lgamma(x: f64) -> f64 {
     lgamma_r(x).0
 }
diff --git a/libm/src/math/lgamma_r.rs b/libm/src/math/lgamma_r.rs
index 6becaad2c..38eb270f6 100644
--- a/libm/src/math/lgamma_r.rs
+++ b/libm/src/math/lgamma_r.rs
@@ -165,7 +165,7 @@ fn sin_pi(mut x: f64) -> f64 {
     }
 }
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn lgamma_r(mut x: f64) -> (f64, i32) {
     let u: u64 = x.to_bits();
     let mut t: f64;
diff --git a/libm/src/math/lgammaf.rs b/libm/src/math/lgammaf.rs
index d37512397..920acfed2 100644
--- a/libm/src/math/lgammaf.rs
+++ b/libm/src/math/lgammaf.rs
@@ -2,7 +2,7 @@ use super::lgammaf_r;
 
 /// The natural logarithm of the
 /// [Gamma function](https://en.wikipedia.org/wiki/Gamma_function) (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn lgammaf(x: f32) -> f32 {
     lgammaf_r(x).0
 }
diff --git a/libm/src/math/lgammaf_r.rs b/libm/src/math/lgammaf_r.rs
index 10cecee54..a0b6a678a 100644
--- a/libm/src/math/lgammaf_r.rs
+++ b/libm/src/math/lgammaf_r.rs
@@ -100,7 +100,7 @@ fn sin_pi(mut x: f32) -> f32 {
     }
 }
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn lgammaf_r(mut x: f32) -> (f32, i32) {
     let u = x.to_bits();
     let mut t: f32;
diff --git a/libm/src/math/log.rs b/libm/src/math/log.rs
index f2dc47ec5..9499c56d8 100644
--- a/libm/src/math/log.rs
+++ b/libm/src/math/log.rs
@@ -71,7 +71,7 @@ const LG6: f64 = 1.531383769920937332e-01; /* 3FC39A09 D078C69F */
 const LG7: f64 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
 
 /// The natural logarithm of `x` (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn log(mut x: f64) -> f64 {
     let x1p54 = f64::from_bits(0x4350000000000000); // 0x1p54 === 2 ^ 54
 
diff --git a/libm/src/math/log10.rs b/libm/src/math/log10.rs
index 8c9d68c49..29f25d944 100644
--- a/libm/src/math/log10.rs
+++ b/libm/src/math/log10.rs
@@ -32,7 +32,7 @@ const LG6: f64 = 1.531383769920937332e-01; /* 3FC39A09 D078C69F */
 const LG7: f64 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
 
 /// The base 10 logarithm of `x` (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn log10(mut x: f64) -> f64 {
     let x1p54 = f64::from_bits(0x4350000000000000); // 0x1p54 === 2 ^ 54
 
diff --git a/libm/src/math/log10f.rs b/libm/src/math/log10f.rs
index 18bf8fcc8..f89584bf9 100644
--- a/libm/src/math/log10f.rs
+++ b/libm/src/math/log10f.rs
@@ -26,7 +26,7 @@ const LG3: f32 = 0.28498786688; /* 0x91e9ee.0p-25 */
 const LG4: f32 = 0.24279078841; /* 0xf89e26.0p-26 */
 
 /// The base 10 logarithm of `x` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn log10f(mut x: f32) -> f32 {
     let x1p25f = f32::from_bits(0x4c000000); // 0x1p25f === 2 ^ 25
 
diff --git a/libm/src/math/log1p.rs b/libm/src/math/log1p.rs
index 65142c0d6..c991cce60 100644
--- a/libm/src/math/log1p.rs
+++ b/libm/src/math/log1p.rs
@@ -66,7 +66,7 @@ const LG6: f64 = 1.531383769920937332e-01; /* 3FC39A09 D078C69F */
 const LG7: f64 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
 
 /// The natural logarithm of 1+`x` (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn log1p(x: f64) -> f64 {
     let mut ui: u64 = x.to_bits();
     let hfsq: f64;
diff --git a/libm/src/math/log1pf.rs b/libm/src/math/log1pf.rs
index 23978e61c..89a92fac9 100644
--- a/libm/src/math/log1pf.rs
+++ b/libm/src/math/log1pf.rs
@@ -21,7 +21,7 @@ const LG3: f32 = 0.28498786688; /* 0x91e9ee.0p-25 */
 const LG4: f32 = 0.24279078841; /* 0xf89e26.0p-26 */
 
 /// The natural logarithm of 1+`x` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn log1pf(x: f32) -> f32 {
     let mut ui: u32 = x.to_bits();
     let hfsq: f32;
diff --git a/libm/src/math/log2.rs b/libm/src/math/log2.rs
index 701f63c25..9b750c9a2 100644
--- a/libm/src/math/log2.rs
+++ b/libm/src/math/log2.rs
@@ -30,7 +30,7 @@ const LG6: f64 = 1.531383769920937332e-01; /* 3FC39A09 D078C69F */
 const LG7: f64 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
 
 /// The base 2 logarithm of `x` (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn log2(mut x: f64) -> f64 {
     let x1p54 = f64::from_bits(0x4350000000000000); // 0x1p54 === 2 ^ 54
 
diff --git a/libm/src/math/log2f.rs b/libm/src/math/log2f.rs
index 5ba2427d1..0e5177d7a 100644
--- a/libm/src/math/log2f.rs
+++ b/libm/src/math/log2f.rs
@@ -24,7 +24,7 @@ const LG3: f32 = 0.28498786688; /* 0x91e9ee.0p-25 */
 const LG4: f32 = 0.24279078841; /* 0xf89e26.0p-26 */
 
 /// The base 2 logarithm of `x` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn log2f(mut x: f32) -> f32 {
     let x1p25f = f32::from_bits(0x4c000000); // 0x1p25f === 2 ^ 25
 
diff --git a/libm/src/math/logf.rs b/libm/src/math/logf.rs
index 68d194302..cd7a7b0ba 100644
--- a/libm/src/math/logf.rs
+++ b/libm/src/math/logf.rs
@@ -22,7 +22,7 @@ const LG3: f32 = 0.28498786688; /*  0x91e9ee.0p-25 */
 const LG4: f32 = 0.24279078841; /*  0xf89e26.0p-26 */
 
 /// The natural logarithm of `x` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn logf(mut x: f32) -> f32 {
     let x1p25 = f32::from_bits(0x4c000000); // 0x1p25f === 2 ^ 25
 
diff --git a/libm/src/math/mod.rs b/libm/src/math/mod.rs
index ce9b8fc58..8eecfe566 100644
--- a/libm/src/math/mod.rs
+++ b/libm/src/math/mod.rs
@@ -1,3 +1,5 @@
+#![allow(clippy::approx_constant)] // many false positives
+
 macro_rules! force_eval {
     ($e:expr) => {
         unsafe { ::core::ptr::read_volatile(&$e) }
diff --git a/libm/src/math/modf.rs b/libm/src/math/modf.rs
index 6541862cd..a92a83dc5 100644
--- a/libm/src/math/modf.rs
+++ b/libm/src/math/modf.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn modf(x: f64) -> (f64, f64) {
     let rv2: f64;
     let mut u = x.to_bits();
diff --git a/libm/src/math/modff.rs b/libm/src/math/modff.rs
index 90c6bca7d..691f351ca 100644
--- a/libm/src/math/modff.rs
+++ b/libm/src/math/modff.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn modff(x: f32) -> (f32, f32) {
     let rv2: f32;
     let mut u: u32 = x.to_bits();
diff --git a/libm/src/math/nextafter.rs b/libm/src/math/nextafter.rs
index c991ff6f2..f4408468c 100644
--- a/libm/src/math/nextafter.rs
+++ b/libm/src/math/nextafter.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn nextafter(x: f64, y: f64) -> f64 {
     if x.is_nan() || y.is_nan() {
         return x + y;
diff --git a/libm/src/math/nextafterf.rs b/libm/src/math/nextafterf.rs
index 8ba383356..c15eb9de2 100644
--- a/libm/src/math/nextafterf.rs
+++ b/libm/src/math/nextafterf.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn nextafterf(x: f32, y: f32) -> f32 {
     if x.is_nan() || y.is_nan() {
         return x + y;
diff --git a/libm/src/math/pow.rs b/libm/src/math/pow.rs
index 94ae31cf0..914d68cfc 100644
--- a/libm/src/math/pow.rs
+++ b/libm/src/math/pow.rs
@@ -90,7 +90,7 @@ const IVLN2_H: f64 = 1.44269502162933349609e+00; /* 0x3ff71547_60000000 =24b 1/l
 const IVLN2_L: f64 = 1.92596299112661746887e-08; /* 0x3e54ae0b_f85ddf44 =1/ln2 tail*/
 
 /// Returns `x` to the power of `y` (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn pow(x: f64, y: f64) -> f64 {
     let t1: f64;
     let t2: f64;
diff --git a/libm/src/math/powf.rs b/libm/src/math/powf.rs
index 11c7a7cbd..17772ae87 100644
--- a/libm/src/math/powf.rs
+++ b/libm/src/math/powf.rs
@@ -46,7 +46,7 @@ const IVLN2_H: f32 = 1.4426879883e+00;
 const IVLN2_L: f32 = 7.0526075433e-06;
 
 /// Returns `x` to the power of `y` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn powf(x: f32, y: f32) -> f32 {
     let mut z: f32;
     let mut ax: f32;
diff --git a/libm/src/math/rem_pio2.rs b/libm/src/math/rem_pio2.rs
index d677fd9dc..61b103027 100644
--- a/libm/src/math/rem_pio2.rs
+++ b/libm/src/math/rem_pio2.rs
@@ -41,7 +41,7 @@ const PIO2_3T: f64 = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
 // use rem_pio2_large() for large x
 //
 // caller must handle the case when reduction is not needed: |x| ~<= pi/4 */
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn rem_pio2(x: f64) -> (i32, f64, f64) {
     let x1p24 = f64::from_bits(0x4170000000000000);
 
@@ -195,7 +195,7 @@ mod tests {
 
     #[test]
     // FIXME(correctness): inaccurate results on i586
-    #[cfg_attr(all(target_arch = "x86", not(target_feature = "sse")), ignore)]
+    #[cfg_attr(x86_no_sse, ignore)]
     fn test_near_pi() {
         let arg = 3.141592025756836;
         let arg = force_eval!(arg);
diff --git a/libm/src/math/rem_pio2_large.rs b/libm/src/math/rem_pio2_large.rs
index 6d679bbe9..f1fdf3673 100644
--- a/libm/src/math/rem_pio2_large.rs
+++ b/libm/src/math/rem_pio2_large.rs
@@ -11,7 +11,7 @@
  * ====================================================
  */
 
-use super::{floor, scalbn};
+use super::scalbn;
 
 // initial value for jk
 const INIT_JK: [usize; 4] = [3, 4, 4, 6];
@@ -221,8 +221,16 @@ const PIO2: [f64; 8] = [
 /// skip the part of the product that are known to be a huge integer (
 /// more accurately, = 0 mod 8 ). Thus the number of operations are
 /// independent of the exponent of the input.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn rem_pio2_large(x: &[f64], y: &mut [f64], e0: i32, prec: usize) -> i32 {
+    // FIXME(rust-lang/rust#144518): Inline assembly would cause `no_panic` to fail
+    // on the callers of this function. As a workaround, avoid inlining `floor` here
+    // when implemented with assembly.
+    #[cfg_attr(x86_no_sse, inline(never))]
+    extern "C" fn floor(x: f64) -> f64 {
+        super::floor(x)
+    }
+
     let x1p24 = f64::from_bits(0x4170000000000000); // 0x1p24 === 2 ^ 24
     let x1p_24 = f64::from_bits(0x3e70000000000000); // 0x1p_24 === 2 ^ (-24)
 
diff --git a/libm/src/math/rem_pio2f.rs b/libm/src/math/rem_pio2f.rs
index 3c658fe3d..0472a1035 100644
--- a/libm/src/math/rem_pio2f.rs
+++ b/libm/src/math/rem_pio2f.rs
@@ -31,7 +31,7 @@ const PIO2_1T: f64 = 1.58932547735281966916e-08; /* 0x3E5110b4, 0x611A6263 */
 ///
 /// use double precision for everything except passing x
 /// use __rem_pio2_large() for large x
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub(crate) fn rem_pio2f(x: f32) -> (i32, f64) {
     let x64 = x as f64;
 
diff --git a/libm/src/math/remainder.rs b/libm/src/math/remainder.rs
index 9e966c9ed..54152df32 100644
--- a/libm/src/math/remainder.rs
+++ b/libm/src/math/remainder.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn remainder(x: f64, y: f64) -> f64 {
     let (result, _) = super::remquo(x, y);
     result
diff --git a/libm/src/math/remainderf.rs b/libm/src/math/remainderf.rs
index b1407cf2a..21f629214 100644
--- a/libm/src/math/remainderf.rs
+++ b/libm/src/math/remainderf.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn remainderf(x: f32, y: f32) -> f32 {
     let (result, _) = super::remquof(x, y);
     result
diff --git a/libm/src/math/remquo.rs b/libm/src/math/remquo.rs
index 4c11e8487..f13b09237 100644
--- a/libm/src/math/remquo.rs
+++ b/libm/src/math/remquo.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn remquo(mut x: f64, mut y: f64) -> (f64, i32) {
     let ux: u64 = x.to_bits();
     let mut uy: u64 = y.to_bits();
diff --git a/libm/src/math/remquof.rs b/libm/src/math/remquof.rs
index b0e85ca66..cc7863a09 100644
--- a/libm/src/math/remquof.rs
+++ b/libm/src/math/remquof.rs
@@ -1,4 +1,4 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn remquof(mut x: f32, mut y: f32) -> (f32, i32) {
     let ux: u32 = x.to_bits();
     let mut uy: u32 = y.to_bits();
diff --git a/libm/src/math/rint.rs b/libm/src/math/rint.rs
index e1c32c943..011a7ae3d 100644
--- a/libm/src/math/rint.rs
+++ b/libm/src/math/rint.rs
@@ -2,7 +2,7 @@ use super::support::Round;
 
 /// Round `x` to the nearest integer, breaking ties toward even.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn rintf16(x: f16) -> f16 {
     select_implementation! {
         name: rintf16,
@@ -14,7 +14,7 @@ pub fn rintf16(x: f16) -> f16 {
 }
 
 /// Round `x` to the nearest integer, breaking ties toward even.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn rintf(x: f32) -> f32 {
     select_implementation! {
         name: rintf,
@@ -29,7 +29,7 @@ pub fn rintf(x: f32) -> f32 {
 }
 
 /// Round `x` to the nearest integer, breaking ties toward even.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn rint(x: f64) -> f64 {
     select_implementation! {
         name: rint,
@@ -45,7 +45,7 @@ pub fn rint(x: f64) -> f64 {
 
 /// Round `x` to the nearest integer, breaking ties toward even.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn rintf128(x: f128) -> f128 {
     super::generic::rint_round(x, Round::Nearest).val
 }
diff --git a/libm/src/math/round.rs b/libm/src/math/round.rs
index 6cd091cd7..256197e6c 100644
--- a/libm/src/math/round.rs
+++ b/libm/src/math/round.rs
@@ -1,25 +1,25 @@
 /// Round `x` to the nearest integer, breaking ties away from zero.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn roundf16(x: f16) -> f16 {
     super::generic::round(x)
 }
 
 /// Round `x` to the nearest integer, breaking ties away from zero.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn roundf(x: f32) -> f32 {
     super::generic::round(x)
 }
 
 /// Round `x` to the nearest integer, breaking ties away from zero.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn round(x: f64) -> f64 {
     super::generic::round(x)
 }
 
 /// Round `x` to the nearest integer, breaking ties away from zero.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn roundf128(x: f128) -> f128 {
     super::generic::round(x)
 }
diff --git a/libm/src/math/roundeven.rs b/libm/src/math/roundeven.rs
index 6e621d762..f0d67d410 100644
--- a/libm/src/math/roundeven.rs
+++ b/libm/src/math/roundeven.rs
@@ -3,21 +3,21 @@ use super::support::{Float, Round};
 /// Round `x` to the nearest integer, breaking ties toward even. This is IEEE 754
 /// `roundToIntegralTiesToEven`.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn roundevenf16(x: f16) -> f16 {
     roundeven_impl(x)
 }
 
 /// Round `x` to the nearest integer, breaking ties toward even. This is IEEE 754
 /// `roundToIntegralTiesToEven`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn roundevenf(x: f32) -> f32 {
     roundeven_impl(x)
 }
 
 /// Round `x` to the nearest integer, breaking ties toward even. This is IEEE 754
 /// `roundToIntegralTiesToEven`.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn roundeven(x: f64) -> f64 {
     roundeven_impl(x)
 }
@@ -25,7 +25,7 @@ pub fn roundeven(x: f64) -> f64 {
 /// Round `x` to the nearest integer, breaking ties toward even. This is IEEE 754
 /// `roundToIntegralTiesToEven`.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn roundevenf128(x: f128) -> f128 {
     roundeven_impl(x)
 }
diff --git a/libm/src/math/roundf.rs b/libm/src/math/roundf.rs
deleted file mode 100644
index b5d7c9d69..000000000
--- a/libm/src/math/roundf.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-/// Round `x` to the nearest integer, breaking ties away from zero.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn roundf(x: f32) -> f32 {
-    super::generic::round(x)
-}
diff --git a/libm/src/math/roundf128.rs b/libm/src/math/roundf128.rs
deleted file mode 100644
index fc3164929..000000000
--- a/libm/src/math/roundf128.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-/// Round `x` to the nearest integer, breaking ties away from zero.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn roundf128(x: f128) -> f128 {
-    super::generic::round(x)
-}
diff --git a/libm/src/math/roundf16.rs b/libm/src/math/roundf16.rs
deleted file mode 100644
index 8b356eaab..000000000
--- a/libm/src/math/roundf16.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-/// Round `x` to the nearest integer, breaking ties away from zero.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn roundf16(x: f16) -> f16 {
-    super::generic::round(x)
-}
diff --git a/libm/src/math/scalbn.rs b/libm/src/math/scalbn.rs
index ed73c3f94..f1a67cb7f 100644
--- a/libm/src/math/scalbn.rs
+++ b/libm/src/math/scalbn.rs
@@ -1,21 +1,21 @@
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn scalbnf16(x: f16, n: i32) -> f16 {
     super::generic::scalbn(x, n)
 }
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn scalbnf(x: f32, n: i32) -> f32 {
     super::generic::scalbn(x, n)
 }
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn scalbn(x: f64, n: i32) -> f64 {
     super::generic::scalbn(x, n)
 }
 
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn scalbnf128(x: f128, n: i32) -> f128 {
     super::generic::scalbn(x, n)
 }
diff --git a/libm/src/math/scalbnf.rs b/libm/src/math/scalbnf.rs
deleted file mode 100644
index 57e7ba76f..000000000
--- a/libm/src/math/scalbnf.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn scalbnf(x: f32, n: i32) -> f32 {
-    super::generic::scalbn(x, n)
-}
diff --git a/libm/src/math/scalbnf128.rs b/libm/src/math/scalbnf128.rs
deleted file mode 100644
index c1d2b4855..000000000
--- a/libm/src/math/scalbnf128.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn scalbnf128(x: f128, n: i32) -> f128 {
-    super::generic::scalbn(x, n)
-}
diff --git a/libm/src/math/scalbnf16.rs b/libm/src/math/scalbnf16.rs
deleted file mode 100644
index 2209e1a17..000000000
--- a/libm/src/math/scalbnf16.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn scalbnf16(x: f16, n: i32) -> f16 {
-    super::generic::scalbn(x, n)
-}
diff --git a/libm/src/math/sin.rs b/libm/src/math/sin.rs
index 229fa4bef..5378a7bc3 100644
--- a/libm/src/math/sin.rs
+++ b/libm/src/math/sin.rs
@@ -44,7 +44,7 @@ use super::{k_cos, k_sin, rem_pio2};
 /// The sine of `x` (f64).
 ///
 /// `x` is specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sin(x: f64) -> f64 {
     let x1p120 = f64::from_bits(0x4770000000000000); // 0x1p120f === 2 ^ 120
 
diff --git a/libm/src/math/sincos.rs b/libm/src/math/sincos.rs
index ebf482f2d..a364f7375 100644
--- a/libm/src/math/sincos.rs
+++ b/libm/src/math/sincos.rs
@@ -15,7 +15,7 @@ use super::{get_high_word, k_cos, k_sin, rem_pio2};
 /// Both the sine and cosine of `x` (f64).
 ///
 /// `x` is specified in radians and the return value is (sin(x), cos(x)).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sincos(x: f64) -> (f64, f64) {
     let s: f64;
     let c: f64;
diff --git a/libm/src/math/sincosf.rs b/libm/src/math/sincosf.rs
index f33607676..c4beb5267 100644
--- a/libm/src/math/sincosf.rs
+++ b/libm/src/math/sincosf.rs
@@ -26,7 +26,7 @@ const S4PIO2: f64 = 4.0 * PI_2; /* 0x401921FB, 0x54442D18 */
 /// Both the sine and cosine of `x` (f32).
 ///
 /// `x` is specified in radians and the return value is (sin(x), cos(x)).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sincosf(x: f32) -> (f32, f32) {
     let s: f32;
     let c: f32;
diff --git a/libm/src/math/sinf.rs b/libm/src/math/sinf.rs
index 709b63fcf..b4edf6769 100644
--- a/libm/src/math/sinf.rs
+++ b/libm/src/math/sinf.rs
@@ -27,7 +27,7 @@ const S4_PIO2: f64 = 4. * FRAC_PI_2; /* 0x401921FB, 0x54442D18 */
 /// The sine of `x` (f32).
 ///
 /// `x` is specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sinf(x: f32) -> f32 {
     let x64 = x as f64;
 
diff --git a/libm/src/math/sinh.rs b/libm/src/math/sinh.rs
index 791841982..900dd6ca4 100644
--- a/libm/src/math/sinh.rs
+++ b/libm/src/math/sinh.rs
@@ -6,7 +6,7 @@ use super::{expm1, expo2};
 //
 
 /// The hyperbolic sine of `x` (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sinh(x: f64) -> f64 {
     // union {double f; uint64_t i;} u = {.f = x};
     // uint32_t w;
diff --git a/libm/src/math/sinhf.rs b/libm/src/math/sinhf.rs
index 44d2e3560..501acea30 100644
--- a/libm/src/math/sinhf.rs
+++ b/libm/src/math/sinhf.rs
@@ -1,7 +1,7 @@
 use super::{expm1f, k_expo2f};
 
 /// The hyperbolic sine of `x` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sinhf(x: f32) -> f32 {
     let mut h = 0.5f32;
     let mut ix = x.to_bits();
diff --git a/libm/src/math/sqrt.rs b/libm/src/math/sqrt.rs
index 76bc240cf..7ba1bc9b3 100644
--- a/libm/src/math/sqrt.rs
+++ b/libm/src/math/sqrt.rs
@@ -1,6 +1,6 @@
 /// The square root of `x` (f16).
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sqrtf16(x: f16) -> f16 {
     select_implementation! {
         name: sqrtf16,
@@ -12,7 +12,7 @@ pub fn sqrtf16(x: f16) -> f16 {
 }
 
 /// The square root of `x` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sqrtf(x: f32) -> f32 {
     select_implementation! {
         name: sqrtf,
@@ -28,7 +28,7 @@ pub fn sqrtf(x: f32) -> f32 {
 }
 
 /// The square root of `x` (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sqrt(x: f64) -> f64 {
     select_implementation! {
         name: sqrt,
@@ -45,7 +45,7 @@ pub fn sqrt(x: f64) -> f64 {
 
 /// The square root of `x` (f128).
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn sqrtf128(x: f128) -> f128 {
     return super::generic::sqrt(x);
 }
diff --git a/libm/src/math/sqrtf.rs b/libm/src/math/sqrtf.rs
deleted file mode 100644
index c28a705e3..000000000
--- a/libm/src/math/sqrtf.rs
+++ /dev/null
@@ -1,15 +0,0 @@
-/// The square root of `x` (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn sqrtf(x: f32) -> f32 {
-    select_implementation! {
-        name: sqrtf,
-        use_arch: any(
-            all(target_arch = "aarch64", target_feature = "neon"),
-            all(target_arch = "wasm32", intrinsics_enabled),
-            target_feature = "sse2"
-        ),
-        args: x,
-    }
-
-    super::generic::sqrt(x)
-}
diff --git a/libm/src/math/sqrtf128.rs b/libm/src/math/sqrtf128.rs
deleted file mode 100644
index eaef6ae0c..000000000
--- a/libm/src/math/sqrtf128.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-/// The square root of `x` (f128).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn sqrtf128(x: f128) -> f128 {
-    return super::generic::sqrt(x);
-}
diff --git a/libm/src/math/sqrtf16.rs b/libm/src/math/sqrtf16.rs
deleted file mode 100644
index 7bedb7f8b..000000000
--- a/libm/src/math/sqrtf16.rs
+++ /dev/null
@@ -1,11 +0,0 @@
-/// The square root of `x` (f16).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn sqrtf16(x: f16) -> f16 {
-    select_implementation! {
-        name: sqrtf16,
-        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
-        args: x,
-    }
-
-    return super::generic::sqrt(x);
-}
diff --git a/libm/src/math/support/big.rs b/libm/src/math/support/big.rs
index f24c063cd..b7f128542 100644
--- a/libm/src/math/support/big.rs
+++ b/libm/src/math/support/big.rs
@@ -11,10 +11,11 @@ const U128_LO_MASK: u128 = u64::MAX as u128;
 
 /// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
 #[allow(non_camel_case_types)]
-#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
 pub struct u256 {
-    pub lo: u128,
+    // `hi` is declared first so that the derived ordering compares it before `lo`.
     pub hi: u128,
+    pub lo: u128,
 }
 
 impl u256 {
@@ -28,17 +29,18 @@ impl u256 {
     pub fn signed(self) -> i256 {
         i256 {
             lo: self.lo,
-            hi: self.hi,
+            hi: self.hi as i128,
         }
     }
 }
 
 /// A 256-bit signed integer represented as two 128-bit native-endian limbs.
 #[allow(non_camel_case_types)]
-#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
 pub struct i256 {
+    // `hi` is signed and declared first so that the derived ordering is numeric.
+    pub hi: i128,
     pub lo: u128,
-    pub hi: u128,
 }
 
 impl i256 {
@@ -47,7 +49,7 @@ impl i256 {
     pub fn unsigned(self) -> u256 {
         u256 {
             lo: self.lo,
-            hi: self.hi,
+            hi: self.hi as u128,
         }
     }
 }
@@ -73,17 +75,17 @@ impl MinInt for i256 {
 
     type Unsigned = u256;
 
-    const SIGNED: bool = false;
+    const SIGNED: bool = true;
     const BITS: u32 = 256;
     const ZERO: Self = Self { lo: 0, hi: 0 };
     const ONE: Self = Self { lo: 1, hi: 0 };
     const MIN: Self = Self {
-        lo: 0,
-        hi: 1 << 127,
+        lo: u128::MIN,
+        hi: i128::MIN,
     };
     const MAX: Self = Self {
         lo: u128::MAX,
-        hi: u128::MAX << 1,
+        hi: i128::MAX,
     };
 }
 
@@ -109,60 +111,117 @@ macro_rules! impl_common {
             }
         }
 
-        impl ops::Shl<u32> for $ty {
+        impl ops::Add<Self> for $ty {
             type Output = Self;
 
-            fn shl(self, _rhs: u32) -> Self::Output {
-                unimplemented!("only used to meet trait bounds")
+            // Limb-wise addition: the carry out of the low limbs feeds the high
+            // limbs. Overflow of the full width trips a `debug_assert!`.
+            fn add(self, rhs: Self) -> Self::Output {
+                let (lo, carry) = self.lo.overflowing_add(rhs.lo);
+                let (hi, of) = Int::carrying_add(self.hi, rhs.hi, carry);
+                debug_assert!(!of, "attempt to add with overflow");
+                Self { lo, hi }
             }
         }
-    };
-}
 
-impl_common!(i256);
-impl_common!(u256);
+        impl ops::Sub<Self> for $ty {
+            type Output = Self;
 
-impl ops::Add<Self> for u256 {
-    type Output = Self;
+            // Limb-wise subtraction: the borrow out of the low limbs is taken from
+            // the high limbs. Overflow of the full width trips a `debug_assert!`.
+            fn sub(self, rhs: Self) -> Self::Output {
+                let (lo, borrow) = self.lo.overflowing_sub(rhs.lo);
+                let (hi, of) = Int::borrowing_sub(self.hi, rhs.hi, borrow);
+                debug_assert!(!of, "attempt to subtract with overflow");
+                Self { lo, hi }
+            }
+        }
 
-    fn add(self, rhs: Self) -> Self::Output {
-        let (lo, carry) = self.lo.overflowing_add(rhs.lo);
-        let hi = self.hi.wrapping_add(carry as u128).wrapping_add(rhs.hi);
+        impl ops::Shl<u32> for $ty {
+            type Output = Self;
 
-        Self { lo, hi }
-    }
-}
+            fn shl(mut self, rhs: u32) -> Self::Output {
+                debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
+
+                // With debug assertions off, an oversized shift pushes every bit
+                // out instead of wrapping the shift amount.
+                if rhs >= Self::BITS {
+                    return Self::ZERO;
+                }
 
-impl ops::Shr<u32> for u256 {
-    type Output = Self;
+                // `s` is the shift within one limb; the `half_bits` bit of `rhs`
+                // selects a shift by a whole limb on top of that.
+                let half_bits = Self::BITS / 2;
+                let low_mask = half_bits - 1;
+                let s = rhs & low_mask;
 
-    fn shr(mut self, rhs: u32) -> Self::Output {
-        debug_assert!(rhs < Self::BITS, "attempted to shift right with overflow");
-        if rhs >= Self::BITS {
-            return Self::ZERO;
-        }
+                let lo = self.lo;
+                let hi = self.hi;
 
-        if rhs == 0 {
-            return self;
-        }
+                self.lo = lo << s;
 
-        if rhs < 128 {
-            self.lo >>= rhs;
-            self.lo |= self.hi << (128 - rhs);
-        } else {
-            self.lo = self.hi >> (rhs - 128);
+                if rhs & half_bits == 0 {
+                    // Bits leaving `lo` enter `hi`. The shift by `half_bits - s` is
+                    // split in two so that `s == 0` never shifts by a full limb.
+                    self.hi = (lo >> (low_mask ^ s) >> 1) as _;
+                    self.hi |= hi << s;
+                } else {
+                    // Shift of at least one limb: the already shifted `lo` becomes `hi`.
+                    self.hi = self.lo as _;
+                    self.lo = 0;
+                }
+                self
+            }
         }
 
-        if rhs < 128 {
-            self.hi >>= rhs;
-        } else {
-            self.hi = 0;
-        }
+        impl ops::Shr<u32> for $ty {
+            type Output = Self;
 
-        self
-    }
+            fn shr(mut self, rhs: u32) -> Self::Output {
+                debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
+
+                // With debug assertions off, an oversized shift pushes every bit
+                // out instead of wrapping the shift amount, leaving only the sign
+                // fill (always zero for the unsigned type).
+                #[allow(unused_comparisons)]
+                if rhs >= Self::BITS {
+                    let fill = if self.hi < 0 { !0 } else { 0 };
+                    return Self { lo: fill as _, hi: fill };
+                }
+
+                // `s` is the shift within one limb; the `half_bits` bit of `rhs`
+                // selects a shift by a whole limb on top of that.
+                let half_bits = Self::BITS / 2;
+                let low_mask = half_bits - 1;
+                let s = rhs & low_mask;
+
+                let lo = self.lo;
+                let hi = self.hi;
+
+                // For the signed type `hi` is an `i128`, so this shift keeps the sign.
+                self.hi = hi >> s;
+
+                #[allow(unused_comparisons)]
+                if rhs & half_bits == 0 {
+                    // Bits leaving `hi` enter `lo`. The shift by `half_bits - s` is
+                    // split in two so that `s == 0` never shifts by a full limb.
+                    self.lo = (hi << (low_mask ^ s) << 1) as _;
+                    self.lo |= lo >> s;
+                } else {
+                    // Shift of at least one limb: the already shifted `hi` becomes
+                    // `lo`, and `hi` is filled with copies of the sign bit.
+                    self.lo = self.hi as _;
+                    self.hi = if hi < 0 { !0 } else { 0 };
+                }
+                self
+            }
+        }
+    };
 }
 
+impl_common!(i256);
+impl_common!(u256);
+
 impl HInt for u128 {
     type D = u256;
 
@@ -200,7 +259,7 @@ impl HInt for u128 {
     }
 
     fn widen_hi(self) -> Self::D {
-        self.widen() << <Self as MinInt>::BITS
+        u256 { lo: 0, hi: self }
     }
 }
 
@@ -208,11 +267,10 @@ impl HInt for i128 {
     type D = i256;
 
     fn widen(self) -> Self::D {
-        let mut ret = self.unsigned().zero_widen().signed();
-        if self.is_negative() {
-            ret.hi = u128::MAX;
+        i256 {
+            lo: self as u128,
+            hi: if self < 0 { -1 } else { 0 },
         }
-        ret
     }
 
     fn zero_widen(self) -> Self::D {
@@ -228,7 +286,7 @@ impl HInt for i128 {
     }
 
     fn widen_hi(self) -> Self::D {
-        self.widen() << <Self as MinInt>::BITS
+        i256 { lo: 0, hi: self }
     }
 }
 
@@ -252,6 +310,6 @@ impl DInt for i256 {
     }
 
     fn hi(self) -> Self::H {
-        self.hi as i128
+        self.hi
     }
 }
diff --git a/libm/src/math/support/big/tests.rs b/libm/src/math/support/big/tests.rs
index d2010f021..d54706c72 100644
--- a/libm/src/math/support/big/tests.rs
+++ b/libm/src/math/support/big/tests.rs
@@ -36,7 +36,7 @@ fn widen_i128() {
         (LOHI_SPLIT as i128).widen(),
         i256 {
             lo: LOHI_SPLIT,
-            hi: u128::MAX
+            hi: -1,
         }
     );
     assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
@@ -275,3 +275,64 @@ fn shr_u256_overflow() {
     assert_eq!(u256::MAX >> 257, u256::ZERO);
     assert_eq!(u256::MAX >> u32::MAX, u256::ZERO);
 }
+
+#[test]
+fn u256_ord() {
+    let _1 = u256::ONE;
+    let _2 = _1 + _1;
+    for x in u8::MIN..u8::MAX {
+        let y = x + 1;
+        let wx = (x as u128).widen_hi();
+        let wy = (y as u128).widen_hi();
+        assert!([wx, wx + _1, wx + _2, wy, wy + _1, wy + _2].is_sorted());
+    }
+}
+#[test]
+fn i256_ord() {
+    let _1 = i256::ONE;
+    let _2 = _1 + _1;
+    for x in i8::MIN..i8::MAX {
+        let y = x + 1;
+        let wx = (x as i128).widen_hi();
+        let wy = (y as i128).widen_hi();
+        assert!([wx, wx + _1, wx + _2, wy - _2, wy - _1, wy].is_sorted());
+    }
+}
+
+#[test]
+fn u256_shifts() {
+    let _1 = u256::ONE;
+    for k in 0..255 {
+        let x = _1 << k;
+        let x2 = _1 << (k + 1);
+        assert!(x < x2);
+        assert_eq!(x << 1, x2);
+        assert_eq!(x + x, x2);
+        assert_eq!(x >> k, _1);
+        assert_eq!(x2 >> (k + 1), _1);
+    }
+}
+#[test]
+fn i256_shifts() {
+    let _1 = i256::ONE;
+    for k in 0..254 {
+        let x = _1 << k;
+        let x2 = _1 << (k + 1);
+        assert!(x < x2);
+        assert_eq!(x << 1, x2);
+        assert_eq!(x + x, x2);
+        assert_eq!(x >> k, _1);
+        assert_eq!(x2 >> (k + 1), _1);
+    }
+
+    let min = _1 << 255;
+    assert_eq!(min, i256::MIN);
+    let mut x = min;
+    for k in 0..255 {
+        assert_eq!(x, min >> k);
+        let y = x >> 1;
+        assert_eq!(y + y, x);
+        assert!(x < y);
+        x = y;
+    }
+}
diff --git a/libm/src/math/support/float_traits.rs b/libm/src/math/support/float_traits.rs
index 4c866ef10..fb790e696 100644
--- a/libm/src/math/support/float_traits.rs
+++ b/libm/src/math/support/float_traits.rs
@@ -6,6 +6,7 @@ use super::int_traits::{CastFrom, Int, MinInt};
 
 /// Trait for some basic operations on floats
 // #[allow(dead_code)]
+#[allow(dead_code)] // Some constants are only used with tests
 pub trait Float:
     Copy
     + fmt::Debug
@@ -189,6 +190,15 @@ pub trait Float:
             Self::ONE.copysign(self)
         }
     }
+
+    /// Make a best-effort attempt to canonicalize the number. Note that this is allowed
+    /// to be a nop and does not always quiet sNaNs.
+    fn canonicalize(self) -> Self {
+        // FIXME: LLVM often removes this. We should determine whether we can remove the operation,
+        // or switch to something based on `llvm.canonicalize` (which has crashes,
+        // <https://github.com/llvm/llvm-project/issues/32650>).
+        self * Self::ONE
+    }
 }
 
 /// Access the associated `Int` type from a float (helper to avoid ambiguous associated types).
@@ -353,6 +363,7 @@ pub const fn f32_from_bits(bits: u32) -> f32 {
 }
 
 /// `f32::to_bits`
+#[allow(dead_code)] // workaround for false positive RUST-144060
 #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
 pub const fn f32_to_bits(x: f32) -> u32 {
     // SAFETY: POD cast with no preconditions
@@ -367,6 +378,7 @@ pub const fn f64_from_bits(bits: u64) -> f64 {
 }
 
 /// `f64::to_bits`
+#[allow(dead_code)] // workaround for false positive RUST-144060
 #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
 pub const fn f64_to_bits(x: f64) -> u64 {
     // SAFETY: POD cast with no preconditions
diff --git a/libm/src/math/support/hex_float.rs b/libm/src/math/support/hex_float.rs
index 85569d98a..c8558b900 100644
--- a/libm/src/math/support/hex_float.rs
+++ b/libm/src/math/support/hex_float.rs
@@ -1,8 +1,6 @@
 //! Utilities for working with hex float formats.
 
-use core::fmt;
-
-use super::{Float, Round, Status, f32_from_bits, f64_from_bits};
+use super::{Round, Status, f32_from_bits, f64_from_bits};
 
 /// Construct a 16-bit float from hex float representation (C-style)
 #[cfg(f16_enabled)]
@@ -352,133 +350,143 @@ const fn u128_ilog2(v: u128) -> u32 {
     u128::BITS - 1 - v.leading_zeros()
 }
 
-/// Format a floating point number as its IEEE hex (`%a`) representation.
-pub struct Hexf<F>(pub F);
+#[cfg(any(test, feature = "unstable-public-internals"))]
+mod hex_fmt {
+    use core::fmt;
 
-// Adapted from https://github.com/ericseppanen/hexfloat2/blob/a5c27932f0ff/src/format.rs
-#[cfg(not(feature = "compiler-builtins"))]
-fn fmt_any_hex<F: Float>(x: &F, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-    if x.is_sign_negative() {
-        write!(f, "-")?;
-    }
+    use crate::support::Float;
 
-    if x.is_nan() {
-        return write!(f, "NaN");
-    } else if x.is_infinite() {
-        return write!(f, "inf");
-    } else if *x == F::ZERO {
-        return write!(f, "0x0p+0");
-    }
+    /// Format a floating point number as its IEEE hex (`%a`) representation.
+    pub struct Hexf<F>(pub F);
 
-    let mut exponent = x.exp_unbiased();
-    let sig = x.to_bits() & F::SIG_MASK;
-
-    let bias = F::EXP_BIAS as i32;
-    // The mantissa MSB needs to be shifted up to the nearest nibble.
-    let mshift = (4 - (F::SIG_BITS % 4)) % 4;
-    let sig = sig << mshift;
-    // The width is rounded up to the nearest char (4 bits)
-    let mwidth = (F::SIG_BITS as usize + 3) / 4;
-    let leading = if exponent == -bias {
-        // subnormal number means we shift our output by 1 bit.
-        exponent += 1;
-        "0."
-    } else {
-        "1."
-    };
+    // Adapted from https://github.com/ericseppanen/hexfloat2/blob/a5c27932f0ff/src/format.rs
+    #[cfg(not(feature = "compiler-builtins"))]
+    pub(super) fn fmt_any_hex<F: Float>(x: &F, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if x.is_sign_negative() {
+            write!(f, "-")?;
+        }
 
-    write!(f, "0x{leading}{sig:0mwidth$x}p{exponent:+}")
-}
+        if x.is_nan() {
+            return write!(f, "NaN");
+        } else if x.is_infinite() {
+            return write!(f, "inf");
+        } else if *x == F::ZERO {
+            return write!(f, "0x0p+0");
+        }
 
-#[cfg(feature = "compiler-builtins")]
-fn fmt_any_hex<F: Float>(_x: &F, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
-    unimplemented!()
-}
+        let mut exponent = x.exp_unbiased();
+        let sig = x.to_bits() & F::SIG_MASK;
+
+        let bias = F::EXP_BIAS as i32;
+        // The mantissa MSB needs to be shifted up to the nearest nibble.
+        let mshift = (4 - (F::SIG_BITS % 4)) % 4;
+        let sig = sig << mshift;
+        // The width is rounded up to the nearest char (4 bits)
+        let mwidth = (F::SIG_BITS as usize + 3) / 4;
+        let leading = if exponent == -bias {
+            // subnormal number means we shift our output by 1 bit.
+            exponent += 1;
+            "0."
+        } else {
+            "1."
+        };
 
-impl<F: Float> fmt::LowerHex for Hexf<F> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        cfg_if! {
-            if #[cfg(feature = "compiler-builtins")] {
-                let _ = f;
-                unimplemented!()
-            } else {
-                fmt_any_hex(&self.0, f)
+        write!(f, "0x{leading}{sig:0mwidth$x}p{exponent:+}")
+    }
+
+    #[cfg(feature = "compiler-builtins")]
+    pub(super) fn fmt_any_hex<F: Float>(_x: &F, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        unimplemented!()
+    }
+
+    impl<F: Float> fmt::LowerHex for Hexf<F> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            cfg_if! {
+                if #[cfg(feature = "compiler-builtins")] {
+                    let _ = f;
+                    unimplemented!()
+                } else {
+                    fmt_any_hex(&self.0, f)
+                }
             }
         }
     }
-}
 
-impl<F: Float> fmt::LowerHex for Hexf<(F, F)> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        cfg_if! {
-            if #[cfg(feature = "compiler-builtins")] {
-                let _ = f;
-                unimplemented!()
-            } else {
-                write!(f, "({:x}, {:x})", Hexf(self.0.0), Hexf(self.0.1))
+    impl<F: Float> fmt::LowerHex for Hexf<(F, F)> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            cfg_if! {
+                if #[cfg(feature = "compiler-builtins")] {
+                    let _ = f;
+                    unimplemented!()
+                } else {
+                    write!(f, "({:x}, {:x})", Hexf(self.0.0), Hexf(self.0.1))
+                }
             }
         }
     }
-}
 
-impl<F: Float> fmt::LowerHex for Hexf<(F, i32)> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        cfg_if! {
-            if #[cfg(feature = "compiler-builtins")] {
-                let _ = f;
-                unimplemented!()
-            } else {
-                write!(f, "({:x}, {:x})", Hexf(self.0.0), Hexf(self.0.1))
+    impl<F: Float> fmt::LowerHex for Hexf<(F, i32)> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            cfg_if! {
+                if #[cfg(feature = "compiler-builtins")] {
+                    let _ = f;
+                    unimplemented!()
+                } else {
+                    write!(f, "({:x}, {:x})", Hexf(self.0.0), Hexf(self.0.1))
+                }
             }
         }
     }
-}
 
-impl fmt::LowerHex for Hexf<i32> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        cfg_if! {
-            if #[cfg(feature = "compiler-builtins")] {
-                let _ = f;
-                unimplemented!()
-            } else {
-                fmt::LowerHex::fmt(&self.0, f)
+    impl fmt::LowerHex for Hexf<i32> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            cfg_if! {
+                if #[cfg(feature = "compiler-builtins")] {
+                    let _ = f;
+                    unimplemented!()
+                } else {
+                    fmt::LowerHex::fmt(&self.0, f)
+                }
             }
         }
     }
-}
 
-impl<T> fmt::Debug for Hexf<T>
-where
-    Hexf<T>: fmt::LowerHex,
-{
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        cfg_if! {
-            if #[cfg(feature = "compiler-builtins")] {
-                let _ = f;
-                unimplemented!()
-            } else {
-                fmt::LowerHex::fmt(self, f)
+    impl<T> fmt::Debug for Hexf<T>
+    where
+        Hexf<T>: fmt::LowerHex,
+    {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            cfg_if! {
+                if #[cfg(feature = "compiler-builtins")] {
+                    let _ = f;
+                    unimplemented!()
+                } else {
+                    fmt::LowerHex::fmt(self, f)
+                }
             }
         }
     }
-}
 
-impl<T> fmt::Display for Hexf<T>
-where
-    Hexf<T>: fmt::LowerHex,
-{
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        cfg_if! {
-            if #[cfg(feature = "compiler-builtins")] {
-                let _ = f;
-                unimplemented!()
-            } else {
-                fmt::LowerHex::fmt(self, f)
+    impl<T> fmt::Display for Hexf<T>
+    where
+        Hexf<T>: fmt::LowerHex,
+    {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            cfg_if! {
+                if #[cfg(feature = "compiler-builtins")] {
+                    let _ = f;
+                    unimplemented!()
+                } else {
+                    fmt::LowerHex::fmt(self, f)
+                }
             }
         }
     }
 }
 
+#[cfg(any(test, feature = "unstable-public-internals"))]
+pub use hex_fmt::*;
+
 #[cfg(test)]
 mod parse_tests {
     extern crate std;
@@ -1064,6 +1072,7 @@ mod print_tests {
     use std::string::ToString;
 
     use super::*;
+    use crate::support::Float;
 
     #[test]
     #[cfg(f16_enabled)]
diff --git a/libm/src/math/support/int_traits.rs b/libm/src/math/support/int_traits.rs
index 3ec1faba1..9d8826dfe 100644
--- a/libm/src/math/support/int_traits.rs
+++ b/libm/src/math/support/int_traits.rs
@@ -1,6 +1,7 @@
 use core::{cmp, fmt, ops};
 
 /// Minimal integer implementations needed on all integer types, including wide integers.
+#[allow(dead_code)] // Some constants are only used with tests
 pub trait MinInt:
     Copy
     + fmt::Debug
@@ -36,8 +37,6 @@ pub trait Int:
     + fmt::Display
     + fmt::Binary
     + fmt::LowerHex
-    + PartialEq
-    + PartialOrd
     + ops::AddAssign
     + ops::SubAssign
     + ops::MulAssign
@@ -78,6 +77,7 @@ pub trait Int:
     fn unsigned(self) -> Self::Unsigned;
     fn from_unsigned(unsigned: Self::Unsigned) -> Self;
     fn abs(self) -> Self;
+    fn unsigned_abs(self) -> Self::Unsigned;
 
     fn from_bool(b: bool) -> Self;
 
@@ -100,7 +100,10 @@ pub trait Int:
     fn rotate_left(self, other: u32) -> Self;
     fn overflowing_add(self, other: Self) -> (Self, bool);
     fn overflowing_sub(self, other: Self) -> (Self, bool);
+    fn carrying_add(self, other: Self, carry: bool) -> (Self, bool);
+    fn borrowing_sub(self, other: Self, borrow: bool) -> (Self, bool);
     fn leading_zeros(self) -> u32;
+    fn trailing_zeros(self) -> u32;
     fn ilog2(self) -> u32;
 }
 
@@ -166,12 +169,30 @@ macro_rules! int_impl_common {
             <Self>::leading_zeros(self)
         }
 
+        fn trailing_zeros(self) -> u32 {
+            <Self>::trailing_zeros(self)
+        }
+
         fn ilog2(self) -> u32 {
             // On our older MSRV, this resolves to the trait method. Which won't actually work,
             // but this is only called behind other gates.
             #[allow(clippy::incompatible_msrv)]
             <Self>::ilog2(self)
         }
+
+        fn carrying_add(self, other: Self, carry: bool) -> (Self, bool) {
+            let (ab, of1) = self.overflowing_add(other);
+            let (abc, of2) = ab.overflowing_add(Self::from_bool(carry));
+            // `of1 && of2` is possible with signed integers if a negative sum
+            // overflows to `MAX` and adding the carry overflows again back to `MIN`
+            (abc, of1 ^ of2)
+        }
+
+        fn borrowing_sub(self, other: Self, borrow: bool) -> (Self, bool) {
+            let (ab, of1) = self.overflowing_sub(other);
+            let (abc, of2) = ab.overflowing_sub(Self::from_bool(borrow));
+            (abc, of1 ^ of2)
+        }
     };
 }
 
@@ -203,6 +224,10 @@ macro_rules! int_impl {
                 unimplemented!()
             }
 
+            fn unsigned_abs(self) -> Self {
+                unimplemented!()
+            }
+
             // It makes writing macros easier if this is implemented for both signed and unsigned
             #[allow(clippy::wrong_self_convention)]
             fn from_unsigned(me: $uty) -> Self {
@@ -242,6 +267,10 @@ macro_rules! int_impl {
                 self.abs()
             }
 
+            fn unsigned_abs(self) -> Self::Unsigned {
+                self.unsigned_abs()
+            }
+
             fn from_unsigned(me: $uty) -> Self {
                 me as $ity
             }
@@ -365,14 +394,19 @@ impl_h_int!(
 /// Trait to express (possibly lossy) casting of integers
 pub trait CastInto<T: Copy>: Copy {
     /// By default, casts should be exact.
+    #[track_caller]
     fn cast(self) -> T;
 
     /// Call for casts that are expected to truncate.
+    ///
+    /// In practice, this behaves the same as `cast`, except that `cast` may panic in debug
+    /// mode; the main purpose of this method is to document intent in code.
     fn cast_lossy(self) -> T;
 }
 
 pub trait CastFrom<T: Copy>: Copy {
     /// By default, casts should be exact.
+    #[track_caller]
     fn cast_from(value: T) -> Self;
 
     /// Call for casts that are expected to truncate.
diff --git a/libm/src/math/support/macros.rs b/libm/src/math/support/macros.rs
index 0b72db0e4..550d2e92e 100644
--- a/libm/src/math/support/macros.rs
+++ b/libm/src/math/support/macros.rs
@@ -137,16 +137,18 @@ macro_rules! hf128 {
 #[cfg(test)]
 macro_rules! assert_biteq {
     ($left:expr, $right:expr, $($tt:tt)*) => {{
-        use $crate::support::Int;
         let l = $left;
         let r = $right;
-        let bits = Int::leading_zeros(l.to_bits() - l.to_bits()); // hack to get the width from the value
+        // hack to get width from a value
+        let bits = $crate::support::Int::leading_zeros(l.to_bits() - l.to_bits());
         assert!(
-            l.biteq(r),
-            "{}\nl: {l:?} ({lb:#0width$x})\nr: {r:?} ({rb:#0width$x})",
+            $crate::support::Float::biteq(l, r),
+            "{}\nl: {l:?} ({lb:#0width$x} {lh})\nr: {r:?} ({rb:#0width$x} {rh})",
             format_args!($($tt)*),
             lb = l.to_bits(),
+            lh = $crate::support::Hexf(l),
             rb = r.to_bits(),
+            rh = $crate::support::Hexf(r),
             width = ((bits / 4) + 2) as usize,
 
         );
diff --git a/libm/src/math/support/mod.rs b/libm/src/math/support/mod.rs
index a4f596ab8..b2d7bd8d5 100644
--- a/libm/src/math/support/mod.rs
+++ b/libm/src/math/support/mod.rs
@@ -11,10 +11,15 @@ mod int_traits;
 
 #[allow(unused_imports)]
 pub use big::{i256, u256};
+// Clippy seems to have a false positive
+#[allow(unused_imports, clippy::single_component_path_imports)]
+pub(crate) use cfg_if;
 pub use env::{FpResult, Round, Status};
 #[allow(unused_imports)]
 pub use float_traits::{DFloat, Float, HFloat, IntTy};
 pub(crate) use float_traits::{f32_from_bits, f64_from_bits};
+#[cfg(any(test, feature = "unstable-public-internals"))]
+pub use hex_float::Hexf;
 #[cfg(f16_enabled)]
 #[allow(unused_imports)]
 pub use hex_float::hf16;
@@ -22,7 +27,7 @@ pub use hex_float::hf16;
 #[allow(unused_imports)]
 pub use hex_float::hf128;
 #[allow(unused_imports)]
-pub use hex_float::{Hexf, hf32, hf64};
+pub use hex_float::{hf32, hf64};
 pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
 
 /// Hint to the compiler that the current path is cold.
diff --git a/libm/src/math/tan.rs b/libm/src/math/tan.rs
index a072bdec5..79c1bad56 100644
--- a/libm/src/math/tan.rs
+++ b/libm/src/math/tan.rs
@@ -43,7 +43,7 @@ use super::{k_tan, rem_pio2};
 /// The tangent of `x` (f64).
 ///
 /// `x` is specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn tan(x: f64) -> f64 {
     let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
 
diff --git a/libm/src/math/tanf.rs b/libm/src/math/tanf.rs
index 8bcf9581f..a615573d8 100644
--- a/libm/src/math/tanf.rs
+++ b/libm/src/math/tanf.rs
@@ -27,7 +27,7 @@ const T4_PIO2: f64 = 4. * FRAC_PI_2; /* 0x401921FB, 0x54442D18 */
 /// The tangent of `x` (f32).
 ///
 /// `x` is specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn tanf(x: f32) -> f32 {
     let x64 = x as f64;
 
diff --git a/libm/src/math/tanh.rs b/libm/src/math/tanh.rs
index cc0abe4fc..c99cc2a70 100644
--- a/libm/src/math/tanh.rs
+++ b/libm/src/math/tanh.rs
@@ -8,7 +8,7 @@ use super::expm1;
 /// The hyperbolic tangent of `x` (f64).
 ///
 /// `x` is specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn tanh(mut x: f64) -> f64 {
     let mut uf: f64 = x;
     let mut ui: u64 = f64::to_bits(uf);
diff --git a/libm/src/math/tanhf.rs b/libm/src/math/tanhf.rs
index fffbba6c6..3cbd5917f 100644
--- a/libm/src/math/tanhf.rs
+++ b/libm/src/math/tanhf.rs
@@ -3,7 +3,7 @@ use super::expm1f;
 /// The hyperbolic tangent of `x` (f32).
 ///
 /// `x` is specified in radians.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn tanhf(mut x: f32) -> f32 {
     /* x = |x| */
     let mut ix = x.to_bits();
diff --git a/libm/src/math/tgamma.rs b/libm/src/math/tgamma.rs
index 305986064..41415d9d1 100644
--- a/libm/src/math/tgamma.rs
+++ b/libm/src/math/tgamma.rs
@@ -131,7 +131,7 @@ fn s(x: f64) -> f64 {
 }
 
 /// The [Gamma function](https://en.wikipedia.org/wiki/Gamma_function) (f64).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn tgamma(mut x: f64) -> f64 {
     let u: u64 = x.to_bits();
     let absx: f64;
diff --git a/libm/src/math/tgammaf.rs b/libm/src/math/tgammaf.rs
index fe178f7a3..a63a2a318 100644
--- a/libm/src/math/tgammaf.rs
+++ b/libm/src/math/tgammaf.rs
@@ -1,7 +1,7 @@
 use super::tgamma;
 
 /// The [Gamma function](https://en.wikipedia.org/wiki/Gamma_function) (f32).
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn tgammaf(x: f32) -> f32 {
     tgamma(x as f64) as f32
 }
diff --git a/libm/src/math/trunc.rs b/libm/src/math/trunc.rs
index fa50d55e1..20d52a111 100644
--- a/libm/src/math/trunc.rs
+++ b/libm/src/math/trunc.rs
@@ -2,7 +2,7 @@
 ///
 /// This effectively removes the decimal part of the number, leaving the integral part.
 #[cfg(f16_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn truncf16(x: f16) -> f16 {
     super::generic::trunc(x)
 }
@@ -10,7 +10,7 @@ pub fn truncf16(x: f16) -> f16 {
 /// Rounds the number toward 0 to the closest integral value (f32).
 ///
 /// This effectively removes the decimal part of the number, leaving the integral part.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn truncf(x: f32) -> f32 {
     select_implementation! {
         name: truncf,
@@ -24,7 +24,7 @@ pub fn truncf(x: f32) -> f32 {
 /// Rounds the number toward 0 to the closest integral value (f64).
 ///
 /// This effectively removes the decimal part of the number, leaving the integral part.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn trunc(x: f64) -> f64 {
     select_implementation! {
         name: trunc,
@@ -39,7 +39,7 @@ pub fn trunc(x: f64) -> f64 {
 ///
 /// This effectively removes the decimal part of the number, leaving the integral part.
 #[cfg(f128_enabled)]
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+#[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn truncf128(x: f128) -> f128 {
     super::generic::trunc(x)
 }
diff --git a/libm/src/math/truncf.rs b/libm/src/math/truncf.rs
deleted file mode 100644
index 14533a267..000000000
--- a/libm/src/math/truncf.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-/// Rounds the number toward 0 to the closest integral value (f32).
-///
-/// This effectively removes the decimal part of the number, leaving the integral part.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn truncf(x: f32) -> f32 {
-    select_implementation! {
-        name: truncf,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
-        args: x,
-    }
-
-    super::generic::trunc(x)
-}
-
-// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
-#[cfg(not(target_arch = "powerpc64"))]
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn sanity_check() {
-        assert_eq!(super::truncf(1.1), 1.0);
-    }
-}
diff --git a/libm/src/math/truncf128.rs b/libm/src/math/truncf128.rs
deleted file mode 100644
index 9dccc0d0e..000000000
--- a/libm/src/math/truncf128.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-/// Rounds the number toward 0 to the closest integral value (f128).
-///
-/// This effectively removes the decimal part of the number, leaving the integral part.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn truncf128(x: f128) -> f128 {
-    super::generic::trunc(x)
-}
diff --git a/libm/src/math/truncf16.rs b/libm/src/math/truncf16.rs
deleted file mode 100644
index d7c3d225c..000000000
--- a/libm/src/math/truncf16.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-/// Rounds the number toward 0 to the closest integral value (f16).
-///
-/// This effectively removes the decimal part of the number, leaving the integral part.
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn truncf16(x: f16) -> f16 {
-    super::generic::trunc(x)
-}
diff --git a/rust-version b/rust-version
new file mode 100644
index 000000000..3928504c8
--- /dev/null
+++ b/rust-version
@@ -0,0 +1 @@
+ffb9d94dcf4ade0d534842be3672d5e9f47e1333
diff --git a/thumbv6m-linux-eabi.json b/thumbv6m-linux-eabi.json
deleted file mode 100644
index ac736eae6..000000000
--- a/thumbv6m-linux-eabi.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-    "abi-blacklist": [
-        "stdcall",
-        "fastcall",
-        "vectorcall",
-        "win64",
-        "sysv64"
-    ],
-    "arch": "arm",
-    "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
-    "env": "",
-    "executables": true,
-    "features": "+strict-align",
-    "linker": "arm-none-eabi-gcc",
-    "linker-flavor": "gcc",
-    "llvm-target": "thumbv6m-none-eabi",
-    "max-atomic-width": 0,
-    "os": "linux",
-    "panic-strategy": "abort",
-    "pre-link-args": {
-        "gcc": ["-nostartfiles"]
-    },
-    "relocation-model": "static",
-    "target-endian": "little",
-    "target-pointer-width": "32",
-    "target-c-int-width": "32",
-    "vendor": ""
-}
diff --git a/thumbv7em-linux-eabi.json b/thumbv7em-linux-eabi.json
deleted file mode 100644
index b6d4a6bda..000000000
--- a/thumbv7em-linux-eabi.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-    "abi-blacklist": [
-        "stdcall",
-        "fastcall",
-        "vectorcall",
-        "win64",
-        "sysv64"
-    ],
-    "arch": "arm",
-    "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
-    "env": "",
-    "executables": true,
-    "linker": "arm-none-eabi-gcc",
-    "linker-flavor": "gcc",
-    "llvm-target": "thumbv7em-none-eabi",
-    "max-atomic-width": 32,
-    "os": "linux",
-    "panic-strategy": "abort",
-    "pre-link-args": {
-        "gcc": ["-nostartfiles"]
-    },
-    "relocation-model": "static",
-    "target-endian": "little",
-    "target-pointer-width": "32",
-    "target-c-int-width": "32",
-    "vendor": ""
-}
diff --git a/thumbv7em-linux-eabihf.json b/thumbv7em-linux-eabihf.json
deleted file mode 100644
index 81cfcd48d..000000000
--- a/thumbv7em-linux-eabihf.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-    "abi-blacklist": [
-        "stdcall",
-        "fastcall",
-        "vectorcall",
-        "win64",
-        "sysv64"
-    ],
-    "arch": "arm",
-    "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
-    "env": "",
-    "executables": true,
-    "features": "+vfp4,+d16,+fp-only-sp",
-    "linker": "arm-none-eabi-gcc",
-    "linker-flavor": "gcc",
-    "llvm-target": "thumbv7em-none-eabihf",
-    "max-atomic-width": 32,
-    "os": "linux",
-    "panic-strategy": "abort",
-    "pre-link-args": {
-        "gcc": ["-nostartfiles"]
-    },
-    "relocation-model": "static",
-    "target-endian": "little",
-    "target-pointer-width": "32",
-    "target-c-int-width": "32",
-    "vendor": ""
-}
diff --git a/thumbv7m-linux-eabi.json b/thumbv7m-linux-eabi.json
deleted file mode 100644
index abe037c5b..000000000
--- a/thumbv7m-linux-eabi.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-    "abi-blacklist": [
-        "stdcall",
-        "fastcall",
-        "vectorcall",
-        "win64",
-        "sysv64"
-    ],
-    "arch": "arm",
-    "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
-    "env": "",
-    "executables": true,
-    "linker": "arm-none-eabi-gcc",
-    "linker-flavor": "gcc",
-    "llvm-target": "thumbv7m-none-eabi",
-    "max-atomic-width": 32,
-    "os": "linux",
-    "panic-strategy": "abort",
-    "pre-link-args": {
-        "gcc": ["-nostartfiles"]
-    },
-    "relocation-model": "static",
-    "target-endian": "little",
-    "target-pointer-width": "32",
-    "target-c-int-width": "32",
-    "vendor": ""
-}
diff --git a/triagebot.toml b/triagebot.toml
new file mode 100644
index 000000000..eba5cdd88
--- /dev/null
+++ b/triagebot.toml
@@ -0,0 +1,21 @@
+## See <https://forge.rust-lang.org/triagebot/index.html> for documentation
+## of these features.
+
+# Warns when a PR contains merge commits
+# Documentation at: https://forge.rust-lang.org/triagebot/no-merge.html
+[no-merges]
+exclude_titles = ["Rustc pull update"]
+
+# Canonicalize issue numbers to avoid closing the wrong issue
+# when commits are included in subtrees, as well as warning about links in commits.
+# Documentation at: https://forge.rust-lang.org/triagebot/issue-links.html
+[issue-links]
+check-commits = false
+
+# Prevents mentions in commits to avoid users being spammed
+# Documentation at: https://forge.rust-lang.org/triagebot/no-mentions.html
+[no-mentions]
+
+# Enable issue transfers within the org
+# Documentation at: https://forge.rust-lang.org/triagebot/transfer.html
+[transfer]