From f2002d48a36b8c2ef57e164cac6441236ac3e78d Mon Sep 17 00:00:00 2001 From: Taiki Endo Date: Fri, 6 Oct 2023 00:45:25 +0900 Subject: [PATCH] aarch64: Use .arch_extension directive instead of #[target_feature] Support outline-atomics on pre-1.61 rustc. https://developer.arm.com/documentation/100067/0612/armclang-Integrated-Assembler/AArch32-Target-selection-directives?lang=en --- README.md | 2 +- build.rs | 13 --------- src/imp/atomic128/aarch64.rs | 52 ++++++++++++++++++++++++++++++++---- src/lib.rs | 11 +------- 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index dbc347e1..774ed64f 100644 --- a/README.md +++ b/README.md @@ -170,7 +170,7 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ... If dynamic dispatching by run-time CPU feature detection is enabled, it allows maintaining support for older CPUs while using features that are not supported on older CPUs, such as CMPXCHG16B (x86_64) and FEAT_LSE (aarch64). Note: - - Dynamic detection is currently only enabled in Rust 1.61+ for aarch64, in Rust 1.59+ (AVX) or 1.69+ (CMPXCHG16B) for x86_64, nightly only for powerpc64 (disabled by default), otherwise it works the same as when this cfg is set. + - Dynamic detection is currently only enabled in Rust 1.59+ for aarch64, in Rust 1.59+ (AVX) or 1.69+ (CMPXCHG16B) for x86_64, nightly only for powerpc64 (disabled by default), otherwise it works the same as when this cfg is set. - If the required target features are enabled at compile-time, the atomic operations are inlined. - This is compatible with no-std (as with all features except `std`). - On some targets, run-time detection is disabled by default mainly for compatibility with older versions of operating systems or incomplete build environments, and can be enabled by `--cfg portable_atomic_outline_atomics`. (When both cfg are enabled, `*_no_*` cfg is preferred.) 
diff --git a/build.rs b/build.rs index 69cb36b1..a9f9f007 100644 --- a/build.rs +++ b/build.rs @@ -192,19 +192,6 @@ fn main() { target_feature_if("cmpxchg16b", has_cmpxchg16b, &version, Some(69), true); } "aarch64" => { - // aarch64_target_feature stabilized in Rust 1.61 (nightly-2022-03-16): https://github.com/rust-lang/rust/pull/90621 - if !version.probe(61, 2022, 3, 15) { - if version.nightly && is_allowed_feature("aarch64_target_feature") { - // The part of this feature we use has not been changed since 1.27 - // (https://github.com/rust-lang/rust/commit/1217d70465edb2079880347fea4baaac56895f51) - // until it was stabilized in nightly-2022-03-16, so it can be safely enabled in - // nightly, which is older than nightly-2022-03-16. - println!("cargo:rustc-cfg=portable_atomic_unstable_aarch64_target_feature"); - } else { - // On aarch64, when aarch64_target_feature is not available, outline-atomics is also not available. - println!("cargo:rustc-cfg=portable_atomic_no_outline_atomics"); - } - } // For Miri and ThreadSanitizer. // https://github.com/rust-lang/rust/pull/97423 merged in Rust 1.64 (nightly-2022-06-30). if version.nightly && version.probe(64, 2022, 6, 29) { diff --git a/src/imp/atomic128/aarch64.rs b/src/imp/atomic128/aarch64.rs index b1881742..c6dfe83f 100644 --- a/src/imp/atomic128/aarch64.rs +++ b/src/imp/atomic128/aarch64.rs @@ -155,6 +155,45 @@ macro_rules! debug_assert_lse { }; } +// Refs: https://developer.arm.com/documentation/100067/0612/armclang-Integrated-Assembler/AArch32-Target-selection-directives?lang=en +// +// This is similar to #[target_feature(enable = "lse")], except that there are +// no compiler guarantees regarding (un)inlining, and the scope is within an asm +// block rather than a function. We use this directive to support outline-atomics +// on pre-1.61 rustc (aarch64_target_feature stabilized in Rust 1.61). 
+// +// The .arch_extension directive is effective until the end of the assembly block and +// is not propagated to subsequent code, so the end_lse macro is unneeded. +// https://godbolt.org/z/4oMEW8vWc +// https://github.com/torvalds/linux/commit/e0d5896bd356cd577f9710a02d7a474cdf58426b +// https://github.com/torvalds/linux/commit/dd1f6308b28edf0452dd5dc7877992903ec61e69 +// (It seems GCC effectively ignores this directive and always allows FEAT_LSE instructions: https://godbolt.org/z/W9W6rensG) +// +// The .arch directive has a similar effect, but we don't use it due to the following issue: +// https://github.com/torvalds/linux/commit/dd1f6308b28edf0452dd5dc7877992903ec61e69 +// +// Note: If FEAT_LSE is not available at compile-time, we must guarantee that +// the function that uses it is not inlined into a function where it is not +// clear whether FEAT_LSE is available. Otherwise, (even if we checked whether +// FEAT_LSE is available at run-time) optimizations that reorder its +// instructions across the if condition might introduce undefined behavior. +// (see also https://rust-lang.github.io/rfcs/2045-target-feature.html#safely-inlining-target_feature-functions-on-more-contexts) +// However, our code uses the ifunc helper macro that works with function pointers, +// so we don't have to worry about this unless calling without the helper macro. +#[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] +#[cfg(not(portable_atomic_no_outline_atomics))] +macro_rules! start_lse { + () => { + ".arch_extension lse" + }; +} +#[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] +macro_rules! start_lse { + () => { + "" + }; +} + #[cfg(target_endian = "little")] macro_rules! select_le_or_be { ($le:expr, $be:expr) => { @@ -289,6 +328,7 @@ unsafe fn _atomic_load_casp(src: *mut u128, order: Ordering) -> u128 { macro_rules! 
atomic_load { ($acquire:tt, $release:tt) => { asm!( + start_lse!(), concat!("casp", $acquire, $release, " x2, x3, x2, x3, [{src}]"), src = in(reg) ptr_reg!(src), // must be allocated to even/odd register pair @@ -551,7 +591,9 @@ unsafe fn atomic_compare_exchange( #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] let prev = { fn_alias! { - #[target_feature(enable = "lse")] + // inline(never) is just a hint and also not strictly necessary + // because we use ifunc helper macro, but used for clarity. + #[inline(never)] unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128; atomic_compare_exchange_casp_relaxed = _atomic_compare_exchange_casp(Ordering::Relaxed, Ordering::Relaxed); @@ -660,10 +702,6 @@ unsafe fn atomic_compare_exchange( portable_atomic_target_feature = "lse", not(portable_atomic_no_outline_atomics), ))] -#[cfg_attr( - not(any(target_feature = "lse", portable_atomic_target_feature = "lse")), - target_feature(enable = "lse") -)] #[inline] unsafe fn _atomic_compare_exchange_casp( dst: *mut u128, @@ -690,6 +728,7 @@ unsafe fn _atomic_compare_exchange_casp( macro_rules! cmpxchg { ($acquire:tt, $release:tt, $fence:tt) => { asm!( + start_lse!(), concat!("casp", $acquire, $release, " x6, x7, x4, x5, [{dst}]"), $fence, dst = in(reg) ptr_reg!(dst), @@ -848,6 +887,7 @@ unsafe fn _atomic_swap_casp(dst: *mut u128, val: u128, order: Ordering) -> u128 macro_rules! swap { ($acquire:tt, $release:tt, $fence:tt) => { asm!( + start_lse!(), // If FEAT_LSE2 is not supported, this works like byte-wise atomic. // This is not single-copy atomic reads, but this is ok because subsequent // CAS will check for consistency. @@ -1014,6 +1054,7 @@ macro_rules! atomic_rmw_cas_3 { macro_rules! op { ($acquire:tt, $release:tt, $fence:tt) => { asm!( + start_lse!(), // If FEAT_LSE2 is not supported, this works like byte-wise atomic. // This is not single-copy atomic reads, but this is ok because subsequent // CAS will check for consistency. 
@@ -1140,6 +1181,7 @@ macro_rules! atomic_rmw_cas_2 { macro_rules! op { ($acquire:tt, $release:tt, $fence:tt) => { asm!( + start_lse!(), // If FEAT_LSE2 is not supported, this works like byte-wise atomic. // This is not single-copy atomic reads, but this is ok because subsequent // CAS will check for consistency. diff --git a/src/lib.rs b/src/lib.rs index d221c5af..95af37b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -164,7 +164,7 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ... If dynamic dispatching by run-time CPU feature detection is enabled, it allows maintaining support for older CPUs while using features that are not supported on older CPUs, such as CMPXCHG16B (x86_64) and FEAT_LSE (aarch64). Note: - - Dynamic detection is currently only enabled in Rust 1.61+ for aarch64, in Rust 1.59+ (AVX) or 1.69+ (CMPXCHG16B) for x86_64, nightly only for powerpc64 (disabled by default), otherwise it works the same as when this cfg is set. + - Dynamic detection is currently only enabled in Rust 1.59+ for aarch64, in Rust 1.59+ (AVX) or 1.69+ (CMPXCHG16B) for x86_64, nightly only for powerpc64 (disabled by default), otherwise it works the same as when this cfg is set. - If the required target features are enabled at compile-time, the atomic operations are inlined. - This is compatible with no-std (as with all features except `std`). - On some targets, run-time detection is disabled by default mainly for compatibility with older versions of operating systems or incomplete build environments, and can be enabled by `--cfg portable_atomic_outline_atomics`. (When both cfg are enabled, `*_no_*` cfg is preferred.) @@ -258,20 +258,11 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ... // These features are already stabilized or have already been removed from compilers, // and can safely be enabled for old nightly as long as version detection works. 
// - cfg(target_has_atomic) -// - #[target_feature(enable = "lse")] on AArch64 // - #[target_feature(enable = "cmpxchg16b")] on x86_64 // - asm! on ARM, AArch64, RISC-V, x86_64 // - llvm_asm! on AVR (tier 3) and MSP430 (tier 3) // - #[instruction_set] on non-Linux/Android pre-v6 ARM (tier 3) #![cfg_attr(portable_atomic_unstable_cfg_target_has_atomic, feature(cfg_target_has_atomic))] -#![cfg_attr( - all( - target_arch = "aarch64", - portable_atomic_unstable_aarch64_target_feature, - not(portable_atomic_no_outline_atomics), - ), - feature(aarch64_target_feature) -)] #![cfg_attr( all( target_arch = "x86_64",