diff --git a/.github/.cspell/project-dictionary.txt b/.github/.cspell/project-dictionary.txt index 5d458ad8..8c0ed8fb 100644 --- a/.github/.cspell/project-dictionary.txt +++ b/.github/.cspell/project-dictionary.txt @@ -26,6 +26,7 @@ cmovge cmovl cmpd cmpld +cmpw cmpxchg cpsid cpsie @@ -108,6 +109,7 @@ movq mpidr mspdebug mstatus +mstatush mvfr negs neoverse @@ -123,6 +125,7 @@ pointee prctl prefetcher PRIMASK +pstq quadword RAII rcpc diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 1ceb2668..c45d2406 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -18,6 +18,7 @@ portable-atomic/ │ ├── imp/ │ │ ├── atomic128/ -- 128-bit atomic implementations on 64-bit architectures (mainly by asm) │ │ ├── atomic64/ -- 64-bit atomic implementations on 32-bit architectures (mainly by asm) +│ │ ├── avr.rs -- atomic implementation for AVR (by asm) │ │ ├── core_atomic.rs -- wrapper for core::sync::atomic types │ │ ├── detect/ -- Run-time CPU feature detection implementations used for outline-atomics │ │ ├── fallback/ -- fallback implementation based on global locks diff --git a/build.rs b/build.rs index 4b5c2fea..0861dd97 100644 --- a/build.rs +++ b/build.rs @@ -377,15 +377,14 @@ fn main() { if !version.probe(83, 2024, 9, 27) || needs_target_feature_fallback(&version, None) { let target_endian = env::var("CARGO_CFG_TARGET_ENDIAN").expect("CARGO_CFG_TARGET_ENDIAN not set"); - // powerpc64le is pwr8+ by default https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 + // powerpc64le is pwr8 by default https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 // See also https://github.com/rust-lang/rust/issues/59932 - let mut has_pwr8_features = target_endian == "little"; - // https://github.com/llvm/llvm-project/commit/549e118e93c666914a1045fde38a2cac33e1e445 + let mut pwr8_features = target_endian == "little"; if let Some(cpu) = &target_cpu() { if let Some(mut cpu_version) = strip_prefix(cpu, "pwr") { cpu_version = strip_suffix(cpu_version, "x").unwrap_or(cpu_version); // for pwr5x and pwr6x if let Ok(cpu_version) = cpu_version.parse::<u32>() { - has_pwr8_features = cpu_version >= 8; + pwr8_features = cpu_version >= 8; } } else { // https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 @@ -393,11 +392,12 @@ fn main() { // On the minimum external LLVM version of the oldest rustc version which we can use asm_experimental_arch // on this target (see CI config for more), "future" is based on pwr10 features. // https://github.com/llvm/llvm-project/blob/llvmorg-12.0.0/llvm/lib/Target/PowerPC/PPC.td#L370 - has_pwr8_features = cpu == "ppc64le" || cpu == "future"; + pwr8_features = cpu == "future" || cpu == "ppc64le"; } } + // power8 features: https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L409 // lqarx and stqcx.
- target_feature_fallback("quadword-atomics", has_pwr8_features); + target_feature_fallback("quadword-atomics", pwr8_features); } } "s390x" => { @@ -421,12 +421,14 @@ fn main() { } // As of rustc 1.80, target_feature "fast-serialization"/"load-store-on-cond"/"distinct-ops"/"miscellaneous-extensions-3" is not available on rustc side: // https://github.com/rust-lang/rust/blob/1.80.0/compiler/rustc_target/src/target_features.rs + // arch9 features: https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/SystemZ/SystemZFeatures.td#L103 // bcr 14,0 target_feature_fallback("fast-serialization", arch9_features); // {l,st}oc{,g}{,r} target_feature_fallback("load-store-on-cond", arch9_features); // {al,sl,n,o,x}{,g}rk target_feature_fallback("distinct-ops", arch9_features); + // arch13 features: https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/SystemZ/SystemZFeatures.td#L301 // nand (nnr{,g}k), select (sel{,g}r), etc. target_feature_fallback("miscellaneous-extensions-3", arch13_features); } diff --git a/src/imp/atomic128/powerpc64.rs b/src/imp/atomic128/powerpc64.rs index b87d1011..fff46999 100644 --- a/src/imp/atomic128/powerpc64.rs +++ b/src/imp/atomic128/powerpc64.rs @@ -3,13 +3,15 @@ /* 128-bit atomic implementation on PowerPC64. -powerpc64 on pwr8+ support 128-bit atomics (load/store/LL/SC): -https://github.com/llvm/llvm-project/commit/549e118e93c666914a1045fde38a2cac33e1e445 -https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/test/CodeGen/PowerPC/atomics-i128-ldst.ll -https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/test/CodeGen/PowerPC/atomics-i128.ll +This architecture provides the following 128-bit atomic instructions: -powerpc64le is pwr8+ by default https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 -See also https://github.com/rust-lang/rust/issues/59932 +- lq/stq: load/store (ISA 2.07 or later, included in the Linux Compliancy subset and AIX Compliancy subset) +- lqarx/stqcx.: LL/SC (ISA 2.07 or later, included in the Linux Compliancy subset and AIX Compliancy subset) +- plq/pstq: load/store (ISA 3.1 or later, included in the Linux Compliancy subset and AIX Compliancy subset) + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#powerpc Note that we do not separate LL and SC into separate functions, but handle them within a single asm block. This is because it is theoretically possible @@ -20,13 +22,16 @@ Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't this module and use intrinsics.rs instead. 
Refs: -- Power ISA https://openpowerfoundation.org/specifications/isa -- AIX Assembler language reference https://www.ibm.com/docs/en/aix/7.3?topic=aix-assembler-language-reference -- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit +- Power ISA + https://openpowerfoundation.org/specifications/isa +- AIX Assembler language reference + https://www.ibm.com/docs/en/aix/7.3?topic=aix-assembler-language-reference +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit Generated asm: -- powerpc64 (pwr8) https://godbolt.org/z/KPfsKd81K -- powerpc64le https://godbolt.org/z/5dcbKqdne +- powerpc64 (pwr8) https://godbolt.org/z/TjKsPbWc6 +- powerpc64le https://godbolt.org/z/5WqPGhb3Y */ include!("macros.rs"); @@ -91,9 +96,10 @@ macro_rules! debug_assert_pwr8 { // This is similar to #[target_feature(enable = "quadword-atomics")], except that there are // no compiler guarantees regarding (un)inlining, and the scope is within an asm // block rather than a function. We use this directive because #[target_feature(enable = "quadword-atomics")] -// is not supported as of Rust 1.70-nightly. +// is unstable and unavailable on old nightly and incompatible with rustc_codegen_cranelift: +// https://github.com/rust-lang/rustc_codegen_cranelift/issues/1400#issuecomment-1774599775 // -// start_pwr8 and end_pwr8 must be used in pairs. +// Note: start_pwr8 and end_pwr8 must be used in pairs. // // Note: If power8 instructions are not available at compile-time, we must guarantee that // the function that uses it is not inlined into a function where it is not @@ -118,19 +124,42 @@ macro_rules! atomic_rmw { ($op:ident, $order:ident) => { match $order { Ordering::Relaxed => $op!("", ""), - Ordering::Acquire => $op!("lwsync", ""), + Ordering::Acquire => $op!("isync", ""), Ordering::Release => $op!("", "lwsync"), - Ordering::AcqRel => $op!("lwsync", "lwsync"), - Ordering::SeqCst => $op!("lwsync", "sync"), + Ordering::AcqRel => $op!("isync", "lwsync"), + Ordering::SeqCst => $op!("isync", "sync"), _ => unreachable!(), } }; } +macro_rules! atomic_cas { + ($op:ident, $success:ident, $failure:ident) => { + if $failure == Ordering::Relaxed { + match $success { + Ordering::Relaxed => $op!("", "", ""), + Ordering::Acquire => $op!("", "isync", ""), + Ordering::Release => $op!("", "", "lwsync"), + Ordering::AcqRel => $op!("", "isync", "lwsync"), + Ordering::SeqCst => $op!("", "isync", "sync"), + _ => unreachable!(), + } + } else { + let order = crate::utils::upgrade_success_ordering($success, $failure); + match order { + // Relaxed and Release are covered in $failure == Relaxed branch. + Ordering::Acquire => $op!("isync", "", ""), + Ordering::AcqRel => $op!("isync", "", "lwsync"), + Ordering::SeqCst => $op!("isync", "", "sync"), + _ => unreachable!(), + } + } + }; +} // Extracts and checks the EQ bit of cr0. -#[inline(always)] -fn extract_cr0(r: u64) -> bool { - r & 0x20000000 != 0 +#[inline] +fn test_cr0_eq(cr: u64) -> bool { + cr & 0x20000000 != 0 } // If quadword-atomics is available at compile-time, we can always use pwr8_fn. @@ -194,31 +223,29 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { unsafe fn atomic_load_pwr8(src: *mut u128, order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); debug_assert_pwr8!(); + let (out_hi, out_lo); // SAFETY: the caller must uphold the safety contract. // // Refs: Section 3.3.4 "Fixed Point Load and Store Quadword Instructions" of Power ISA 3.1C Book I unsafe { - let (out_hi, out_lo); macro_rules! 
atomic_load_acquire { ($release:tt) => { asm!( start_pwr8!(), $release, - "lq %r4, 0({src})", - // Lightweight acquire sync - // Refs: https://github.com/boostorg/atomic/blob/boost-1.79.0/include/boost/atomic/detail/core_arch_ops_gcc_ppc.hpp#L47-L62 - "cmpd %cr7, %r4, %r4", - "bne- %cr7, 2f", - "2:", - "isync", + "lq %r4, 0({src})", // atomic { r4:r5 = *src } + "cmpw %r4, %r4", // if r4 == r4 { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne- %cr0, 2f", // if unlikely(cr0.EQ == 0) { jump 'never } + "2:", // 'never: + "isync", // fence (works in combination with a branch that depends on the loaded value) end_pwr8!(), src = in(reg_nonzero) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. // We cannot use r1 (sp) and r2 (system reserved), so start with r4 or grater. out("r4") out_hi, out("r5") out_lo, - out("cr7") _, + out("cr0") _, options(nostack, preserves_flags), ) }; @@ -227,7 +254,7 @@ unsafe fn atomic_load_pwr8(src: *mut u128, order: Ordering) -> u128 { Ordering::Relaxed => { asm!( start_pwr8!(), - "lq %r4, 0({src})", + "lq %r4, 0({src})", // atomic { r4:r5 = *src } end_pwr8!(), src = in(reg_nonzero) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -306,18 +333,18 @@ unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { unsafe fn atomic_store_pwr8(dst: *mut u128, val: u128, order: Ordering) { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let val = U128 { whole: val }; // SAFETY: the caller must uphold the safety contract. // // Refs: Section 3.3.4 "Fixed Point Load and Store Quadword Instructions" of Power ISA 3.1C Book I unsafe { - let val = U128 { whole: val }; macro_rules! atomic_store { ($release:tt) => { asm!( start_pwr8!(), - $release, - "stq %r4, 0({dst})", + $release, // fence + "stq %r4, 0({dst})", // atomic { *dst = r4:r5 } end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -345,21 +372,84 @@ unsafe fn atomic_compare_exchange( success: Ordering, failure: Ordering, ) -> Result { - let success = crate::utils::upgrade_success_ordering(success, failure); - #[cfg(any( target_feature = "quadword-atomics", portable_atomic_target_feature = "quadword-atomics", ))] // SAFETY: the caller must uphold the safety contract. // cfg guarantees that quadword atomics instructions are available at compile-time. - let (prev, ok) = unsafe { atomic_compare_exchange_pwr8(dst, old, new, success) }; + let (prev, ok) = unsafe { atomic_compare_exchange_pwr8(dst, old, new, success, failure) }; #[cfg(not(any( target_feature = "quadword-atomics", portable_atomic_target_feature = "quadword-atomics", )))] // SAFETY: the caller must uphold the safety contract. - let (prev, ok) = unsafe { atomic_compare_exchange_ifunc(dst, old, new, success) }; + let (prev, ok) = { + fn_alias! { + // inline(never) is just a hint and also not strictly necessary + // because we use ifunc helper macro, but used for clarity. 
+ #[inline(never)] + unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool); + pwr8_relaxed_fn = atomic_compare_exchange_pwr8(Ordering::Relaxed, Ordering::Relaxed); + pwr8_acquire_fn = atomic_compare_exchange_pwr8(Ordering::Acquire, Ordering::Acquire); + pwr8_release_fn = atomic_compare_exchange_pwr8(Ordering::Release, Ordering::Relaxed); + pwr8_acqrel_fn = atomic_compare_exchange_pwr8(Ordering::AcqRel, Ordering::Acquire); + pwr8_seqcst_fn = atomic_compare_exchange_pwr8(Ordering::SeqCst, Ordering::SeqCst); + } + // SAFETY: the caller must uphold the safety contract. + // we only call pwr8_fn if quadword-atomics is available. + unsafe { + let success = crate::utils::upgrade_success_ordering(success, failure); + match success { + Ordering::Relaxed => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_relaxed_fn + } else { + fallback::atomic_compare_exchange_non_seqcst + } + }) + } + Ordering::Acquire => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_acquire_fn + } else { + fallback::atomic_compare_exchange_non_seqcst + } + }) + } + Ordering::Release => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_release_fn + } else { + fallback::atomic_compare_exchange_non_seqcst + } + }) + } + Ordering::AcqRel => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_acqrel_fn + } else { + fallback::atomic_compare_exchange_non_seqcst + } + }) + } + Ordering::SeqCst => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_seqcst_fn + } else { + fallback::atomic_compare_exchange_seqcst + } + }) + } + _ => unreachable!(), + } + } + }; if ok { Ok(prev) } else { @@ -371,36 +461,37 @@ unsafe fn atomic_compare_exchange_pwr8( dst: *mut u128, old: u128, new: u128, - order: Ordering, + success: Ordering, + failure: Ordering, ) -> (u128, bool) { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let old = U128 { whole: old }; + let new = U128 { whole: new }; + let (mut prev_hi, mut prev_lo); + let mut r; // SAFETY: the caller must uphold the safety contract. // // Refs: Section 4.6.2.2 "128-bit Load And Reserve and Store Conditional Instructions" of Power ISA 3.1C Book II unsafe { - let old = U128 { whole: old }; - let new = U128 { whole: new }; - let (mut prev_hi, mut prev_lo); - let mut r; macro_rules! cmpxchg { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "2:", - "lqarx %r8, 0, {dst}", - "xor {tmp_lo}, %r9, {old_lo}", - "xor {tmp_hi}, %r8, {old_hi}", - "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", - "bne %cr0, 3f", // jump if compare failed - "stqcx. %r6, 0, {dst}", - "bne %cr0, 2b", // continue loop if store failed - "3:", - // if compare failed EQ bit is cleared, if stqcx succeeds EQ bit is set. - "mfcr {tmp_lo}", - $acquire, + $release, // fence + "2:", // 'retry: + "lqarx %r8, 0, {dst}", // atomic { RESERVE = (dst, 16); r8:r9 = *dst } + "xor {tmp_lo}, %r9, {old_lo}", // tmp_lo = r9 ^ old_lo + "xor {tmp_hi}, %r8, {old_hi}", // tmp_hi = r8 ^ old_hi + "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", // tmp_lo |= tmp_hi; if tmp_lo == 0 { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } + "stqcx.
%r6, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r6:r7; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire_success, // fence + "3:", // 'cmp-fail: + $acquire_always, // fence + "mfcr {tmp_lo}", // tmp_lo = zero_extend(cr) end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), old_hi = in(reg) old.pair.hi, @@ -418,8 +509,9 @@ unsafe fn atomic_compare_exchange_pwr8( ) }; } - atomic_rmw!(cmpxchg, order); - (U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole, extract_cr0(r)) + atomic_cas!(cmpxchg, success, failure); + // if compare failed EQ bit is cleared, if store succeeds EQ bit is set. + (U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole, test_cr0_eq(r)) } } @@ -441,11 +533,9 @@ unsafe fn atomic_compare_exchange_weak( success: Ordering, failure: Ordering, ) -> Result { - let success = crate::utils::upgrade_success_ordering(success, failure); - // SAFETY: the caller must uphold the safety contract. // cfg guarantees that quadword atomics instructions are available at compile-time. - let (prev, ok) = unsafe { atomic_compare_exchange_weak_pwr8(dst, old, new, success) }; + let (prev, ok) = unsafe { atomic_compare_exchange_weak_pwr8(dst, old, new, success, failure) }; if ok { Ok(prev) } else { @@ -461,34 +551,35 @@ unsafe fn atomic_compare_exchange_weak_pwr8( dst: *mut u128, old: u128, new: u128, - order: Ordering, + success: Ordering, + failure: Ordering, ) -> (u128, bool) { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let old = U128 { whole: old }; + let new = U128 { whole: new }; + let (mut prev_hi, mut prev_lo); + let mut r; // SAFETY: the caller must uphold the safety contract. // // Refs: Section 4.6.2.2 "128-bit Load And Reserve and Store Conditional Instructions" of Power ISA 3.1C Book II unsafe { - let old = U128 { whole: old }; - let new = U128 { whole: new }; - let (mut prev_hi, mut prev_lo); - let mut r; macro_rules! cmpxchg_weak { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "lqarx %r8, 0, {dst}", - "xor {tmp_lo}, %r9, {old_lo}", - "xor {tmp_hi}, %r8, {old_hi}", - "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", - "bne %cr0, 3f", // jump if compare failed - "stqcx. %r6, 0, {dst}", - "3:", - // if compare or stqcx failed EQ bit is cleared, if stqcx succeeds EQ bit is set. - "mfcr {tmp_lo}", - $acquire, + $release, // fence + "lqarx %r8, 0, {dst}", // atomic { RESERVE = (dst, 16); r8:r9 = *dst } + "xor {tmp_lo}, %r9, {old_lo}", // tmp_lo = r9 ^ old_lo + "xor {tmp_hi}, %r8, {old_hi}", // tmp_hi = r8 ^ old_hi + "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", // tmp_lo |= tmp_hi; if tmp_lo == 0 { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } + "stqcx. %r6, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r6:r7; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + $acquire_success, // fence + "3:", // 'cmp-fail: + $acquire_always, // fence + "mfcr {tmp_lo}", // tmp_lo = zero_extend(cr) end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), old_hi = in(reg) old.pair.hi, @@ -506,8 +597,9 @@ unsafe fn atomic_compare_exchange_weak_pwr8( ) }; } - atomic_rmw!(cmpxchg_weak, order); - (U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole, extract_cr0(r)) + atomic_cas!(cmpxchg_weak, success, failure); + // if compare or store failed EQ bit is cleared, if store succeeds EQ bit is set. 
+ (U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole, test_cr0_eq(r)) } } @@ -516,21 +608,21 @@ unsafe fn atomic_compare_exchange_weak_pwr8( unsafe fn atomic_swap_pwr8(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let val = U128 { whole: val }; + let (mut prev_hi, mut prev_lo); // SAFETY: the caller must uphold the safety contract. unsafe { - let val = U128 { whole: val }; - let (mut prev_hi, mut prev_lo); macro_rules! swap { ($acquire:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "2:", - "lqarx %r6, 0, {dst}", - "stqcx. %r8, 0, {dst}", - "bne %cr0, 2b", - $acquire, + $release, // fence + "2:", // 'retry: + "lqarx %r6, 0, {dst}", // atomic { RESERVE = (dst, 16); r6:r7 = *dst } + "stqcx. %r8, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r8:r9; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire, // fence end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -562,21 +654,22 @@ macro_rules! atomic_rmw_ll_sc_3 { unsafe fn $name(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let val = U128 { whole: val }; + let (mut prev_hi, mut prev_lo); + // SAFETY: the caller must uphold the safety contract. unsafe { - let val = U128 { whole: val }; - let (mut prev_hi, mut prev_lo); macro_rules! op { ($acquire:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "2:", - "lqarx %r6, 0, {dst}", + $release, // fence + "2:", // 'retry: + "lqarx %r6, 0, {dst}", // atomic { RESERVE = (dst, 16); r6:r7 = *dst } $($op)* - "stqcx. %r8, 0, {dst}", - "bne %cr0, 2b", - $acquire, + "stqcx. %r8, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r8:r9; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire, // fence end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), val_hi = in(reg) val.pair.hi, @@ -611,20 +704,21 @@ macro_rules! atomic_rmw_ll_sc_2 { unsafe fn $name(dst: *mut u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let (mut prev_hi, mut prev_lo); + // SAFETY: the caller must uphold the safety contract. unsafe { - let (mut prev_hi, mut prev_lo); macro_rules! op { ($acquire:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "2:", - "lqarx %r6, 0, {dst}", + $release, // fence + "2:", // 'retry: + "lqarx %r6, 0, {dst}", // atomic { RESERVE = (dst, 16); r6:r7 = *dst } $($op)* - "stqcx. %r8, 0, {dst}", - "bne %cr0, 2b", - $acquire, + "stqcx. %r8, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r8:r9; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire, // fence end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), $($reg)* @@ -648,71 +742,71 @@ macro_rules! atomic_rmw_ll_sc_2 { atomic_rmw_ll_sc_3! { atomic_add_pwr8, [out("xer") _,], - "addc %r9, {val_lo}, %r7", - "adde %r8, {val_hi}, %r6", + "addc %r9, {val_lo}, %r7", // r9 = val_lo + r7; xer.CA = carry + "adde %r8, {val_hi}, %r6", // r8 = val_hi + r6 + xer.CA } atomic_rmw_ll_sc_3! { atomic_sub_pwr8, [out("xer") _,], - "subc %r9, %r7, {val_lo}", - "subfe %r8, {val_hi}, %r6", + "subc %r9, %r7, {val_lo}", // r9 = val_lo - r7; xer.CA = borrow + "subfe %r8, {val_hi}, %r6", // r8 = val_hi - r6 - xer.CA } atomic_rmw_ll_sc_3! 
{ atomic_and_pwr8, [], - "and %r9, {val_lo}, %r7", - "and %r8, {val_hi}, %r6", + "and %r9, {val_lo}, %r7", // r9 = val_lo & r7 + "and %r8, {val_hi}, %r6", // r8 = val_hi & r6 } atomic_rmw_ll_sc_3! { atomic_nand_pwr8, [], - "nand %r9, {val_lo}, %r7", - "nand %r8, {val_hi}, %r6", + "nand %r9, {val_lo}, %r7", // r9 = !(val_lo & r7) + "nand %r8, {val_hi}, %r6", // r8 = !(val_hi & r6) } atomic_rmw_ll_sc_3! { atomic_or_pwr8, [], - "or %r9, {val_lo}, %r7", - "or %r8, {val_hi}, %r6", + "or %r9, {val_lo}, %r7", // r9 = val_lo | r7 + "or %r8, {val_hi}, %r6", // r8 = val_hi | r6 } atomic_rmw_ll_sc_3! { atomic_xor_pwr8, [], - "xor %r9, {val_lo}, %r7", - "xor %r8, {val_hi}, %r6", + "xor %r9, {val_lo}, %r7", // r9 = val_lo ^ r7 + "xor %r8, {val_hi}, %r6", // r8 = val_hi ^ r6 } atomic_rmw_ll_sc_3! { atomic_max_pwr8, [out("cr1") _,], - "cmpld %r7, {val_lo}", // (unsigned) compare lo 64-bit, store result to cr0 - "iselgt %r9, %r7, {val_lo}", // select lo 64-bit based on GT bit in cr0 - "cmpd %cr1, %r6, {val_hi}", // (signed) compare hi 64-bit, store result to cr1 - "isel %r8, %r7, {val_lo}, 5", // select lo 64-bit based on GT bit in cr1 - "cmpld %r6, {val_hi}", // (unsigned) compare hi 64-bit, store result to cr0 - "iseleq %r9, %r9, %r8", // select lo 64-bit based on EQ bit in cr0 - "isel %r8, %r6, {val_hi}, 5", // select hi 64-bit based on GT bit in cr1 + "cmpld %r7, {val_lo}", // if r7(u) < val_lo(u) { cr0 = { LT: 1, ..0 } } else if r7(u) > val_lo(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iselgt %r9, %r7, {val_lo}", // if cr0.GT == 1 { r9 = r7 } else { r9 = val_lo } + "cmpd %cr1, %r6, {val_hi}", // if r6(i) < val_hi(i) { cr1 = { LT: 1, ..0 } } else if r6(i) > val_hi(i) { cr1 = { GT: 1, ..0 } } else { cr1 = { EQ: 1, ..0 } } + "isel %r8, %r7, {val_lo}, 5", // if cr1.GT == 1 { r8 = r7 } else { r8 = val_lo } + "cmpld %r6, {val_hi}", // if r6(u) < val_hi(u) { cr0 = { LT: 1, ..0 } } else if r6(u) > val_hi(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iseleq %r9, %r9, %r8", // if cr0.EQ == 1 { r9 = r9 } else { r9 = r8 } + "isel %r8, %r6, {val_hi}, 5", // if cr1.GT == 1 { r8 = r6 } else { r8 = val_hi } } atomic_rmw_ll_sc_3! { atomic_umax_pwr8, [], - "cmpld %r7, {val_lo}", // compare lo 64-bit, store result to cr0 - "iselgt %r9, %r7, {val_lo}", // select lo 64-bit based on GT bit in cr0 - "cmpld %r6, {val_hi}", // compare hi 64-bit, store result to cr0 - "iselgt %r8, %r7, {val_lo}", // select lo 64-bit based on GT bit in cr0 - "iseleq %r9, %r9, %r8", // select lo 64-bit based on EQ bit in cr0 - "iselgt %r8, %r6, {val_hi}", // select hi 64-bit based on GT bit in cr0 + "cmpld %r7, {val_lo}", // if r7(u) < val_lo(u) { cr0 = { LT: 1, ..0 } } else if r7(u) > val_lo(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iselgt %r9, %r7, {val_lo}", // if cr0.GT == 1 { r9 = r7 } else { r9 = val_lo } + "cmpld %r6, {val_hi}", // if r6(u) < val_hi(u) { cr0 = { LT: 1, ..0 } } else if r6(u) > val_hi(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iselgt %r8, %r7, {val_lo}", // if cr0.GT == 1 { r8 = r7 } else { r8 = val_lo } + "iseleq %r9, %r9, %r8", // if cr0.EQ == 1 { r9 = r9 } else { r9 = r8 } + "iselgt %r8, %r6, {val_hi}", // if cr0.GT == 1 { r8 = r6 } else { r8 = val_hi } } atomic_rmw_ll_sc_3! 
{ atomic_min_pwr8, [out("cr1") _,], - "cmpld %r7, {val_lo}", // (unsigned) compare lo 64-bit, store result to cr0 - "isellt %r9, %r7, {val_lo}", // select lo 64-bit based on LT bit in cr0 - "cmpd %cr1, %r6, {val_hi}", // (signed) compare hi 64-bit, store result to cr1 - "isel %r8, %r7, {val_lo}, 4", // select lo 64-bit based on LT bit in cr1 - "cmpld %r6, {val_hi}", // (unsigned) compare hi 64-bit, store result to cr0 - "iseleq %r9, %r9, %r8", // select lo 64-bit based on EQ bit in cr0 - "isel %r8, %r6, {val_hi}, 4", // select hi 64-bit based on LT bit in cr1 + "cmpld %r7, {val_lo}", // if r7(u) < val_lo(u) { cr0 = { LT: 1, ..0 } } else if r7(u) > val_lo(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "isellt %r9, %r7, {val_lo}", // if cr0.LT == 1 { r9 = r7 } else { r9 = val_lo } + "cmpd %cr1, %r6, {val_hi}", // if r6(i) < val_hi(i) { cr1 = { LT: 1, ..0 } } else if r6(i) > val_hi(i) { cr1 = { GT: 1, ..0 } } else { cr1 = { EQ: 1, ..0 } } + "isel %r8, %r7, {val_lo}, 4", // if cr1.LT == 1 { r8 = r7 } else { r8 = val_lo } + "cmpld %r6, {val_hi}", // if r6(u) < val_hi(u) { cr0 = { LT: 1, ..0 } } else if r6(u) > val_hi(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iseleq %r9, %r9, %r8", // if cr0.EQ == 1 { r9 = r9 } else { r9 = r8 } + "isel %r8, %r6, {val_hi}, 4", // if cr1.LT == 1 { r8 = r6 } else { r8 = val_hi } } atomic_rmw_ll_sc_3! { atomic_umin_pwr8, [], - "cmpld %r7, {val_lo}", // compare lo 64-bit, store result to cr0 - "isellt %r9, %r7, {val_lo}", // select lo 64-bit based on LT bit in cr0 - "cmpld %r6, {val_hi}", // compare hi 64-bit, store result to cr0 - "isellt %r8, %r7, {val_lo}", // select lo 64-bit based on LT bit in cr0 - "iseleq %r9, %r9, %r8", // select lo 64-bit based on EQ bit in cr0 - "isellt %r8, %r6, {val_hi}", // select hi 64-bit based on LT bit in cr0 + "cmpld %r7, {val_lo}", // if r7(u) < val_lo(u) { cr0 = { LT: 1, ..0 } } else if r7(u) > val_lo(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "isellt %r9, %r7, {val_lo}", // if cr0.LT == 1 { r9 = r7 } else { r9 = val_lo } + "cmpld %r6, {val_hi}", // if r6(u) < val_hi(u) { cr0 = { LT: 1, ..0 } } else if r6(u) > val_hi(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "isellt %r8, %r7, {val_lo}", // if cr0.LT == 1 { r8 = r7 } else { r8 = val_lo } + "iseleq %r9, %r9, %r8", // if cr0.EQ == 1 { r9 = r9 } else { r9 = r8 } + "isellt %r8, %r6, {val_hi}", // if cr0.LT == 1 { r8 = r6 } else { r8 = val_hi } } #[inline] @@ -724,15 +818,15 @@ unsafe fn atomic_not_pwr8(dst: *mut u128, order: Ordering) -> u128 { #[cfg(not(portable_atomic_pre_llvm_16))] atomic_rmw_ll_sc_2! { atomic_neg_pwr8, [out("xer") _,], - "subfic %r9, %r7, 0", - "subfze %r8, %r6", + "subfic %r9, %r7, 0", // r9 = 0 - r7; xer.CA = borrow + "subfze %r8, %r6", // r8 = 0 - r6 - xer.CA } // LLVM 15 miscompiles subfic. #[cfg(portable_atomic_pre_llvm_16)] atomic_rmw_ll_sc_2! { atomic_neg_pwr8, [zero = in(reg) 0_u64, out("xer") _,], - "subc %r9, {zero}, %r7", - "subfze %r8, %r6", + "subc %r9, {zero}, %r7", // r9 = 0 - r7; xer.CA = borrow + "subfze %r8, %r6", // r8 = 0 - r6 - xer.CA } macro_rules! select_atomic_rmw { @@ -822,16 +916,6 @@ macro_rules! select_atomic_rmw { }; } -#[cfg(not(any( - target_feature = "quadword-atomics", - portable_atomic_target_feature = "quadword-atomics", -)))] -select_atomic_rmw! 
{ - unsafe fn atomic_compare_exchange_ifunc(dst: *mut u128, old: u128, new: u128) -> (u128, bool); - pwr8 = atomic_compare_exchange_pwr8; - non_seqcst_fallback = atomic_compare_exchange_non_seqcst; - seqcst_fallback = atomic_compare_exchange_seqcst; -} select_atomic_rmw! { unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128; pwr8 = atomic_swap_pwr8; diff --git a/src/imp/atomic128/riscv64.rs b/src/imp/atomic128/riscv64.rs index 74fda982..cc33818b 100644 --- a/src/imp/atomic128/riscv64.rs +++ b/src/imp/atomic128/riscv64.rs @@ -1,10 +1,18 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT /* -128-bit atomic implementation on riscv64 using amocas.q (DWCAS). +128-bit atomic implementation on riscv64. + +This architecture provides the following 128-bit atomic instructions: + +- amocas.q: CAS (Zacas extension) + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#risc-v Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use -this module and use intrinsics.rs instead. +this module and use fallback implementation instead. Refs: - RISC-V Instruction Set Manual @@ -167,18 +175,18 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { unsafe fn atomic_load_zacas(src: *mut u128, order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); debug_assert_zacas!(); + let (out_lo, out_hi); // SAFETY: the caller must uphold the safety contract. unsafe { - let (out_lo, out_hi); // LLVM doesn't support `.option arch, +zabha` directive as of LLVM 19 because it is experimental. // So, we currently always using .4byte directive. // macro_rules! load { // ($fence:tt, $asm_order:tt) => { // asm!( // start_zacas!(), - // $fence, - // concat!("amocas.q", $asm_order, " a2, a2, 0({src})"), + // $fence, // fence + // concat!("amocas.q", $asm_order, " a2, a2, 0({src})"), // atomic { if *dst == a2:a3 { *dst = a2:a3 } else { a2:a3 = *dst } } // end_zacas!(), // src = in(reg) ptr_reg!(src), // inout("a2") 0_u64 => out_lo, @@ -191,8 +199,8 @@ unsafe fn atomic_load_zacas(src: *mut u128, order: Ordering) -> u128 { macro_rules! load { ($fence:tt, $insn_order:tt) => { asm!( - $fence, - // 4: 2{8,c,a,e}c5462f amocas.q{,.aq,.rl,.aqrl} a2, a2, (a0) + $fence, // fence + // amocas.q{,.aq,.rl,.aqrl} a2, a2, (a0) // atomic { if *a0 == a2:a3 { *a0 = a2:a3 } else { a2:a3 = *a0 } } concat!(".4byte 0x2", $insn_order, "c5462f"), in("a0") ptr_reg!(src), inout("a2") 0_u64 => out_lo, @@ -316,20 +324,20 @@ unsafe fn atomic_compare_exchange_zacas( debug_assert!(dst as usize % 16 == 0); debug_assert_zacas!(); let order = crate::utils::upgrade_success_ordering(success, failure); + let old = U128 { whole: old }; + let new = U128 { whole: new }; + let (prev_lo, prev_hi); // SAFETY: the caller must uphold the safety contract. - let prev = unsafe { - let old = U128 { whole: old }; - let new = U128 { whole: new }; - let (prev_lo, prev_hi); + unsafe { // LLVM doesn't support `.option arch, +zabha` directive as of LLVM 19 because it is experimental. // So, we currently always using .4byte directive. // macro_rules! 
cmpxchg { // ($fence:tt, $asm_order:tt) => { // asm!( // start_zacas!(), - // $fence, - // concat!("amocas.q", $asm_order, " a4, a2, 0({dst})"), + // $fence, // fence + // concat!("amocas.q", $asm_order, " a4, a2, 0({dst})"), // atomic { if *dst == a4:a5 { *dst = a2:a3 } else { a4:a5 = *dst } } // end_zacas!(), // dst = in(reg) ptr_reg!(dst), // // must be allocated to even/odd register pair @@ -346,8 +354,8 @@ unsafe fn atomic_compare_exchange_zacas( macro_rules! cmpxchg { ($fence:tt, $insn_order:tt) => { asm!( - $fence, - // c: 2{8,c,a,e}c5472f amocas.q{,.aq,.rl,.aqrl} a4, a2, (a0) + $fence, // fence + // amocas.q{,.aq,.rl,.aqrl} a4, a2, (a0) // atomic { if *a0 == a4:a5 { *a0 = a2:a3 } else { a4:a5 = *a0 } } concat!(".4byte 0x2", $insn_order, "c5472f"), in("a0") ptr_reg!(dst), // must be allocated to even/odd register pair @@ -361,23 +369,24 @@ unsafe fn atomic_compare_exchange_zacas( }; } atomic_rmw_amocas_order_insn!(cmpxchg, order, failure = failure); - U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole - }; - (prev, prev == old) + let prev = U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole; + (prev, prev == old.whole) + } } // amocas is always strong. use self::atomic_compare_exchange as atomic_compare_exchange_weak; -// 128-bit atomic load by two 64-bit atomic loads. (see arm_linux.rs for more) +// 128-bit atomic load by two 64-bit atomic loads. #[inline] unsafe fn byte_wise_atomic_load(src: *const u128) -> u128 { + let (out_lo, out_hi); + // SAFETY: the caller must uphold the safety contract. unsafe { - let (out_lo, out_hi); asm!( - "ld {out_lo}, ({src})", - "ld {out_hi}, 8({src})", + "ld {out_lo}, ({src})", // atomic { out_lo = *src } + "ld {out_hi}, 8({src})", // atomic { out_hi = *src.add(8) } src = in(reg) ptr_reg!(src), out_lo = out(reg) out_lo, out_hi = out(reg) out_hi, @@ -387,28 +396,10 @@ unsafe fn byte_wise_atomic_load(src: *const u128) -> u128 { } } -#[inline(always)] -unsafe fn atomic_update_zacas(dst: *mut u128, order: Ordering, mut f: F) -> u128 -where - F: FnMut(u128) -> u128, -{ - // SAFETY: the caller must uphold the safety contract. - unsafe { - let mut prev = byte_wise_atomic_load(dst); - loop { - let next = f(prev); - match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { - Ok(x) => return x, - Err(x) => prev = x, - } - } - } -} - macro_rules! select_atomic_rmw { ( unsafe fn $name:ident(dst: *mut u128 $(, $($arg:tt)*)?) $(-> $ret_ty:ty)? { - $($zacas_fn_body:tt)* + |$zacas_fn_binding:ident| $($zacas_fn_body:tt)* } zacas = $zacas_fn:ident; non_seqcst_fallback = $non_seqcst_fallback_fn:ident; @@ -417,7 +408,26 @@ macro_rules! select_atomic_rmw { #[inline] unsafe fn $zacas_fn(dst: *mut u128 $(, $($arg)*)?, order: Ordering) $(-> $ret_ty)? { // SAFETY: the caller must uphold the safety contract. - unsafe { atomic_update_zacas(dst, order, $($zacas_fn_body)*) } + unsafe { + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement byte_wise_atomic_load. + // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). 
+ let mut prev = byte_wise_atomic_load(dst); + loop { + let next = { + let $zacas_fn_binding = prev; + $($zacas_fn_body)* + }; + match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { + Ok(x) => return x, + Err(x) => prev = x, + } + } + } } // If zacas is available at compile-time, we can always use zacas_fn. #[cfg(any( @@ -501,7 +511,7 @@ macro_rules! select_atomic_rmw { select_atomic_rmw! { unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128 { - |_| val + |_x| val } zacas = atomic_swap_zacas; non_seqcst_fallback = atomic_swap_non_seqcst; @@ -603,7 +613,7 @@ select_atomic_rmw! { } select_atomic_rmw! { unsafe fn atomic_neg(dst: *mut u128) -> u128 { - u128::wrapping_neg + |x| x.wrapping_neg() } zacas = atomic_neg_zacas; non_seqcst_fallback = atomic_neg_non_seqcst; diff --git a/src/imp/atomic128/s390x.rs b/src/imp/atomic128/s390x.rs index d77db40f..9ff9adda 100644 --- a/src/imp/atomic128/s390x.rs +++ b/src/imp/atomic128/s390x.rs @@ -3,13 +3,16 @@ /* 128-bit atomic implementation on s390x. -s390x has 128-bit atomic load/store/CAS instructions and other operations are emulated by CAS loop. -https://github.com/llvm/llvm-project/commit/a11f63a952664f700f076fd754476a2b9eb158cc -https://github.com/llvm/llvm-project/commit/c568927f3e2e7d9804ea74ecbf11c16c014ddcbc +This architecture provides the following 128-bit atomic instructions: -128-bit atomic instructions (lpq,stpq,cdsg) has been present since -[the First Edition of the Principles of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/dz9zr000.pdf). -LLVM's minimal supported architecture level is z10 (the Eighth Edition of the PoP): +- LPQ/STPQ: load/store (arch1 or later, i.e., baseline) +- CDSG: CAS (arch1 or later, i.e., baseline) + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#s390x + +LLVM's minimal supported architecture level is arch8 (z10): https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/SystemZ/SystemZProcessors.td#L16-L17 This does not appear to have changed since the current s390x backend was added in LLVM 3.3: https://github.com/llvm/llvm-project/commit/5f613dfd1f7edb0ae95d521b7107b582d9df5103#diff-cbaef692b3958312e80fd5507a7e2aff071f1acb086f10e8a96bc06a7bb289db @@ -18,13 +21,16 @@ Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't this module and use intrinsics.rs instead. Refs: -- z/Architecture Principles of Operation https://publibfp.dhe.ibm.com/epubs/pdf/a227832d.pdf -- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit +- z/Architecture Principles of Operation, Fourteenth Edition (SA22-7832-13) + https://publibfp.dhe.ibm.com/epubs/pdf/a227832d.pdf +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit Generated asm: -- s390x https://godbolt.org/z/osTYK1Mfz -- s390x (z196) https://godbolt.org/z/K71PKbnPT -- s390x (z15) https://godbolt.org/z/dfP1YKc1d +- s390x https://godbolt.org/z/oPxYYEvPG +- s390x (z196) https://godbolt.org/z/M69KrKT7Y +- s390x (z15,-vector) https://godbolt.org/z/Wec8b3ada +- s390x (z15) https://godbolt.org/z/KxWcrbfYh */ include!("macros.rs"); @@ -33,6 +39,26 @@ use core::{arch::asm, sync::atomic::Ordering}; use crate::utils::{Pair, U128}; +// bcr 14,0 requires fast-BCR-serialization facility added in arch9 (z196). 
+#[cfg(any( + target_feature = "fast-serialization", + portable_atomic_target_feature = "fast-serialization", +))] +macro_rules! serialization { + () => { + "bcr 14, 0" + }; +} +#[cfg(not(any( + target_feature = "fast-serialization", + portable_atomic_target_feature = "fast-serialization", +)))] +macro_rules! serialization { + () => { + "bcr 15, 0" + }; +} + // Use distinct operands on z196 or later, otherwise split to lgr and $op. #[cfg(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops"))] macro_rules! distinct_op { @@ -76,7 +102,7 @@ macro_rules! select_op { } // Extracts and checks condition code. -#[inline(always)] +#[inline] fn extract_cc(r: i64) -> bool { r.wrapping_add(-268435456) & (1 << 31) != 0 } @@ -84,13 +110,13 @@ fn extract_cc(r: i64) -> bool { #[inline] unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); + let (out_hi, out_lo); // SAFETY: the caller must uphold the safety contract. unsafe { // atomic load is always SeqCst. - let (out_hi, out_lo); asm!( - "lpq %r0, 0({src})", + "lpq %r0, 0({src})", // atomic { r0:r1 = *src } src = in(reg) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. out("r0") out_hi, @@ -104,15 +130,15 @@ unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 { #[inline] unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { debug_assert!(dst as usize % 16 == 0); + let val = U128 { whole: val }; // SAFETY: the caller must uphold the safety contract. unsafe { - let val = U128 { whole: val }; macro_rules! atomic_store { - ($fence:tt) => { + ($acquire:expr) => { asm!( - "stpq %r0, 0({dst})", - $fence, + "stpq %r0, 0({dst})", // atomic { *dst = r0:r1 } + $acquire, // fence dst = in(reg) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. in("r0") val.pair.hi, @@ -124,17 +150,7 @@ unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { match order { // Relaxed and Release stores are equivalent. Ordering::Relaxed | Ordering::Release => atomic_store!(""), - // bcr 14,0 (fast-BCR-serialization) requires z196 or later. - #[cfg(any( - target_feature = "fast-serialization", - portable_atomic_target_feature = "fast-serialization", - ))] - Ordering::SeqCst => atomic_store!("bcr 14, 0"), - #[cfg(not(any( - target_feature = "fast-serialization", - portable_atomic_target_feature = "fast-serialization", - )))] - Ordering::SeqCst => atomic_store!("bcr 15, 0"), + Ordering::SeqCst => atomic_store!(serialization!()), _ => unreachable!(), } } @@ -149,17 +165,17 @@ unsafe fn atomic_compare_exchange( _failure: Ordering, ) -> Result { debug_assert!(dst as usize % 16 == 0); - + let old = U128 { whole: old }; + let new = U128 { whole: new }; + let (prev_hi, prev_lo); let r; + // SAFETY: the caller must uphold the safety contract. let prev = unsafe { // atomic CAS is always SeqCst. - let old = U128 { whole: old }; - let new = U128 { whole: new }; - let (prev_hi, prev_lo); asm!( - "cdsg %r0, %r12, 0({dst})", - "ipm {r}", + "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:13 } else { cc = 1; r0:r1 = *dst } } + "ipm {r}", // r[:] = cc dst = in(reg) ptr_reg!(dst), r = lateout(reg) r, // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -182,6 +198,28 @@ unsafe fn atomic_compare_exchange( // cdsg is always strong. 
use self::atomic_compare_exchange as atomic_compare_exchange_weak; +// 128-bit atomic load by two 64-bit atomic loads. +#[cfg(not(any( + target_feature = "load-store-on-cond", + portable_atomic_target_feature = "load-store-on-cond", +)))] +#[inline] +unsafe fn byte_wise_atomic_load(src: *const u128) -> u128 { + // SAFETY: the caller must uphold the safety contract. + unsafe { + let (out_hi, out_lo); + asm!( + "lg {out_hi}, 8({src})", // atomic { out_hi = *src.add(8) } + "lg {out_lo}, 0({src})", // atomic { out_lo = *src } + src = in(reg) src, + out_hi = out(reg) out_hi, + out_lo = out(reg) out_lo, + options(pure, nostack, preserves_flags, readonly), + ); + U128 { pair: Pair { hi: out_hi, lo: out_lo } }.whole + } +} + #[cfg(not(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", @@ -193,9 +231,14 @@ where { // SAFETY: the caller must uphold the safety contract. unsafe { - // This is a private function and all instances of `f` only operate on the value - // loaded, so there is no need to synchronize the first load/failed CAS. - let mut prev = atomic_load(dst, Ordering::Relaxed); + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement byte_wise_atomic_load. + // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). + let mut prev = byte_wise_atomic_load(dst); loop { let next = f(prev); match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { @@ -209,6 +252,8 @@ where #[inline] unsafe fn atomic_swap(dst: *mut u128, val: u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + let val = U128 { whole: val }; + let (mut prev_hi, mut prev_lo); // SAFETY: the caller must uphold the safety contract. // @@ -218,13 +263,12 @@ unsafe fn atomic_swap(dst: *mut u128, val: u128, _order: Ordering) -> u128 { // Do not use atomic_rmw_cas_3 because it needs extra LGR to implement swap. unsafe { // atomic swap is always SeqCst. - let val = U128 { whole: val }; - let (mut prev_hi, mut prev_lo); asm!( - "lpq %r0, 0({dst})", - "2:", - "cdsg %r0, %r12, 0({dst})", - "jl 2b", + "lg %r0, 8({dst})", // atomic { r0 = *dst.add(8) } + "lg %r1, 0({dst})", // atomic { r1 = *dst } + "2:", // 'retry: + "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } + "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. out("r0") prev_hi, @@ -252,17 +296,19 @@ macro_rules! atomic_rmw_cas_3 { #[inline] unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + let val = U128 { whole: val }; + let (mut prev_hi, mut prev_lo); + // SAFETY: the caller must uphold the safety contract. unsafe { // atomic RMW is always SeqCst. 
- let val = U128 { whole: val }; - let (mut prev_hi, mut prev_lo); asm!( - "lpq %r0, 0({dst})", - "2:", + "lg %r0, 8({dst})", // atomic { r0 = *dst.add(8) } + "lg %r1, 0({dst})", // atomic { r1 = *dst } + "2:", // 'retry: $($op)* - "cdsg %r0, %r12, 0({dst})", - "jl 2b", + "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } + "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), val_hi = in(reg) val.pair.hi, val_lo = in(reg) val.pair.lo, @@ -293,16 +339,18 @@ macro_rules! atomic_rmw_cas_2 { #[inline] unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + let (mut prev_hi, mut prev_lo); + // SAFETY: the caller must uphold the safety contract. unsafe { // atomic RMW is always SeqCst. - let (mut prev_hi, mut prev_lo); asm!( - "lpq %r0, 0({dst})", - "2:", + "lg %r0, 8({dst})", // atomic { r0 = *dst.add(8) } + "lg %r1, 0({dst})", // atomic { r1 = *dst } + "2:", // 'retry: $($op)* - "cdsg %r0, %r12, 0({dst})", - "jl 2b", + "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } + "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), $($reg)* // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -321,20 +369,20 @@ macro_rules! atomic_rmw_cas_2 { atomic_rmw_cas_3! { atomic_add, [], - distinct_op!("algr", "%r13", "%r1", "{val_lo}"), - "lgr %r12, %r0", - "alcgr %r12, {val_hi}", + distinct_op!("algr", "%r13", "%r1", "{val_lo}"), // r13 = r1 + val_lo; cc = zero | carry + "lgr %r12, %r0", // r12 = r0 + "alcgr %r12, {val_hi}", // r12 += val_hi + carry } atomic_rmw_cas_3! { atomic_sub, [], - distinct_op!("slgr", "%r13", "%r1", "{val_lo}"), - "lgr %r12, %r0", - "slbgr %r12, {val_hi}", + distinct_op!("slgr", "%r13", "%r1", "{val_lo}"), // r13 = r1 - val_lo; cc = zero | borrow + "lgr %r12, %r0", // r12 = r0 + "slbgr %r12, {val_hi}", // r12 -= val_hi + borrow } atomic_rmw_cas_3! { atomic_and, [], - distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), - distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), + distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), // r13 = r1 & val_lo + distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), // r12 = r0 & val_hi } // Use nngrk on z15 or later. @@ -344,8 +392,8 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! { atomic_nand, [], - "nngrk %r13, %r1, {val_lo}", - "nngrk %r12, %r0, {val_hi}", + "nngrk %r13, %r1, {val_lo}", // r13 = !(r1 & val_lo) + "nngrk %r12, %r0, {val_hi}", // r12 = !(r0 & val_hi) } #[cfg(not(any( target_feature = "miscellaneous-extensions-3", @@ -353,23 +401,23 @@ atomic_rmw_cas_3! { )))] atomic_rmw_cas_3! { atomic_nand, [], - distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), - distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), - "lcgr %r13, %r13", - "aghi %r13, -1", - "lcgr %r12, %r12", - "aghi %r12, -1", + distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), // r13 = r1 & val_lo + distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), // r12 = r0 & val_hi + "lcgr %r13, %r13", // r13 = !r13 + 1 + "aghi %r13, -1", // r13 -= 1 + "lcgr %r12, %r12", // r12 = !r12 + 1 + "aghi %r12, -1", // r12 -= 1 } atomic_rmw_cas_3! { atomic_or, [], - distinct_op!("ogr", "%r13", "%r1", "{val_lo}"), - distinct_op!("ogr", "%r12", "%r0", "{val_hi}"), + distinct_op!("ogr", "%r13", "%r1", "{val_lo}"), // r13 = r1 | val_lo + distinct_op!("ogr", "%r12", "%r0", "{val_hi}"), // r12 = r0 | val_hi } atomic_rmw_cas_3! 
{ atomic_xor, [], - distinct_op!("xgr", "%r13", "%r1", "{val_lo}"), - distinct_op!("xgr", "%r12", "%r0", "{val_hi}"), + distinct_op!("xgr", "%r13", "%r1", "{val_lo}"), // r13 = r1 ^ val_lo + distinct_op!("xgr", "%r12", "%r0", "{val_hi}"), // r12 = r0 ^ val_hi } #[cfg(any( @@ -378,12 +426,12 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! { atomic_max, [], - "clgr %r1, {val_lo}", - select_op!("h", "%r12", "%r1", "{val_lo}"), - "cgr %r0, {val_hi}", - select_op!("h", "%r13", "%r1", "{val_lo}"), - "locgre %r13, %r12", - select_op!("h", "%r12", "%r0", "{val_hi}"), + "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } + select_op!("h", "%r12", "%r1", "{val_lo}"), // if cc == 2 { r12 = r1 } else { r12 = val_lo } + "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } + select_op!("h", "%r13", "%r1", "{val_lo}"), // if cc == 2 { r13 = r1 } else { r13 = val_lo } + "locgre %r13, %r12", // if cc == 0 { r13 = r12 } + select_op!("h", "%r12", "%r0", "{val_hi}"), // if cc == 2 { r12 = r0 } else { r12 = val_hi } } #[cfg(any( target_feature = "load-store-on-cond", @@ -391,13 +439,13 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! { atomic_umax, [tmp = out(reg) _,], - "clgr %r1, {val_lo}", - select_op!("h", "{tmp}", "%r1", "{val_lo}"), - "clgr %r0, {val_hi}", - select_op!("h", "%r12", "%r0", "{val_hi}"), - select_op!("h", "%r13", "%r1", "{val_lo}"), - "cgr %r0, {val_hi}", - "locgre %r13, {tmp}", + "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } + select_op!("h", "{tmp}", "%r1", "{val_lo}"), // if cc == 2 { tmp = r1 } else { tmp = val_lo } + "clgr %r0, {val_hi}", // if r0(u) < val_hi(u) { cc = 1 } else if r0(u) > val_hi(u) { cc = 2 } else { cc = 0 } + select_op!("h", "%r12", "%r0", "{val_hi}"), // if cc == 2 { r12 = r0 } else { r12 = val_hi } + select_op!("h", "%r13", "%r1", "{val_lo}"), // if cc == 2 { r13 = r1 } else { r13 = val_lo } + "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } + "locgre %r13, {tmp}", // if cc == 0 { r13 = tmp } } #[cfg(any( target_feature = "load-store-on-cond", @@ -405,12 +453,12 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! { atomic_min, [], - "clgr %r1, {val_lo}", - select_op!("l", "%r12", "%r1", "{val_lo}"), - "cgr %r0, {val_hi}", - select_op!("l", "%r13", "%r1", "{val_lo}"), - "locgre %r13, %r12", - select_op!("l", "%r12", "%r0", "{val_hi}"), + "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } + select_op!("l", "%r12", "%r1", "{val_lo}"), // if cc == 1 { r12 = r1 } else { r12 = val_lo } + "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } + select_op!("l", "%r13", "%r1", "{val_lo}"), // if cc == 1 { r13 = r1 } else { r13 = val_lo } + "locgre %r13, %r12", // if cc == 0 { r13 = r12 } + select_op!("l", "%r12", "%r0", "{val_hi}"), // if cc == 1 { r12 = r0 } else { r12 = val_hi } } #[cfg(any( target_feature = "load-store-on-cond", @@ -418,13 +466,13 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! 
{ atomic_umin, [tmp = out(reg) _,], - "clgr %r1, {val_lo}", - select_op!("l", "{tmp}", "%r1", "{val_lo}"), - "clgr %r0, {val_hi}", - select_op!("l", "%r12", "%r0", "{val_hi}"), - select_op!("l", "%r13", "%r1", "{val_lo}"), - "cgr %r0, {val_hi}", - "locgre %r13, {tmp}", + "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } + select_op!("l", "{tmp}", "%r1", "{val_lo}"), // if cc == 1 { tmp = r1 } else { tmp = val_lo } + "clgr %r0, {val_hi}", // if r0(u) < val_hi(u) { cc = 1 } else if r0(u) > val_hi(u) { cc = 2 } else { cc = 0 } + select_op!("l", "%r12", "%r0", "{val_hi}"), // if cc == 1 { r12 = r0 } else { r12 = val_hi } + select_op!("l", "%r13", "%r1", "{val_lo}"), // if cc == 1 { r13 = r1 } else { r13 = val_lo } + "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } + "locgre %r13, {tmp}", // if cc == 0 { r13 = tmp } } // We use atomic_update for atomic min/max on pre-z196 because // z10 doesn't seem to have a good way to implement 128-bit min/max. @@ -438,26 +486,26 @@ atomic_rmw_by_atomic_update!(cmp); atomic_rmw_cas_2! { atomic_not, [], - "lcgr %r13, %r1", - "aghi %r13, -1", - "lcgr %r12, %r0", - "aghi %r12, -1", + "lcgr %r13, %r1", // r13 = !r1 + 1 + "aghi %r13, -1", // r13 -= 1 + "lcgr %r12, %r0", // r12 = !r0 + 1 + "aghi %r12, -1", // r12 -= 1 } #[cfg(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops"))] atomic_rmw_cas_2! { atomic_neg, [zero = in(reg) 0_u64,], - "slgrk %r13, {zero}, %r1", - "lghi %r12, 0", - "slbgr %r12, %r0", + "slgrk %r13, {zero}, %r1", // r13 = 0 - r1; cc = zero | borrow + "lghi %r12, 0", // r12 = 0 + "slbgr %r12, %r0", // r12 -= r0 + borrow } #[cfg(not(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops")))] atomic_rmw_cas_2! { atomic_neg, [], - "lghi %r13, 0", - "slgr %r13, %r1", - "lghi %r12, 0", - "slbgr %r12, %r0", + "lghi %r13, 0", // r13 = 0 + "slgr %r13, %r1", // r13 -= r1; cc = zero | borrow + "lghi %r12, 0", // r12 = 0 + "slbgr %r12, %r0", // r12 -= r0 + borrow } #[inline] diff --git a/src/imp/atomic64/riscv32.rs b/src/imp/atomic64/riscv32.rs index 8e3084f8..c4c7c315 100644 --- a/src/imp/atomic64/riscv32.rs +++ b/src/imp/atomic64/riscv32.rs @@ -1,7 +1,15 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT /* -64-bit atomic implementation on riscv32 using amocas.d (DWCAS). +64-bit atomic implementation on riscv32. + +This architecture provides the following 64-bit atomic instructions: + +- amocas.d: CAS (Zacas extension) + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#risc-v Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use this module and use fallback implementation instead. @@ -169,18 +177,18 @@ unsafe fn atomic_load(src: *mut u64, order: Ordering) -> u64 { unsafe fn atomic_load_zacas(src: *mut u64, order: Ordering) -> u64 { debug_assert!(src as usize % 8 == 0); debug_assert_zacas!(); + let (out_lo, out_hi); // SAFETY: the caller must uphold the safety contract. unsafe { - let (out_lo, out_hi); // LLVM doesn't support `.option arch, +zabha` directive as of LLVM 19 because it is experimental. // So, we currently always using .4byte directive. // macro_rules! 
load { // ($fence:tt, $asm_order:tt) => { // asm!( // start_zacas!(), - // $fence, - // concat!("amocas.d", $asm_order, " a2, a2, 0({src})"), + // $fence, // fence + // concat!("amocas.d", $asm_order, " a2, a2, 0({src})"), // atomic { if *dst == a2:a3 { *dst = a2:a3 } else { a2:a3 = *dst } } // end_zacas!(), // src = in(reg) ptr_reg!(src), // inout("a2") 0_u32 => out_lo, @@ -193,8 +201,8 @@ unsafe fn atomic_load_zacas(src: *mut u64, order: Ordering) -> u64 { macro_rules! load { ($fence:tt, $insn_order:tt) => { asm!( - $fence, - // 4: 2{8,c,a,e}c5362f amocas.d{,.aq,.rl,.aqrl} a2, a2, (a0) + $fence, // fence + // amocas.d{,.aq,.rl,.aqrl} a2, a2, (a0) // atomic { if *a0 == a2:a3 { *a0 = a2:a3 } else { a2:a3 = *a0 } } concat!(".4byte 0x2", $insn_order, "c5362f"), in("a0") ptr_reg!(src), inout("a2") 0_u32 => out_lo, @@ -318,20 +326,20 @@ unsafe fn atomic_compare_exchange_zacas( debug_assert!(dst as usize % 8 == 0); debug_assert_zacas!(); let order = crate::utils::upgrade_success_ordering(success, failure); + let old = U64 { whole: old }; + let new = U64 { whole: new }; + let (prev_lo, prev_hi); // SAFETY: the caller must uphold the safety contract. - let prev = unsafe { - let old = U64 { whole: old }; - let new = U64 { whole: new }; - let (prev_lo, prev_hi); + unsafe { // LLVM doesn't support `.option arch, +zabha` directive as of LLVM 19 because it is experimental. // So, we currently always using .4byte directive. // macro_rules! cmpxchg { // ($fence:tt, $asm_order:tt) => { // asm!( // start_zacas!(), - // $fence, - // concat!("amocas.d", $asm_order, " a4, a2, 0({dst})"), + // $fence, // fence + // concat!("amocas.d", $asm_order, " a4, a2, 0({dst})"), // atomic { if *dst == a4:a5 { *dst = a2:a3 } else { a4:a5 = *dst } } // end_zacas!(), // dst = in(reg) ptr_reg!(dst), // // must be allocated to even/odd register pair @@ -348,8 +356,8 @@ unsafe fn atomic_compare_exchange_zacas( macro_rules! cmpxchg { ($fence:tt, $insn_order:tt) => { asm!( - $fence, - // 10: 2{8,c,a,e}c5372f amocas.d{,.aq,.rl,.aqrl} a4, a2, (a0) + $fence, // fence + // amocas.d{,.aq,.rl,.aqrl} a4, a2, (a0) // atomic { if *a0 == a4:a5 { *a0 = a2:a3 } else { a4:a5 = *a0 } } concat!(".4byte 0x2", $insn_order, "c5372f"), in("a0") ptr_reg!(dst), // must be allocated to even/odd register pair @@ -363,23 +371,24 @@ unsafe fn atomic_compare_exchange_zacas( }; } atomic_rmw_amocas_order_insn!(cmpxchg, order, failure = failure); - U64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole - }; - (prev, prev == old) + let prev = U64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole; + (prev, prev == old.whole) + } } // amocas is always strong. use self::atomic_compare_exchange as atomic_compare_exchange_weak; -// 64-bit atomic load by two 32-bit atomic loads. (see arm_linux.rs for more) +// 64-bit atomic load by two 32-bit atomic loads. #[inline] unsafe fn byte_wise_atomic_load(src: *const u64) -> u64 { + let (out_lo, out_hi); + // SAFETY: the caller must uphold the safety contract. unsafe { - let (out_lo, out_hi); asm!( - "lw {out_lo}, ({src})", - "lw {out_hi}, 4({src})", + "lw {out_lo}, ({src})", // atomic { out_lo = *src } + "lw {out_hi}, 4({src})", // atomic { out_hi = *src.add(4) } src = in(reg) ptr_reg!(src), out_lo = out(reg) out_lo, out_hi = out(reg) out_hi, @@ -389,28 +398,10 @@ unsafe fn byte_wise_atomic_load(src: *const u64) -> u64 { } } -#[inline(always)] -unsafe fn atomic_update_zacas(dst: *mut u64, order: Ordering, mut f: F) -> u64 -where - F: FnMut(u64) -> u64, -{ - // SAFETY: the caller must uphold the safety contract. 
- unsafe { - let mut prev = byte_wise_atomic_load(dst); - loop { - let next = f(prev); - match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { - Ok(x) => return x, - Err(x) => prev = x, - } - } - } -} - macro_rules! select_atomic_rmw { ( unsafe fn $name:ident(dst: *mut u64 $(, $($arg:tt)*)?) $(-> $ret_ty:ty)? { - $($zacas_fn_body:tt)* + |$zacas_fn_binding:ident| $($zacas_fn_body:tt)* } zacas = $zacas_fn:ident; non_seqcst_fallback = $non_seqcst_fallback_fn:ident; @@ -419,7 +410,26 @@ macro_rules! select_atomic_rmw { #[inline] unsafe fn $zacas_fn(dst: *mut u64 $(, $($arg)*)?, order: Ordering) $(-> $ret_ty)? { // SAFETY: the caller must uphold the safety contract. - unsafe { atomic_update_zacas(dst, order, $($zacas_fn_body)*) } + unsafe { + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement byte_wise_atomic_load. + // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). + let mut prev = byte_wise_atomic_load(dst); + loop { + let next = { + let $zacas_fn_binding = prev; + $($zacas_fn_body)* + }; + match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { + Ok(x) => return x, + Err(x) => prev = x, + } + } + } } // If zacas is available at compile-time, we can always use zacas_fn. #[cfg(any( @@ -503,7 +513,7 @@ macro_rules! select_atomic_rmw { select_atomic_rmw! { unsafe fn atomic_swap(dst: *mut u64, val: u64) -> u64 { - |_| val + |_x| val } zacas = atomic_swap_zacas; non_seqcst_fallback = atomic_swap_non_seqcst; @@ -605,7 +615,7 @@ select_atomic_rmw! { } select_atomic_rmw! { unsafe fn atomic_neg(dst: *mut u64) -> u64 { - u64::wrapping_neg + |x| x.wrapping_neg() } zacas = atomic_neg_zacas; non_seqcst_fallback = atomic_neg_non_seqcst; diff --git a/src/imp/avr.rs b/src/imp/avr.rs new file mode 100644 index 00000000..36f530e4 --- /dev/null +++ b/src/imp/avr.rs @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + +/* +Atomic load/store implementation on AVR. + +Operations not supported here are provided by disabling interrupts. +See also src/imp/interrupt/avr.rs. + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#avr + +Note: Ordering is always SeqCst. + +Refs: +- AVR® Instruction Set Manual, Rev. DS40002198B + https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit + +Generated asm: +- avr https://godbolt.org/z/j49rYbj4d +*/ + +use core::{arch::asm, cell::UnsafeCell, sync::atomic::Ordering}; + +macro_rules! atomic8 { + ($atomic_type:ident, $value_type:ty) => { + #[repr(transparent)] + pub(crate) struct $atomic_type { + v: UnsafeCell<$value_type>, + } + + // Send is implicitly implemented for atomic integers, but not for atomic pointers. + // SAFETY: any data races are prevented by atomic operations. + unsafe impl Send for $atomic_type {} + // SAFETY: any data races are prevented by atomic operations. 
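// Illustrative sketch, assuming a target where the standard compare_exchange_weak is
// available: the retry loop that the inlined zacas_fn body above implements, written
// against core::sync::atomic for clarity. The helper name fetch_update_sketch is made
// up; the real code must use inline asm (byte_wise_atomic_load + amocas.d) because
// riscv32 has no native 64-bit atomics and mixed-sized atomic access is not allowed.
use core::sync::atomic::{AtomicU32, Ordering};

fn fetch_update_sketch(a: &AtomicU32, order: Ordering, mut f: impl FnMut(u32) -> u32) -> u32 {
    // The initial read only needs to be a hint; the CAS below re-checks it and
    // retries on mismatch (the same reason byte_wise_atomic_load is acceptable above).
    let mut prev = a.load(Ordering::Relaxed);
    loop {
        let next = f(prev);
        match a.compare_exchange_weak(prev, next, order, Ordering::Relaxed) {
            Ok(x) => return x,  // stored `next`; return the previous value
            Err(x) => prev = x, // lost the race; retry with the freshly observed value
        }
    }
}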
+ unsafe impl Sync for $atomic_type {} + + impl $atomic_type { + #[inline] + #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] + pub(crate) fn load(&self, order: Ordering) -> $value_type { + crate::utils::assert_load_ordering(order); + let src = self.v.get(); + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + let out; + asm!( + "ld {out}, Z", // atomic { out = *Z } + in("Z") src, + out = out(reg) out, + options(nostack, preserves_flags), + ); + out + } + } + + #[inline] + #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] + pub(crate) fn store(&self, val: $value_type, order: Ordering) { + crate::utils::assert_store_ordering(order); + let dst = self.v.get(); + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + asm!( + "st Z, {val}", // atomic { *Z = val } + in("Z") dst, + val = in(reg) val, + options(nostack, preserves_flags), + ); + } + } + } + }; +} + +atomic8!(AtomicI8, i8); +atomic8!(AtomicU8, u8); diff --git a/src/imp/interrupt/README.md b/src/imp/interrupt/README.md index bf471795..ccc74395 100644 --- a/src/imp/interrupt/README.md +++ b/src/imp/interrupt/README.md @@ -25,6 +25,7 @@ For some targets, the implementation can be changed by explicitly enabling featu Some operations don't require disabling interrupts: - On architectures except for AVR: loads and stores with pointer size or smaller +- On AVR: 8-bit loads and stores - On MSP430 additionally: {8,16}-bit `add,sub,and,or,xor,not` - On RISC-V with the `zaamo` target feature (or `portable_atomic_target_feature="zaamo"` cfg or `force-amo` feature or `portable_atomic_force_amo` cfg) additionally: 32-bit(RV32)/{32,64}-bit(RV64) `swap,fetch_{add,sub,and,or,xor,not,max,min},add,sub,and,or,xor,not`, {8,16}-bit `fetch_{and,or,xor,not},and,or,xor,not`[^1], and all operations of `AtomicBool` diff --git a/src/imp/interrupt/avr.rs b/src/imp/interrupt/avr.rs index 6d479a68..e39f860d 100644 --- a/src/imp/interrupt/avr.rs +++ b/src/imp/interrupt/avr.rs @@ -3,8 +3,11 @@ /* Adapted from https://github.com/Rahix/avr-device. +See also src/imp/avr.rs. + Refs: -- AVR Instruction Set Manual https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf +- AVR® Instruction Set Manual, Rev. DS40002198B + https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf Generated asm: - avr https://godbolt.org/z/W5jxGsToc @@ -13,6 +16,9 @@ Generated asm: #[cfg(not(portable_atomic_no_asm))] use core::arch::asm; +#[cfg(not(portable_atomic_no_asm))] +pub(super) use super::super::avr as atomic; + pub(super) type State = u8; /// Disables interrupts and returns the previous interrupt state. @@ -27,9 +33,9 @@ pub(super) fn disable() -> State { // Refs: https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf#page=58 #[cfg(not(portable_atomic_no_asm))] asm!( - "in {0}, 0x3F", - "cli", - out(reg) sreg, + "in {sreg}, 0x3F", // sreg = SREG + "cli", // SREG.I = 0 + sreg = out(reg) sreg, options(nostack), ); #[cfg(portable_atomic_no_asm)] @@ -47,7 +53,7 @@ pub(super) fn disable() -> State { /// /// The state must be the one retrieved by the previous `disable`. 
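// Usage sketch, assuming the asm-based module above is in use (not the
// critical-section fallback): an 8-bit load or store on AVR is a single ld/st,
// so a flag shared with an interrupt handler needs no cli/sei pair around it.
// The handler and function names below are hypothetical.
use portable_atomic::{AtomicU8, Ordering};

static FLAG: AtomicU8 = AtomicU8::new(0);

// Hypothetical interrupt handler body: a single `st` publishes the new value.
fn on_timer_interrupt() {
    FLAG.store(1, Ordering::SeqCst);
}

fn poll_flag() -> u8 {
    // A single `ld`; an 8-bit value can never be observed torn.
    FLAG.load(Ordering::SeqCst)
}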
#[inline(always)] -pub(super) unsafe fn restore(sreg: State) { +pub(super) unsafe fn restore(prev_sreg: State) { // SAFETY: the caller must guarantee that the state was retrieved by the previous `disable`, unsafe { // This clobbers the entire status register. See msp430.rs to safety on this. @@ -55,8 +61,12 @@ pub(super) unsafe fn restore(sreg: State) { // Do not use `nomem` and `readonly` because prevent preceding memory accesses from being reordered after interrupts are enabled. // Do not use `preserves_flags` because OUT modifies the status register (SREG). #[cfg(not(portable_atomic_no_asm))] - asm!("out 0x3F, {0}", in(reg) sreg, options(nostack)); + asm!( + "out 0x3F, {prev_sreg}", // SREG = prev_sreg + prev_sreg = in(reg) prev_sreg, + options(nostack), + ); #[cfg(portable_atomic_no_asm)] - llvm_asm!("out 0x3F, $0" :: "r"(sreg) : "memory" : "volatile"); + llvm_asm!("out 0x3F, $0" :: "r"(prev_sreg) : "memory" : "volatile"); } } diff --git a/src/imp/interrupt/mod.rs b/src/imp/interrupt/mod.rs index 94fc4b76..d7713f7f 100644 --- a/src/imp/interrupt/mod.rs +++ b/src/imp/interrupt/mod.rs @@ -41,7 +41,10 @@ See also README.md of this directory. // CAS together with atomic load/store. The load/store will not be // called while interrupts are disabled, and since the load/store is // atomic, it is not affected by interrupts even if interrupts are enabled. -#[cfg(not(any(target_arch = "avr", feature = "critical-section")))] +#[cfg(not(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", +)))] use self::arch::atomic; #[cfg(not(feature = "critical-section"))] @@ -300,11 +303,17 @@ macro_rules! atomic_int { #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] pub(crate) fn load(&self, order: Ordering) -> $int_type { crate::utils::assert_load_ordering(order); - #[cfg(not(any(target_arch = "avr", feature = "critical-section")))] + #[cfg(not(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + )))] { self.as_native().load(order) } - #[cfg(any(target_arch = "avr", feature = "critical-section"))] + #[cfg(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + ))] // SAFETY: any data races are prevented by disabling interrupts (see // module-level comments) and the raw pointer is valid because we got it // from a reference. @@ -315,18 +324,27 @@ macro_rules! atomic_int { #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] pub(crate) fn store(&self, val: $int_type, order: Ordering) { crate::utils::assert_store_ordering(order); - #[cfg(not(any(target_arch = "avr", feature = "critical-section")))] + #[cfg(not(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + )))] { self.as_native().store(val, order); } - #[cfg(any(target_arch = "avr", feature = "critical-section"))] + #[cfg(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + ))] // SAFETY: any data races are prevented by disabling interrupts (see // module-level comments) and the raw pointer is valid because we got it // from a reference. 
with(|| unsafe { self.v.get().write(val) }); } - #[cfg(not(any(target_arch = "avr", feature = "critical-section")))] + #[cfg(not(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + )))] #[inline(always)] fn as_native(&self) -> &atomic::$atomic_type { // SAFETY: $atomic_type and atomic::$atomic_type have the same layout and @@ -841,9 +859,15 @@ macro_rules! atomic_int { } #[cfg(target_pointer_width = "16")] +#[cfg(not(target_arch = "avr"))] atomic_int!(load_store_atomic, AtomicIsize, isize, 2); #[cfg(target_pointer_width = "16")] +#[cfg(not(target_arch = "avr"))] atomic_int!(load_store_atomic, AtomicUsize, usize, 2); +#[cfg(target_arch = "avr")] +atomic_int!(all_critical_session, AtomicIsize, isize, 2); +#[cfg(target_arch = "avr")] +atomic_int!(all_critical_session, AtomicUsize, usize, 2); #[cfg(target_pointer_width = "32")] atomic_int!(load_store_atomic, AtomicIsize, isize, 4); #[cfg(target_pointer_width = "32")] @@ -857,10 +881,22 @@ atomic_int!(load_store_atomic, AtomicIsize, isize, 16); #[cfg(target_pointer_width = "128")] atomic_int!(load_store_atomic, AtomicUsize, usize, 16); +#[cfg(not(all(target_arch = "avr", portable_atomic_no_asm)))] atomic_int!(load_store_atomic[sub_word], AtomicI8, i8, 1); +#[cfg(not(all(target_arch = "avr", portable_atomic_no_asm)))] atomic_int!(load_store_atomic[sub_word], AtomicU8, u8, 1); +#[cfg(all(target_arch = "avr", portable_atomic_no_asm))] +atomic_int!(all_critical_session, AtomicI8, i8, 1); +#[cfg(all(target_arch = "avr", portable_atomic_no_asm))] +atomic_int!(all_critical_session, AtomicU8, u8, 1); +#[cfg(not(target_arch = "avr"))] atomic_int!(load_store_atomic[sub_word], AtomicI16, i16, 2); +#[cfg(not(target_arch = "avr"))] atomic_int!(load_store_atomic[sub_word], AtomicU16, u16, 2); +#[cfg(target_arch = "avr")] +atomic_int!(all_critical_session, AtomicI16, i16, 2); +#[cfg(target_arch = "avr")] +atomic_int!(all_critical_session, AtomicU16, u16, 2); #[cfg(not(target_pointer_width = "16"))] atomic_int!(load_store_atomic, AtomicI32, i32, 4); diff --git a/src/imp/interrupt/msp430.rs b/src/imp/interrupt/msp430.rs index a60b8515..a7792f7a 100644 --- a/src/imp/interrupt/msp430.rs +++ b/src/imp/interrupt/msp430.rs @@ -5,7 +5,9 @@ Adapted from https://github.com/rust-embedded/msp430. See also src/imp/msp430.rs. -Refs: https://www.ti.com/lit/ug/slau208q/slau208q.pdf +Refs: +- MSP430x5xx and MSP430x6xx Family User's Guide, Rev. Q + https://www.ti.com/lit/ug/slau208q/slau208q.pdf Generated asm: - msp430 https://godbolt.org/z/fc6h89xac @@ -27,11 +29,12 @@ pub(super) fn disable() -> State { unsafe { // Do not use `nomem` and `readonly` because prevent subsequent memory accesses from being reordered before interrupts are disabled. // Do not use `preserves_flags` because DINT modifies the GIE (global interrupt enable) bit of the status register. + // See "NOTE: Enable and Disable Interrupt" of User's Guide for NOP: https://www.ti.com/lit/ug/slau208q/slau208q.pdf#page=60 #[cfg(not(portable_atomic_no_asm))] asm!( - "mov r2, {0}", - "dint {{ nop", - out(reg) sr, + "mov r2, {sr}", // sr = SR + "dint {{ nop", // SR.GIE = 0 + sr = out(reg) sr, options(nostack), ); #[cfg(portable_atomic_no_asm)] @@ -49,7 +52,7 @@ pub(super) fn disable() -> State { /// /// The state must be the one retrieved by the previous `disable`. 
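// Sketch of the pattern behind the with(...) calls above; the name with_sketch and
// the stand-in disable/restore definitions are illustrative only (the real
// per-architecture versions in interrupt/{avr,msp430,riscv}.rs use inline asm).
type State = u8;
fn disable() -> State {
    // e.g. AVR: `in sreg, 0x3F; cli` - save SREG, then clear the I bit
    0
}
unsafe fn restore(_prev: State) {
    // e.g. AVR: `out 0x3F, prev` - write back the *saved* state, so nested and
    // already-disabled contexts stay correct (never an unconditional re-enable)
}

fn with_sketch<T>(f: impl FnOnce() -> T) -> T {
    let state = disable(); // interrupts are off from here
    let result = f(); // single core: nothing can interleave with this closure
    // SAFETY: `state` was returned by the matching `disable` call above.
    unsafe { restore(state) }
    result
}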
#[inline(always)] -pub(super) unsafe fn restore(sr: State) { +pub(super) unsafe fn restore(prev_sr: State) { // SAFETY: the caller must guarantee that the state was retrieved by the previous `disable`, unsafe { // This clobbers the entire status register, but we never explicitly modify @@ -61,9 +64,14 @@ pub(super) unsafe fn restore(sr: State) { // // Do not use `nomem` and `readonly` because prevent preceding memory accesses from being reordered after interrupts are enabled. // Do not use `preserves_flags` because MOV modifies the status register. + // See "NOTE: Enable and Disable Interrupt" of User's Guide for NOP: https://www.ti.com/lit/ug/slau208q/slau208q.pdf#page=60 #[cfg(not(portable_atomic_no_asm))] - asm!("nop {{ mov {0}, r2 {{ nop", in(reg) sr, options(nostack)); + asm!( + "nop {{ mov {prev_sr}, r2 {{ nop", // SR = prev_sr + prev_sr = in(reg) prev_sr, + options(nostack), + ); #[cfg(portable_atomic_no_asm)] - llvm_asm!("nop { mov $0, r2 { nop" :: "r"(sr) : "memory" : "volatile"); + llvm_asm!("nop { mov $0, r2 { nop" :: "r"(prev_sr) : "memory" : "volatile"); } } diff --git a/src/imp/interrupt/riscv.rs b/src/imp/interrupt/riscv.rs index 64c8e25a..affe417a 100644 --- a/src/imp/interrupt/riscv.rs +++ b/src/imp/interrupt/riscv.rs @@ -2,8 +2,11 @@ /* Refs: -- https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/machine.adoc#machine-status-mstatus-and-mstatush-registers -- https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/supervisor.adoc#supervisor-status-sstatus-register +- RISC-V Instruction Set Manual + Machine Status (mstatus and mstatush) Registers + https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/machine.adoc#machine-status-mstatus-and-mstatush-registers + Supervisor Status (sstatus) Register + https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/supervisor.adoc#supervisor-status-sstatus-register See also src/imp/riscv.rs. @@ -62,7 +65,11 @@ pub(super) fn disable() -> State { // (see module-level comments of interrupt/mod.rs on the safety of using privileged instructions) unsafe { // Do not use `nomem` and `readonly` because prevent subsequent memory accesses from being reordered before interrupts are disabled. - asm!(concat!("csrrci {0}, ", status!(), ", ", mask!()), out(reg) status, options(nostack, preserves_flags)); + asm!( + concat!("csrrci {status}, ", status!(), ", ", mask!()), // atomic { status = status!(); status!() &= !mask!() } + status = out(reg) status, + options(nostack, preserves_flags), + ); } status } @@ -79,7 +86,10 @@ pub(super) unsafe fn restore(status: State) { // and we've checked that interrupts were enabled before disabling interrupts. unsafe { // Do not use `nomem` and `readonly` because prevent preceding memory accesses from being reordered after interrupts are enabled. 
- asm!(concat!("csrsi ", status!(), ", ", mask!()), options(nostack, preserves_flags)); + asm!( + concat!("csrsi ", status!(), ", ", mask!()), // atomic { status!() |= mask!() } + options(nostack, preserves_flags), + ); } } } diff --git a/src/imp/mod.rs b/src/imp/mod.rs index 1f3d229d..dcc642ab 100644 --- a/src/imp/mod.rs +++ b/src/imp/mod.rs @@ -37,6 +37,12 @@ )] mod core_atomic; +// AVR +#[cfg(target_arch = "avr")] +#[cfg(not(portable_atomic_no_asm))] +#[cfg(not(feature = "critical-section"))] +mod avr; + // MSP430 #[cfg(target_arch = "msp430")] pub(crate) mod msp430; diff --git a/src/imp/msp430.rs b/src/imp/msp430.rs index 92f3f28c..2bc538b8 100644 --- a/src/imp/msp430.rs +++ b/src/imp/msp430.rs @@ -1,24 +1,32 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT /* -Atomic load/store implementation on MSP430. +Atomic implementation on MSP430. Adapted from https://github.com/pftbest/msp430-atomic. Operations not supported here are provided by disabling interrupts. See also src/imp/interrupt/msp430.rs. +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#msp430 + Note: Ordering is always SeqCst. -Refs: https://www.ti.com/lit/ug/slau208q/slau208q.pdf +Refs: +- MSP430x5xx and MSP430x6xx Family User's Guide, Rev. Q + https://www.ti.com/lit/ug/slau208q/slau208q.pdf +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit Generated asm: -- msp430 https://godbolt.org/z/jaodMM4KM +- msp430 https://godbolt.org/z/MGrd4jPoq */ #[cfg(not(portable_atomic_no_asm))] use core::arch::asm; -#[cfg(any(test, not(feature = "critical-section")))] +#[cfg(not(feature = "critical-section"))] use core::cell::UnsafeCell; use core::sync::atomic::Ordering; @@ -60,37 +68,23 @@ pub fn compiler_fence(order: Ordering) { } macro_rules! atomic { - (load_store, $([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $asm_suffix:tt) => { - #[cfg(any(test, not(feature = "critical-section")))] + (load_store, $([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $size:tt) => { + #[cfg(not(feature = "critical-section"))] #[repr(transparent)] pub(crate) struct $atomic_type $(<$($generics)*>)? { v: UnsafeCell<$value_type>, } - #[cfg(any(test, not(feature = "critical-section")))] + #[cfg(not(feature = "critical-section"))] // Send is implicitly implemented for atomic integers, but not for atomic pointers. // SAFETY: any data races are prevented by atomic operations. unsafe impl $(<$($generics)*>)? Send for $atomic_type $(<$($generics)*>)? {} - #[cfg(any(test, not(feature = "critical-section")))] + #[cfg(not(feature = "critical-section"))] // SAFETY: any data races are prevented by atomic operations. unsafe impl $(<$($generics)*>)? Sync for $atomic_type $(<$($generics)*>)? {} - #[cfg(any(test, not(feature = "critical-section")))] + #[cfg(not(feature = "critical-section"))] impl $(<$($generics)*>)? $atomic_type $(<$($generics)*>)? { - #[cfg(test)] - #[inline] - pub(crate) const fn new(v: $value_type) -> Self { - Self { v: UnsafeCell::new(v) } - } - - #[cfg(test)] - #[inline] - pub(crate) fn is_lock_free() -> bool { - Self::IS_ALWAYS_LOCK_FREE - } - #[cfg(test)] - pub(crate) const IS_ALWAYS_LOCK_FREE: bool = true; - #[inline] #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] pub(crate) fn load(&self, order: Ordering) -> $value_type { @@ -102,14 +96,14 @@ macro_rules! 
atomic { let out; #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("mov", $asm_suffix, " @{src}, {out}"), + concat!("mov.", $size, " @{src}, {out}"), // atomic { out = *src } src = in(reg) src, out = lateout(reg) out, options(nostack, preserves_flags), ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("mov", $asm_suffix, " $1, $0") + concat!("mov.", $size, " $1, $0") : "=r"(out) : "*m"(src) : "memory" : "volatile" ); out @@ -126,23 +120,23 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("mov", $asm_suffix, " {val}, 0({dst})"), + concat!("mov.", $size, " {val}, 0({dst})"), // atomic { *dst = val } dst = in(reg) dst, val = in(reg) val, options(nostack, preserves_flags), ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("mov", $asm_suffix, " $1, $0") + concat!("mov.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } } } }; - ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $asm_suffix:tt) => { - atomic!(load_store, $([$($generics)*])? $atomic_type, $value_type, $asm_suffix); - #[cfg(any(test, not(feature = "critical-section")))] + ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $size:tt) => { + atomic!(load_store, $([$($generics)*])? $atomic_type, $value_type, $size); + #[cfg(not(feature = "critical-section"))] impl $(<$($generics)*>)? $atomic_type $(<$($generics)*>)? { #[inline] pub(crate) fn add(&self, val: $value_type, _order: Ordering) { @@ -152,7 +146,7 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("add", $asm_suffix, " {val}, 0({dst})"), + concat!("add.", $size, " {val}, 0({dst})"), // atomic { *dst += val } dst = in(reg) dst, val = in(reg) val, // Do not use `preserves_flags` because ADD modifies the V, N, Z, and C bits of the status register. @@ -160,7 +154,7 @@ macro_rules! atomic { ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("add", $asm_suffix, " $1, $0") + concat!("add.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -174,7 +168,7 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("sub", $asm_suffix, " {val}, 0({dst})"), + concat!("sub.", $size, " {val}, 0({dst})"), // atomic { *dst -= val } dst = in(reg) dst, val = in(reg) val, // Do not use `preserves_flags` because SUB modifies the V, N, Z, and C bits of the status register. @@ -182,7 +176,7 @@ macro_rules! atomic { ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("sub", $asm_suffix, " $1, $0") + concat!("sub.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -196,7 +190,7 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("and", $asm_suffix, " {val}, 0({dst})"), + concat!("and.", $size, " {val}, 0({dst})"), // atomic { *dst &= val } dst = in(reg) dst, val = in(reg) val, // Do not use `preserves_flags` because AND modifies the V, N, Z, and C bits of the status register. @@ -204,7 +198,7 @@ macro_rules! atomic { ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("and", $asm_suffix, " $1, $0") + concat!("and.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -218,14 +212,14 @@ macro_rules! 
atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("bis", $asm_suffix, " {val}, 0({dst})"), + concat!("bis.", $size, " {val}, 0({dst})"), // atomic { *dst |= val } dst = in(reg) dst, val = in(reg) val, options(nostack, preserves_flags), ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("bis", $asm_suffix, " $1, $0") + concat!("bis.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -239,7 +233,7 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("xor", $asm_suffix, " {val}, 0({dst})"), + concat!("xor.", $size, " {val}, 0({dst})"), // atomic { *dst ^= val } dst = in(reg) dst, val = in(reg) val, // Do not use `preserves_flags` because XOR modifies the V, N, Z, and C bits of the status register. @@ -247,7 +241,7 @@ macro_rules! atomic { ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("xor", $asm_suffix, " $1, $0") + concat!("xor.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -261,26 +255,26 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("inv", $asm_suffix, " 0({dst})"), + concat!("inv.", $size, " 0({dst})"), // atomic { *dst = !*dst } dst = in(reg) dst, // Do not use `preserves_flags` because INV modifies the V, N, Z, and C bits of the status register. options(nostack), ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("inv", $asm_suffix, " $0") + concat!("inv.", $size, " $0") :: "*m"(dst) : "memory" : "volatile" ); } } } - } + }; } -atomic!(AtomicI8, i8, ".b"); -atomic!(AtomicU8, u8, ".b"); -atomic!(AtomicI16, i16, ".w"); -atomic!(AtomicU16, u16, ".w"); -atomic!(AtomicIsize, isize, ".w"); -atomic!(AtomicUsize, usize, ".w"); -atomic!(load_store, [T] AtomicPtr, *mut T, ".w"); +atomic!(AtomicI8, i8, "b"); +atomic!(AtomicU8, u8, "b"); +atomic!(AtomicI16, i16, "w"); +atomic!(AtomicU16, u16, "w"); +atomic!(AtomicIsize, isize, "w"); +atomic!(AtomicUsize, usize, "w"); +atomic!(load_store, [T] AtomicPtr, *mut T, "w"); diff --git a/src/imp/riscv.rs b/src/imp/riscv.rs index 178494bf..aba0ac9a 100644 --- a/src/imp/riscv.rs +++ b/src/imp/riscv.rs @@ -6,7 +6,11 @@ Atomic load/store implementation on RISC-V. This is for RISC-V targets without atomic CAS. (rustc doesn't provide atomics at all on such targets. https://github.com/rust-lang/rust/pull/114499) -Also, optionally provides RMW implementation when force-amo or Zaamo target feature is enabled. +Also, optionally provides RMW implementation when Zaamo extension or force-amo feature is enabled. + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#risc-v Refs: - RISC-V Instruction Set Manual @@ -16,7 +20,8 @@ Refs: https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/zabha.adoc - RISC-V Atomics ABI Specification https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/draft-20240829-13bfa9f54634cb60d86b9b333e109f077805b4b3/riscv-atomic.adoc -- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit Generated asm: - riscv64gc https://godbolt.org/z/Ws933n9jE @@ -86,7 +91,7 @@ macro_rules! atomic_rmw_amo_ext { portable_atomic_target_feature = "zaamo", ))] macro_rules! 
atomic_rmw_amo { - ($op:ident, $dst:ident, $val:ident, $order:ident, $asm_suffix:tt) => {{ + ($op:ident, $dst:ident, $val:ident, $order:ident, $size:tt) => {{ let out; macro_rules! op { ($asm_order:tt) => { @@ -97,12 +102,12 @@ macro_rules! atomic_rmw_amo { asm!( ".option push", // https://github.com/riscv-non-isa/riscv-asm-manual/blob/ad0de8c004e29c9a7ac33cfd054f4d4f9392f2fb/src/asm-manual.adoc#arch - // LLVM supports `.option arch` directive on LLVM 17+, so use .insn directive on old LLVM. + // LLVM supports `.option arch` directive on LLVM 17+. // https://github.com/llvm/llvm-project/commit/9e8ed3403c191ab9c4903e8eeb8f732ff8a43cb4 // Note that `.insn ` directive requires LLVM 19. // https://github.com/llvm/llvm-project/commit/2a086dce691e3cc34a2fc27f4fb255bb2cbbfac9 - concat!(".option arch, ", atomic_rmw_amo_ext!($asm_suffix)), - concat!("amo", stringify!($op), ".", $asm_suffix, $asm_order, " {out}, {val}, 0({dst})"), + concat!(".option arch, ", atomic_rmw_amo_ext!($size)), + concat!("amo", stringify!($op), ".", $size, $asm_order, " {out}, {val}, 0({dst})"), // atomic { _x = *dst; *dst = op(_x, val); out = _x } ".option pop", dst = in(reg) ptr_reg!($dst), val = in(reg) $val, @@ -136,7 +141,7 @@ fn sllw(val: u32, shift: u32) -> u32 { unsafe { let out; asm!( - concat!("sll", w!(), " {out}, {val}, {shift}"), + concat!("sll", w!(), " {out}, {val}, {shift}"), // out = val << shift & 31 out = lateout(reg) out, val = in(reg) val, shift = in(reg) shift, @@ -161,7 +166,7 @@ macro_rules! srlw { let shift: u32 = $shift; let out; asm!( - concat!("srl", w!(), " {out}, {val}, {shift}"), + concat!("srl", w!(), " {out}, {val}, {shift}"), // out = val >> shift & 31 out = lateout(reg) out, val = in(reg) val, shift = in(reg) shift, @@ -173,7 +178,7 @@ macro_rules! srlw { } macro_rules! atomic_load_store { - ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $asm_suffix:tt) => { + ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $size:tt) => { #[repr(transparent)] pub(crate) struct $atomic_type $(<$($generics)*>)? { v: UnsafeCell<$value_type>, @@ -216,9 +221,9 @@ macro_rules! atomic_load_store { macro_rules! atomic_load { ($acquire:tt, $release:tt) => { asm!( - $release, - concat!("l", $asm_suffix, " {out}, 0({src})"), - $acquire, + $release, // fence + concat!("l", $size, " {out}, 0({src})"), // atomic { out = *src } + $acquire, // fence src = in(reg) ptr_reg!(src), out = lateout(reg) out, options(nostack, preserves_flags), @@ -246,9 +251,9 @@ macro_rules! atomic_load_store { macro_rules! atomic_store { ($acquire:tt, $release:tt) => { asm!( - $release, - concat!("s", $asm_suffix, " {val}, 0({dst})"), - $acquire, + $release, // fence + concat!("s", $size, " {val}, 0({dst})"), // atomic { *dst = val } + $acquire, // fence dst = in(reg) ptr_reg!(dst), val = in(reg) val, options(nostack, preserves_flags), @@ -269,8 +274,8 @@ macro_rules! atomic_load_store { } macro_rules! atomic_ptr { - ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $asm_suffix:tt) => { - atomic_load_store!($([$($generics)*])? $atomic_type, $value_type, $asm_suffix); + ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $size:tt) => { + atomic_load_store!($([$($generics)*])? $atomic_type, $value_type, $size); #[cfg(any( test, portable_atomic_force_amo, @@ -283,15 +288,15 @@ macro_rules! atomic_ptr { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. 
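// Aside on the hunk further below where `val` is `!0` and the operation is an AMO
// `xor` (presumably the not/fetch_not implementation): it relies on the identity
// !x == x ^ !0. A minimal sketch of the same identity with the standard atomic API;
// on RISC-V with the A/Zaamo extension, fetch_xor typically lowers to amoxor.w.
use core::sync::atomic::{AtomicU32, Ordering};

fn fetch_not_sketch(a: &AtomicU32) -> u32 {
    a.fetch_xor(!0, Ordering::SeqCst) // returns the previous value, i.e. fetch_not
}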
- unsafe { atomic_rmw_amo!(swap, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(swap, dst, val, order, $size) } } } }; } macro_rules! atomic { - ($atomic_type:ident, $value_type:ty, $asm_suffix:tt, $max:tt, $min:tt) => { - atomic_load_store!($atomic_type, $value_type, $asm_suffix); + ($atomic_type:ident, $value_type:ty, $size:tt, $max:tt, $min:tt) => { + atomic_load_store!($atomic_type, $value_type, $size); #[cfg(any( test, portable_atomic_force_amo, @@ -321,7 +326,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(swap, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(swap, dst, val, order, $size) } } #[inline] @@ -329,7 +334,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(add, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(add, dst, val, order, $size) } } #[inline] @@ -342,7 +347,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(and, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(and, dst, val, order, $size) } } #[inline] @@ -350,7 +355,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(or, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(or, dst, val, order, $size) } } #[inline] @@ -358,7 +363,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(xor, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(xor, dst, val, order, $size) } } #[inline] @@ -370,7 +375,7 @@ macro_rules! atomic { let val: u64 = !0; // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(xor, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(xor, dst, val, order, $size) } } #[cfg(not(any( portable_atomic_unsafe_assume_single_core, @@ -386,7 +391,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!($max, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!($max, dst, val, order, $size) } } #[inline] @@ -394,7 +399,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!($min, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!($min, dst, val, order, $size) } } } }; @@ -446,11 +451,11 @@ zero_extend!(i8, u8); zero_extend!(i16, u16); macro_rules! 
atomic_sub_word { - ($atomic_type:ident, $value_type:ty, $asm_suffix:tt, $max:tt, $min:tt) => { + ($atomic_type:ident, $value_type:ty, $size:tt, $max:tt, $min:tt) => { #[cfg(any(target_feature = "zabha", portable_atomic_target_feature = "zabha"))] - atomic!($atomic_type, $value_type, $asm_suffix, $max, $min); + atomic!($atomic_type, $value_type, $size, $max, $min); #[cfg(not(any(target_feature = "zabha", portable_atomic_target_feature = "zabha")))] - atomic_load_store!($atomic_type, $value_type, $asm_suffix); + atomic_load_store!($atomic_type, $value_type, $size); #[cfg(any( test, portable_atomic_force_amo,