From 7387776db5a4f90556eef604c8c52f91e343855a Mon Sep 17 00:00:00 2001
From: sayantn
Date: Tue, 25 Jun 2024 01:07:52 +0530
Subject: [PATCH] Fixed many intrinsics

Fixed avx512-fma, mask-load/store, stream, reduce-add and reduce-mul,
and load/store of mask32 and mask64. Added preserves-flags to the load
and store asm. Fixed the missing list.
---
 crates/core_arch/missing-x86.md          |  80 +-
 crates/core_arch/src/x86/avx.rs          |   8 +-
 crates/core_arch/src/x86/avx2.rs         |   2 -
 crates/core_arch/src/x86/avx512bw.rs     |  44 +-
 crates/core_arch/src/x86/avx512f.rs      | 982 ++++++++++-------
 crates/core_arch/src/x86/avx512vbmi2.rs  |  24 +-
 crates/core_arch/src/x86_64/avx512f.rs   |  30 -
 crates/stdarch-verify/tests/x86-intel.rs |  15 +-
 8 files changed, 475 insertions(+), 710 deletions(-)

diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md
index 7c1057fc1a..f1fa8cbd16 100644
--- a/crates/core_arch/missing-x86.md
+++ b/crates/core_arch/missing-x86.md
@@ -53,6 +53,7 @@
["AVX2"]

+ * [ ] [`_mm256_stream_load_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_load_si256)
 * [ ] [`_mm_broadcastsi128_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsi128_si256)

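[Reviewer note, not part of the patch: the stream-load intrinsics added to this
list replace the `TODO _mm256_stream_load_si256` comment removed from avx2.rs
below. A plain pointer read does not lower to `vmovntdqa`, so these will likely
need an asm-based implementation in the style of the masked loads touched by
this patch. A purely illustrative sketch — the name `stream_load_si256` is
hypothetical, not the crate's API:

    use core::arch::x86_64::__m256i;

    /// Non-temporal (streaming) 32-byte load; `mem_addr` must be 32-byte aligned.
    #[target_feature(enable = "avx2")]
    unsafe fn stream_load_si256(mem_addr: *const __m256i) -> __m256i {
        let dst: __m256i;
        core::arch::asm!(
            "vmovntdqa {dst}, [{p}]",
            p = in(reg) mem_addr,
            dst = out(ymm_reg) dst,
            // same option set the crate's masked-load asm uses
            options(pure, readonly, nostack, preserves_flags),
        );
        dst
    }
]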
@@ -174,6 +175,7 @@
 * [ ] [`_mm512_mask_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
 * [ ] [`_mm512_mask_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
 * [ ] [`_mm512_mask_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd)
+ * [ ] [`_mm512_stream_load_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_load_si512)
 * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd)
 * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss)
 * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd)
@@ -1539,95 +1541,25 @@
["SSE"]

- * [ ] [`_m_maskmovq`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq)
- * [ ] [`_m_pavgb`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb)
- * [ ] [`_m_pavgw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw)
- * [ ] [`_m_pextrw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw)
- * [ ] [`_m_pinsrw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pinsrw)
- * [ ] [`_m_pmaxsw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw)
- * [ ] [`_m_pmaxub`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub)
- * [ ] [`_m_pminsw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw)
- * [ ] [`_m_pminub`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub)
- * [ ] [`_m_pmovmskb`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb)
- * [ ] [`_m_pmulhuw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw)
- * [ ] [`_m_psadbw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw)
- * [ ] [`_m_pshufw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw)
- * [ ] [`_mm_avg_pu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16)
- * [ ] [`_mm_avg_pu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8)
- * [ ] [`_mm_cvt_pi2ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps)
- * [ ] [`_mm_cvt_ps2pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi)
- * [ ] [`_mm_cvtpi16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps)
- * [ ] [`_mm_cvtpi32_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps)
- * [ ] [`_mm_cvtpi32x2_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps)
- * [ ] [`_mm_cvtpi8_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps)
- * [ ] [`_mm_cvtps_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16)
- * [ ] [`_mm_cvtps_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32)
- * [ ] [`_mm_cvtps_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8)
- * [ ] [`_mm_cvtpu16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps)
- * [ ] [`_mm_cvtpu8_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps)
- * [ ] [`_mm_cvtt_ps2pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi)
- * [ ] [`_mm_cvttps_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32)
- * [ ] [`_mm_extract_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16)
 * [ ] [`_mm_free`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free)
- * [ ] [`_mm_insert_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16)
- * [ ] [`_mm_loadh_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pi)
- * [ ] [`_mm_loadl_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pi)
 * [ ] [`_mm_malloc`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_malloc)
- * [ ] [`_mm_maskmove_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64)
- * [ ] [`_mm_max_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16)
- * [ ] [`_mm_max_pu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8)
- * [ ] [`_mm_min_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16)
- * [ ] [`_mm_min_pu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8)
- * [ ] [`_mm_movemask_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8)
- * [ ] [`_mm_mulhi_pu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16)
- * [ ] [`_mm_sad_pu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8)
- * [ ] [`_mm_shuffle_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16)
- * [ ] [`_mm_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps)
- * [ ] [`_mm_storeh_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pi)
- * [ ] [`_mm_storel_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pi)
- * [ ] [`_mm_stream_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi)

["SSE2"]

- * [ ] [`_mm_add_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64)
- * [ ] [`_mm_cvtpd_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32)
- * [ ] [`_mm_cvtpi32_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd)
- * [ ] [`_mm_cvttpd_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32)
 * [ ] [`_mm_loadu_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16)
 * [ ] [`_mm_loadu_si32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32)
- * [ ] [`_mm_movepi64_pi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64)
- * [ ] [`_mm_movpi64_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64)
- * [ ] [`_mm_mul_su32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32)
- * [ ] [`_mm_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64)
- * [ ] [`_mm_set_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi64)
- * [ ] [`_mm_setr_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64)
+ * [ ] [`_mm_loadu_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64)
 * [ ] [`_mm_storeu_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16)
 * [ ] [`_mm_storeu_si32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32)
 * [ ] [`_mm_storeu_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64)
- * [ ] [`_mm_sub_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64)

-
["SSSE3"]

-
- * [ ] [`_mm_abs_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16)
- * [ ] [`_mm_abs_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32)
- * [ ] [`_mm_abs_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8)
- * [ ] [`_mm_alignr_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8)
- * [ ] [`_mm_hadd_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16)
- * [ ] [`_mm_hadd_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32)
- * [ ] [`_mm_hadds_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16)
- * [ ] [`_mm_hsub_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16)
- * [ ] [`_mm_hsub_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi32)
- * [ ] [`_mm_hsubs_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16)
- * [ ] [`_mm_maddubs_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16)
- * [ ] [`_mm_mulhrs_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16)
- * [ ] [`_mm_shuffle_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8)
- * [ ] [`_mm_sign_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16)
- * [ ] [`_mm_sign_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32)
- * [ ] [`_mm_sign_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8)
+

["SSE4.1"]

+
+ * [ ] [`_mm_stream_load_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128)

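[Reviewer note, not part of the patch: the list removals above are the MMX
(`__m64`) intrinsics, which stdarch no longer supports, plus entries that are
already implemented (e.g. `_mm_shuffle_ps`). In the hunks that follow, the
avx.rs changes correct the stream-store mnemonics (`vmovntps` had been used as
a stand-in for `vmovntdq`/`vmovntpd`), and the avx512bw.rs changes add
`preserves_flags` to the masked load/store asm so the compiler knows EFLAGS
are not clobbered. A minimal caller-side sketch of the corrected stream store;
`stream_store`, `dst`, and `data` are hypothetical names:

    use core::arch::x86_64::{__m256i, _mm256_stream_si256, _mm_sfence};

    /// Stream `data` to `dst`, bypassing the cache.
    /// Safety: `dst` must be valid for writes and 32-byte aligned.
    #[target_feature(enable = "avx")]
    unsafe fn stream_store(dst: *mut __m256i, data: __m256i) {
        _mm256_stream_si256(dst, data); // lowers to vmovntdq after this fix
        _mm_sfence(); // order the non-temporal store before later stores
    }
]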
diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 3a993bf785..db17be9f21 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1715,11 +1715,11 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { /// See [`_mm_sfence`] for details. #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq +#[cfg_attr(test, assert_instr(vmovntdq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { crate::arch::asm!( - "vmovntps [{mem_addr}], {a}", + "vmovntdq [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), @@ -1742,12 +1742,12 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { /// See [`_mm_sfence`] for details. #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd +#[cfg_attr(test, assert_instr(vmovntpd))] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { crate::arch::asm!( - "vmovntps [{mem_addr}], {a}", + "vmovntpd [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 5227b549dd..f3dd0c8e4a 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -3124,8 +3124,6 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } -// TODO _mm256_stream_load_si256 (__m256i const* mem_addr) - /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 936d4a0041..2e3d2cc913 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4583,7 +4583,7 @@ pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *con p = in(reg) mem_addr, k = in(kreg) k, dst = inout(zmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4603,7 +4603,7 @@ pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __ p = in(reg) mem_addr, k = in(kreg) k, dst = out(zmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4623,7 +4623,7 @@ pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *cons p = in(reg) mem_addr, k = in(kreg) k, dst = inout(zmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4643,7 +4643,7 @@ pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m5 p = in(reg) mem_addr, k = in(kreg) k, dst = out(zmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4663,7 +4663,7 @@ pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *con p = in(reg) mem_addr, k = in(kreg) k, dst = inout(ymm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4683,7 +4683,7 @@ pub unsafe fn 
_mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __ p = in(reg) mem_addr, k = in(kreg) k, dst = out(ymm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4703,7 +4703,7 @@ pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *cons p = in(reg) mem_addr, k = in(kreg) k, dst = inout(ymm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4723,7 +4723,7 @@ pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m2 p = in(reg) mem_addr, k = in(kreg) k, dst = out(ymm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4743,7 +4743,7 @@ pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i p = in(reg) mem_addr, k = in(kreg) k, dst = inout(xmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4763,7 +4763,7 @@ pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128 p = in(reg) mem_addr, k = in(kreg) k, dst = out(xmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4783,7 +4783,7 @@ pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i p = in(reg) mem_addr, k = in(kreg) k, dst = inout(xmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4803,7 +4803,7 @@ pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i p = in(reg) mem_addr, k = in(kreg) k, dst = out(xmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4821,7 +4821,7 @@ pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: _ p = in(reg) mem_addr, mask = in(kreg) mask, a = in(zmm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4838,7 +4838,7 @@ pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m p = in(reg) mem_addr, mask = in(kreg) mask, a = in(zmm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4855,7 +4855,7 @@ pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: _ p = in(reg) mem_addr, mask = in(kreg) mask, a = in(ymm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4872,7 +4872,7 @@ pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m p = in(reg) mem_addr, mask = in(kreg) mask, a = in(ymm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4889,7 +4889,7 @@ pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m12 p = in(reg) mem_addr, mask = in(kreg) mask, a = in(xmm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4906,7 +4906,7 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128 p = in(reg) mem_addr, mask = in(kreg) mask, a = in(xmm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -8761,7 +8761,7 @@ pub unsafe fn _mm_mask_testn_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] //should be kmovq -pub unsafe fn _store_mask64(mem_addr: *mut u64, a: __mmask64) { +pub unsafe fn _store_mask64(mem_addr: *mut __mmask64, a: __mmask64) { 
ptr::write(mem_addr as *mut __mmask64, a); } @@ -8772,7 +8772,7 @@ pub unsafe fn _store_mask64(mem_addr: *mut u64, a: __mmask64) { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] //should be kmovd -pub unsafe fn _store_mask32(mem_addr: *mut u32, a: __mmask32) { +pub unsafe fn _store_mask32(mem_addr: *mut __mmask32, a: __mmask32) { ptr::write(mem_addr as *mut __mmask32, a); } @@ -8783,7 +8783,7 @@ pub unsafe fn _store_mask32(mem_addr: *mut u32, a: __mmask32) { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] //should be kmovq -pub unsafe fn _load_mask64(mem_addr: *const u64) -> __mmask64 { +pub unsafe fn _load_mask64(mem_addr: *const __mmask64) -> __mmask64 { ptr::read(mem_addr as *const __mmask64) } @@ -8794,7 +8794,7 @@ pub unsafe fn _load_mask64(mem_addr: *const u64) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] //should be kmovd -pub unsafe fn _load_mask32(mem_addr: *const u32) -> __mmask32 { +pub unsafe fn _load_mask32(mem_addr: *const __mmask32) -> __mmask32 { ptr::read(mem_addr as *const __mmask32) } diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 99d63734fd..20ca9d2a6d 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -50,7 +50,7 @@ pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i { let a = a.as_i32x16(); // all-0 is a properly initialized i32x16 let zero: i32x16 = mem::zeroed(); - let sub = simd_sub(zero, a); + let sub = simd_neg(a); let cmp: i32x16 = simd_gt(a, zero); transmute(simd_select(cmp, a, sub)) } @@ -145,7 +145,7 @@ pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i { let a = a.as_i64x8(); // all-0 is a properly initialized i64x8 let zero: i64x8 = mem::zeroed(); - let sub = simd_sub(zero, a); + let sub = simd_neg(a); let cmp: i64x8 = simd_gt(a, zero); transmute(simd_select(cmp, a, sub)) } @@ -186,7 +186,7 @@ pub unsafe fn _mm256_abs_epi64(a: __m256i) -> __m256i { let a = a.as_i64x4(); // all-0 is a properly initialized i64x4 let zero: i64x4 = mem::zeroed(); - let sub = simd_sub(zero, a); + let sub = simd_neg(a); let cmp: i64x4 = simd_gt(a, zero); transmute(simd_select(cmp, a, sub)) } @@ -3414,11 +3414,13 @@ pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - transmute(vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub)) + transmute(vfmadd132ps( + a.as_f32x16(), + b.as_f32x16(), + simd_neg(c.as_f32x16()), + )) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3427,7 +3429,7 @@ pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fmsub, a.as_f32x16())) @@ -3439,7 +3441,7 @@ pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -3452,7 +3454,7 @@ pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fmsub, c.as_f32x16())) @@ -3538,10 +3540,9 @@ pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); transmute(vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub)) } @@ -3551,7 +3552,7 @@ pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fmsub, a.as_f64x8())) @@ -3563,7 +3564,7 @@ pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -3576,7 +3577,7 @@ pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fmsub, c.as_f64x8())) @@ -3916,10 +3917,9 @@ pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); transmute(vfmaddsub213ps( a.as_f32x16(), b.as_f32x16(), @@ -3934,7 +3934,7 @@ pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x16())) @@ -3946,7 +3946,7 @@ pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -3959,7 +3959,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", 
issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x16())) @@ -4045,10 +4045,9 @@ pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); transmute(vfmaddsub213pd( a.as_f64x8(), b.as_f64x8(), @@ -4063,7 +4062,7 @@ pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x8())) @@ -4075,7 +4074,7 @@ pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -4088,7 +4087,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: _ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x8())) @@ -4174,10 +4173,9 @@ pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); + let sub = simd_neg(a.as_f32x16()); transmute(vfmadd132ps(sub, b.as_f32x16(), 
c.as_f32x16())) } @@ -4187,7 +4185,7 @@ pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fnmadd, a.as_f32x16())) @@ -4199,7 +4197,7 @@ pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -4212,7 +4210,7 @@ pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fnmadd, c.as_f32x16())) @@ -4298,10 +4296,9 @@ pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); + let sub = simd_neg(a.as_f64x8()); transmute(vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8())) } @@ -4311,7 +4308,7 @@ pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fnmadd, a.as_f64x8())) @@ -4323,7 +4320,7 @@ pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { let fnmadd = _mm512_fnmadd_pd(a, b, 
c).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -4336,7 +4333,7 @@ pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fnmadd, c.as_f64x8())) @@ -4422,11 +4419,10 @@ pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let subc = simd_sub(zero, c.as_f32x16()); + let suba = simd_neg(a.as_f32x16()); + let subc = simd_neg(c.as_f32x16()); transmute(vfmadd132ps(suba, b.as_f32x16(), subc)) } @@ -4436,7 +4432,7 @@ pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fnmsub, a.as_f32x16())) @@ -4448,7 +4444,7 @@ pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -4461,7 +4457,7 @@ pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fnmsub, c.as_f32x16())) @@ -4547,11 +4543,10 @@ pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn 
_mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let subc = simd_sub(zero, c.as_f64x8()); + let suba = simd_neg(a.as_f64x8()); + let subc = simd_neg(c.as_f64x8()); transmute(vfmadd132pd(suba, b.as_f64x8(), subc)) } @@ -4561,7 +4556,7 @@ pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fnmsub, a.as_f64x8())) @@ -4573,7 +4568,7 @@ pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -4586,7 +4581,7 @@ pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fnmsub, c.as_f64x8())) @@ -8377,7 +8372,7 @@ pub unsafe fn _mm512_mask3_fmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsub_round_ps( a: __m512, @@ -8385,8 +8380,7 @@ pub unsafe fn _mm512_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmadd132psround(a, b, sub, ROUNDING); @@ -8406,7 +8400,7 @@ pub unsafe fn _mm512_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsub_round_ps( a: __m512, @@ -8415,8 +8409,7 @@ pub unsafe fn 
_mm512_mask_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmadd132psround(a, b, sub, ROUNDING); @@ -8436,7 +8429,7 @@ pub unsafe fn _mm512_mask_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsub_round_ps( k: __mmask16, @@ -8446,7 +8439,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( ) -> __m512 { static_assert_rounding!(ROUNDING); let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmadd132psround(a, b, sub, ROUNDING); @@ -8466,7 +8459,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsub_round_ps( a: __m512, @@ -8475,9 +8468,8 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); let c = c.as_f32x16(); - let sub = simd_sub(zero, c); + let sub = simd_neg(c); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmadd132psround(a, b, sub, ROUNDING); @@ -8497,7 +8489,7 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsub_round_pd( a: __m512d, @@ -8505,8 +8497,7 @@ pub unsafe fn _mm512_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmadd132pdround(a, b, sub, ROUNDING); @@ -8526,7 +8517,7 @@ pub unsafe fn _mm512_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsub_round_pd( a: __m512d, @@ -8535,8 +8526,7 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmadd132pdround(a, b, sub, ROUNDING); @@ -8556,7 +8546,7 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsub_round_pd( k: __mmask8, @@ -8566,7 +8556,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( ) -> __m512d { static_assert_rounding!(ROUNDING); let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmadd132pdround(a, b, sub, ROUNDING); @@ -8586,7 +8576,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsub_round_pd( a: __m512d, @@ -8595,9 +8585,8 @@ pub unsafe fn _mm512_mask3_fmsub_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); let c = c.as_f64x8(); - let sub = simd_sub(zero, c); + let sub = simd_neg(c); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmadd132pdround(a, b, sub, ROUNDING); @@ -8849,7 +8838,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsubadd_round_ps( a: __m512, @@ -8857,8 +8846,7 @@ pub unsafe fn _mm512_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmaddsub213ps(a, b, sub, ROUNDING); @@ -8878,7 +8866,7 @@ pub unsafe fn _mm512_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsubadd_round_ps( a: __m512, @@ -8887,8 +8875,7 @@ pub unsafe fn 
_mm512_mask_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmaddsub213ps(a, b, sub, ROUNDING); @@ -8908,7 +8895,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsubadd_round_ps( k: __mmask16, @@ -8918,7 +8905,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( ) -> __m512 { static_assert_rounding!(ROUNDING); let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmaddsub213ps(a, b, sub, ROUNDING); @@ -8938,7 +8925,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsubadd_round_ps( a: __m512, @@ -8947,9 +8934,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); let c = c.as_f32x16(); - let sub = simd_sub(zero, c); + let sub = simd_neg(c); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmaddsub213ps(a, b, sub, ROUNDING); @@ -8969,7 +8955,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsubadd_round_pd( a: __m512d, @@ -8977,8 +8963,7 @@ pub unsafe fn _mm512_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmaddsub213pd(a, b, sub, ROUNDING); @@ -8998,7 +8983,7 @@ pub unsafe fn _mm512_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsubadd_round_pd( a: __m512d, @@ -9007,8 +8992,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmaddsub213pd(a, b, 
sub, ROUNDING); @@ -9028,7 +9012,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsubadd_round_pd( k: __mmask8, @@ -9038,7 +9022,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( ) -> __m512d { static_assert_rounding!(ROUNDING); let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmaddsub213pd(a, b, sub, ROUNDING); @@ -9058,7 +9042,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsubadd_round_pd( a: __m512d, @@ -9067,9 +9051,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); let c = c.as_f64x8(); - let sub = simd_sub(zero, c); + let sub = simd_neg(c); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmaddsub213pd(a, b, sub, ROUNDING); @@ -9089,7 +9072,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmadd_round_ps( a: __m512, @@ -9097,8 +9080,7 @@ pub unsafe fn _mm512_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); + let sub = simd_neg(a.as_f32x16()); let b = b.as_f32x16(); let c = c.as_f32x16(); let r = vfmadd132psround(sub, b, c, ROUNDING); @@ -9118,7 +9100,7 @@ pub unsafe fn _mm512_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmadd_round_ps( a: __m512, @@ -9127,8 +9109,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); + let sub = simd_neg(a.as_f32x16()); let b = b.as_f32x16(); let c = c.as_f32x16(); let r = vfmadd132psround(sub, b, c, ROUNDING); @@ -9148,7 +9129,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, 
assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_maskz_fnmadd_round_ps(
     k: __mmask16,
@@ -9158,7 +9139,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps(
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
     let zero: f32x16 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f32x16());
+    let sub = simd_neg(a.as_f32x16());
     let b = b.as_f32x16();
     let c = c.as_f32x16();
     let r = vfmadd132psround(sub, b, c, ROUNDING);
@@ -9178,7 +9159,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask3_fnmadd_round_ps(
     a: __m512,
@@ -9187,8 +9168,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps(
     k: __mmask16,
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
-    let zero: f32x16 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f32x16());
+    let sub = simd_neg(a.as_f32x16());
     let b = b.as_f32x16();
     let c = c.as_f32x16();
     let r = vfmadd132psround(sub, b, c, ROUNDING);
@@ -9208,7 +9188,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm512_fnmadd_round_pd(
     a: __m512d,
@@ -9216,8 +9196,7 @@ pub unsafe fn _mm512_fnmadd_round_pd(
     c: __m512d,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f64x8());
+    let sub = simd_neg(a.as_f64x8());
     let b = b.as_f64x8();
     let c = c.as_f64x8();
     let r = vfmadd132pdround(sub, b, c, ROUNDING);
@@ -9237,7 +9216,7 @@ pub unsafe fn _mm512_fnmadd_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask_fnmadd_round_pd(
     a: __m512d,
@@ -9246,9 +9225,8 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd(
     c: __m512d,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
     let a = a.as_f64x8();
-    let sub = simd_sub(zero, a);
+    let sub = simd_neg(a);
     let b = b.as_f64x8();
     let c = c.as_f64x8();
     let r = vfmadd132pdround(sub, b, c, ROUNDING);
@@ -9268,7 +9246,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_maskz_fnmadd_round_pd(
     k: __mmask8,
@@ -9278,7 +9256,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd(
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
     let zero: f64x8 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f64x8());
+    let sub = simd_neg(a.as_f64x8());
     let b = b.as_f64x8();
     let c = c.as_f64x8();
     let r = vfmadd132pdround(sub, b, c, ROUNDING);
@@ -9298,7 +9276,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask3_fnmadd_round_pd(
     a: __m512d,
@@ -9307,8 +9285,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd(
     k: __mmask8,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f64x8());
+    let sub = simd_neg(a.as_f64x8());
     let b = b.as_f64x8();
     let c = c.as_f64x8();
     let r = vfmadd132pdround(sub, b, c, ROUNDING);
@@ -9328,7 +9305,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm512_fnmsub_round_ps(
     a: __m512,
@@ -9336,9 +9313,8 @@ pub unsafe fn _mm512_fnmsub_round_ps(
     c: __m512,
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
-    let zero: f32x16 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f32x16());
-    let subc = simd_sub(zero, c.as_f32x16());
+    let suba = simd_neg(a.as_f32x16());
+    let subc = simd_neg(c.as_f32x16());
     let b = b.as_f32x16();
     let r = vfmadd132psround(suba, b, subc, ROUNDING);
     transmute(r)
@@ -9357,7 +9333,7 @@ pub unsafe fn _mm512_fnmsub_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask_fnmsub_round_ps(
     a: __m512,
@@ -9366,10 +9342,9 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps(
     c: __m512,
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
-    let zero: f32x16 = mem::zeroed();
     let a = a.as_f32x16();
-    let suba = simd_sub(zero, a);
-    let subc = simd_sub(zero, c.as_f32x16());
+    let suba = simd_neg(a);
+    let subc = simd_neg(c.as_f32x16());
     let b = b.as_f32x16();
     let r = vfmadd132psround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, a))
@@ -9388,7 +9363,7 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_maskz_fnmsub_round_ps(
     k: __mmask16,
@@ -9398,8 +9373,8 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps(
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
     let zero: f32x16 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f32x16());
-    let subc = simd_sub(zero, c.as_f32x16());
+    let suba = simd_neg(a.as_f32x16());
+    let subc = simd_neg(c.as_f32x16());
     let b = b.as_f32x16();
     let r = vfmadd132psround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, zero))
@@ -9418,7 +9393,7 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask3_fnmsub_round_ps(
     a: __m512,
@@ -9427,10 +9402,9 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps(
     k: __mmask16,
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
-    let zero: f32x16 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f32x16());
+    let suba = simd_neg(a.as_f32x16());
     let c = c.as_f32x16();
-    let subc = simd_sub(zero, c);
+    let subc = simd_neg(c);
     let b = b.as_f32x16();
     let r = vfmadd132psround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, c))
@@ -9449,7 +9423,7 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm512_fnmsub_round_pd(
     a: __m512d,
@@ -9457,9 +9431,8 @@ pub unsafe fn _mm512_fnmsub_round_pd(
     c: __m512d,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f64x8());
-    let subc = simd_sub(zero, c.as_f64x8());
+    let suba = simd_neg(a.as_f64x8());
+    let subc = simd_neg(c.as_f64x8());
     let b = b.as_f64x8();
     let r = vfmadd132pdround(suba, b, subc, ROUNDING);
     transmute(r)
@@ -9478,7 +9451,7 @@ pub unsafe fn _mm512_fnmsub_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask_fnmsub_round_pd(
     a: __m512d,
@@ -9487,10 +9460,9 @@ pub unsafe fn _mm512_mask_fnmsub_round_pd(
     c: __m512d,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
     let a = a.as_f64x8();
-    let suba = simd_sub(zero, a);
-    let subc = simd_sub(zero, c.as_f64x8());
+    let suba = simd_neg(a);
+    let subc = simd_neg(c.as_f64x8());
     let b = b.as_f64x8();
     let r = vfmadd132pdround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, a))
@@ -9509,7 +9481,7 @@ pub unsafe fn _mm512_mask_fnmsub_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_maskz_fnmsub_round_pd(
     k: __mmask8,
@@ -9519,8 +9491,8 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd(
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
     let zero: f64x8 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f64x8());
-    let subc = simd_sub(zero, c.as_f64x8());
+    let suba = simd_neg(a.as_f64x8());
+    let subc = simd_neg(c.as_f64x8());
     let b = b.as_f64x8();
     let r = vfmadd132pdround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, zero))
@@ -9539,7 +9511,7 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask3_fnmsub_round_pd(
     a: __m512d,
@@ -9548,10 +9520,9 @@ pub unsafe fn _mm512_mask3_fnmsub_round_pd(
     k: __mmask8,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f64x8());
+    let suba = simd_neg(a.as_f64x8());
     let c = c.as_f64x8();
-    let subc = simd_sub(zero, c);
+    let subc = simd_neg(c);
     let b = b.as_f64x8();
     let r = vfmadd132pdround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, c))
@@ -28037,11 +28008,11 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd
+#[cfg_attr(test, assert_instr(vmovntpd))]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
     crate::arch::asm!(
-        "vmovntps [{mem_addr}], {a}",
+        "vmovntpd [{mem_addr}], {a}",
         mem_addr = in(reg) mem_addr,
         a = in(zmm_reg) a,
         options(nostack, preserves_flags),
@@ -28063,11 +28034,11 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq
+#[cfg_attr(test, assert_instr(vmovntdq))]
 #[allow(clippy::cast_ptr_alignment)]
-pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
+pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) {
     crate::arch::asm!(
-        "vmovntps [{mem_addr}], {a}",
+        "vmovntdq [{mem_addr}], {a}",
         mem_addr = in(reg) mem_addr,
         a = in(zmm_reg) a,
         options(nostack, preserves_flags),
@@ -31306,7 +31277,14 @@ pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 {
-    simd_reduce_add_unordered(a.as_f32x16())
+    // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ
+    let a = _mm256_add_ps(
+        simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
+        simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
+    );
+    let a = _mm_add_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
+    let a = _mm_add_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
+    simd_extract::<_, f32>(a, 0) + simd_extract::<_, f32>(a, 1)
 }
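// A minimal scalar sketch (not part of the patch) of why the hunks above
// switch from `simd_sub(zero, x)` to `simd_neg(x)`: for IEEE floats the two
// disagree on signed zero (`0.0 - 0.0` is `+0.0`, while negating `0.0`
// yields `-0.0`), and FNMADD is specified as `-(a * b) + c`, a true negation
// of the multiplicand. The function names below are hypothetical.
fn fnmadd_scalar(a: f32, b: f32, c: f32) -> f32 {
    // Negate `a` itself rather than computing `0.0 - a`.
    (-a).mul_add(b, c)
}

fn demo_fnmadd() {
    assert_eq!(fnmadd_scalar(2.0, 3.0, 10.0), 4.0); // -(2 * 3) + 10
    // The signed-zero corner case that `simd_sub(zero, x)` gets wrong:
    assert!((0.0f32 - 0.0f32).is_sign_positive());
    assert!((-0.0f32).is_sign_negative());
}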
 /// Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
@@ -31316,11 +31294,7 @@ pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 {
-    simd_reduce_add_unordered(simd_select_bitmask(
-        k,
-        a.as_f32x16(),
-        _mm512_setzero_ps().as_f32x16(),
-    ))
+    _mm512_reduce_add_ps(simd_select_bitmask(k, a, _mm512_setzero_ps()))
 }

 /// Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
@@ -31330,7 +31304,12 @@ pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 {
-    simd_reduce_add_unordered(a.as_f64x8())
+    let a = _mm256_add_pd(
+        _mm512_extractf64x4_pd::<0>(a),
+        _mm512_extractf64x4_pd::<1>(a),
+    );
+    let a = _mm_add_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
+    simd_extract::<_, f64>(a, 0) + simd_extract::<_, f64>(a, 1)
 }

 /// Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
@@ -31340,11 +31319,7 @@ pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 {
-    simd_reduce_add_unordered(simd_select_bitmask(
-        k,
-        a.as_f64x8(),
-        _mm512_setzero_pd().as_f64x8(),
-    ))
+    _mm512_reduce_add_pd(simd_select_bitmask(k, a, _mm512_setzero_pd()))
 }

 /// Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a.
@@ -31402,7 +31377,14 @@ pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 {
-    simd_reduce_mul_unordered(a.as_f32x16())
+    // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ
+    let a = _mm256_mul_ps(
+        simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
+        simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
+    );
+    let a = _mm_mul_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
+    let a = _mm_mul_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
+    simd_extract::<_, f32>(a, 0) * simd_extract::<_, f32>(a, 1)
 }

 /// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -31412,11 +31394,7 @@ pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 {
-    simd_reduce_mul_unordered(simd_select_bitmask(
-        k,
-        a.as_f32x16(),
-        _mm512_set1_ps(1.).as_f32x16(),
-    ))
+    _mm512_reduce_mul_ps(simd_select_bitmask(k, a, _mm512_set1_ps(1.)))
 }
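// A minimal scalar sketch (not part of the patch) of the masked-reduction
// contract used above: inactive lanes are replaced with the identity element
// of the operation (0.0 for addition, 1.0 for multiplication) before the
// fixed pairwise reduction tree runs, so masked-off lanes cannot affect the
// result. Both helper names are hypothetical.
fn mask_reduce_add_scalar(k: u8, lanes: [f32; 4]) -> f32 {
    lanes
        .iter()
        .enumerate()
        .map(|(i, &x)| if k & (1 << i) != 0 { x } else { 0.0 })
        .sum()
}

fn mask_reduce_mul_scalar(k: u8, lanes: [f32; 4]) -> f32 {
    lanes
        .iter()
        .enumerate()
        .map(|(i, &x)| if k & (1 << i) != 0 { x } else { 1.0 })
        .product()
}

fn demo_reduce() {
    let v = [2.0, 4.0, 8.0, 16.0];
    assert_eq!(mask_reduce_add_scalar(0b0101, v), 10.0); // 2.0 + 8.0
    assert_eq!(mask_reduce_mul_scalar(0b0101, v), 16.0); // 2.0 * 8.0
}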
 /// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
@@ -31426,7 +31404,12 @@ pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 {
-    simd_reduce_mul_unordered(a.as_f64x8())
+    let a = _mm256_mul_pd(
+        _mm512_extractf64x4_pd::<0>(a),
+        _mm512_extractf64x4_pd::<1>(a),
+    );
+    let a = _mm_mul_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
+    simd_extract::<_, f64>(a, 0) * simd_extract::<_, f64>(a, 1)
 }

 /// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -31436,11 +31419,7 @@ pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 {
-    simd_reduce_mul_unordered(simd_select_bitmask(
-        k,
-        a.as_f64x8(),
-        _mm512_set1_pd(1.).as_f64x8(),
-    ))
+    _mm512_reduce_mul_pd(simd_select_bitmask(k, a, _mm512_set1_pd(1.)))
 }

 /// Reduce the packed signed 32-bit integers in a by maximum. Returns the maximum of all elements in a.
@@ -32794,7 +32773,7 @@ pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *con
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32814,7 +32793,7 @@ pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32834,7 +32813,7 @@ pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *cons
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32854,7 +32833,7 @@ pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32874,7 +32853,7 @@ pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32894,7 +32873,7 @@ pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m51
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32914,7 +32893,7 @@ pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32934,7 +32913,7 @@ pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32954,7 +32933,7 @@ pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *cons
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32974,7 +32953,7 @@ pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32994,7 +32973,7 @@ pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *cons
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33014,7 +32993,7 @@ pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33034,7 +33013,7 @@ pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f3
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33054,7 +33033,7 @@ pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33074,7 +33053,7 @@ pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33094,7 +33073,7 @@ pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33114,7 +33093,7 @@ pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33134,7 +33113,7 @@ pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33154,7 +33133,7 @@ pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33174,7 +33153,7 @@ pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33194,7 +33173,7 @@ pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33214,7 +33193,7 @@ pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33234,7 +33213,7 @@ pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33254,7 +33233,7 @@ pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33266,17 +33245,13 @@ pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi32)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i {
-    let mut dst: __m512i = src;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm512_set1_epi32(-1).as_i32x16();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i32x16()))
 }

 /// Load packed 32-bit integers from memory into dst using zeromask k
@@ -33286,17 +33261,10 @@ pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *cons
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi32)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i {
-    let mut dst: __m512i;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm512_mask_load_epi32(_mm512_setzero_epi32(), k, mem_addr)
 }

 /// Load packed 64-bit integers from memory into dst using writemask k
@@ -33306,17 +33274,13 @@ pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi64)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i {
-    let mut dst: __m512i = src;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm512_set1_epi64(-1).as_i64x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i64x8()))
 }

 /// Load packed 64-bit integers from memory into dst using zeromask k
@@ -33326,17 +33290,10 @@ pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi64)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i {
-    let mut dst: __m512i;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm512_mask_load_epi64(_mm512_setzero_si512(), k, mem_addr)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
@@ -33346,17 +33303,13 @@ pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m5
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 {
-    let mut dst: __m512 = src;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm512_set1_epi32(-1).as_i32x16();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
@@ -33366,17 +33319,10 @@ pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f3
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 {
-    let mut dst: __m512;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm512_mask_load_ps(_mm512_setzero_ps(), k, mem_addr)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
@@ -33386,17 +33332,13 @@ pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d {
-    let mut dst: __m512d = src;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm512_set1_epi64(-1).as_i64x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
@@ -33406,17 +33348,10 @@ pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f6
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d {
-    let mut dst: __m512d;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm512_mask_load_pd(_mm512_setzero_pd(), k, mem_addr)
 }

 /// Load packed 32-bit integers from memory into dst using writemask k
@@ -33425,18 +33360,14 @@ pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i {
-    let mut dst: __m256i = src;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm256_set1_epi32(-1).as_i32x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i32x8()))
 }

 /// Load packed 32-bit integers from memory into dst using zeromask k
@@ -33445,18 +33376,11 @@ pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i {
-    let mut dst: __m256i;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm256_mask_load_epi32(_mm256_setzero_si256(), k, mem_addr)
 }

 /// Load packed 64-bit integers from memory into dst using writemask k
@@ -33465,18 +33389,14 @@ pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m2
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i {
-    let mut dst: __m256i = src;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm256_set1_epi64x(-1).as_i64x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i64x4()))
 }

 /// Load packed 64-bit integers from memory into dst using zeromask k
@@ -33485,18 +33405,11 @@ pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i {
-    let mut dst: __m256i;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm256_mask_load_epi64(_mm256_setzero_si256(), k, mem_addr)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
@@ -33505,18 +33418,14 @@ pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m2
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 {
-    let mut dst: __m256 = src;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm256_set1_epi32(-1).as_i32x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
@@ -33525,18 +33434,11 @@ pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 {
-    let mut dst: __m256;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm256_mask_load_ps(_mm256_setzero_ps(), k, mem_addr)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
@@ -33545,18 +33447,14 @@ pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d {
-    let mut dst: __m256d = src;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm256_set1_epi64x(-1).as_i64x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
@@ -33565,18 +33463,11 @@ pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f6
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d {
-    let mut dst: __m256d;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm256_mask_load_pd(_mm256_setzero_pd(), k, mem_addr)
 }

 /// Load packed 32-bit integers from memory into dst using writemask k
@@ -33585,18 +33476,14 @@ pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i {
-    let mut dst: __m128i = src;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm_set1_epi32(-1).as_i32x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i32x4()))
 }

 /// Load packed 32-bit integers from memory into dst using zeromask k
@@ -33605,18 +33492,11 @@ pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i3
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i {
-    let mut dst: __m128i;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm_mask_load_epi32(_mm_setzero_si128(), k, mem_addr)
 }

 /// Load packed 64-bit integers from memory into dst using writemask k
@@ -33625,18 +33505,14 @@ pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i {
-    let mut dst: __m128i = src;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm_set1_epi64x(-1).as_i64x2();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i64x2()))
 }

 /// Load packed 64-bit integers from memory into dst using zeromask k
@@ -33645,18 +33521,11 @@ pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i6
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i {
-    let mut dst: __m128i;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm_mask_load_epi64(_mm_setzero_si128(), k, mem_addr)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
@@ -33665,18 +33534,14 @@ pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
-    let mut dst: __m128 = src;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm_set1_epi32(-1).as_i32x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
@@ -33685,18 +33550,11 @@ pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
-    let mut dst: __m128;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm_mask_load_ps(_mm_setzero_ps(), k, mem_addr)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
@@ -33705,18 +33563,14 @@ pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
-    let mut dst: __m128d = src;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm_set1_epi64x(-1).as_i64x2();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
@@ -33725,18 +33579,11 @@ pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64)
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
-    let mut dst: __m128d;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm_mask_load_pd(_mm_setzero_pd(), k, mem_addr)
 }
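// A minimal scalar sketch (not part of the patch) of the writemask-load
// semantics that `simd_masked_load` implements above: lane i is read from
// memory when bit i of k is set, and copied from `src` otherwise. The helper
// name is hypothetical and the array stands in for a suitably aligned
// allocation.
fn mask_load_scalar(src: [i32; 4], k: u8, mem: &[i32; 4]) -> [i32; 4] {
    let mut dst = src;
    for i in 0..4 {
        if k & (1 << i) != 0 {
            dst[i] = mem[i];
        }
    }
    dst
}

fn demo_mask_load() {
    let mem = [10, 20, 30, 40];
    assert_eq!(mask_load_scalar([-1; 4], 0b0110, &mem), [-1, 20, 30, -1]);
}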
 /// Store packed 32-bit integers from a into memory using writemask k.
@@ -33752,7 +33599,7 @@ pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: _
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(zmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33769,7 +33616,7 @@ pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(zmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33786,7 +33633,7 @@ pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m5
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(zmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33803,7 +33650,7 @@ pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m51
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(zmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33820,7 +33667,7 @@ pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(ymm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33837,7 +33684,7 @@ pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(ymm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33854,7 +33701,7 @@ pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m25
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(ymm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33871,7 +33718,7 @@ pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m25
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(ymm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33888,7 +33735,7 @@ pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m12
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(xmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33905,7 +33752,7 @@ pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m12
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(xmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33922,7 +33769,7 @@ pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128)
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(xmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33939,7 +33786,7 @@ pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d)
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(xmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33949,15 +33796,13 @@ pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d)
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi32)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) {
-    asm!(
-        vps!("vmovdqa32", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(zmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm512_set1_epi32(-1).as_i32x16();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i32x16())
 }

 /// Store packed 64-bit integers from a into memory using writemask k.
@@ -33966,15 +33811,13 @@ pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi64)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) {
-    asm!(
-        vps!("vmovdqa64", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(zmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm512_set1_epi64(-1).as_i64x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i64x8())
 }

 /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
@@ -33983,15 +33826,13 @@ pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) {
-    asm!(
-        vps!("vmovaps", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(zmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm512_set1_epi32(-1).as_i32x16();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }

 /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
@@ -34000,15 +33841,13 @@ pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m51
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) {
-    asm!(
-        vps!("vmovapd", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(zmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm512_set1_epi64(-1).as_i64x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }
 /// Store packed 32-bit integers from a into memory using writemask k.
@@ -34016,16 +33855,14 @@ pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) {
-    asm!(
-        vps!("vmovdqa32", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(ymm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm256_set1_epi32(-1).as_i32x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i32x8())
 }

 /// Store packed 64-bit integers from a into memory using writemask k.
@@ -34033,16 +33870,14 @@ pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) {
-    asm!(
-        vps!("vmovdqa64", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(ymm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm256_set1_epi64x(-1).as_i64x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i64x4())
 }

 /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
@@ -34050,16 +33885,14 @@ pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) {
-    asm!(
-        vps!("vmovaps", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(ymm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm256_set1_epi32(-1).as_i32x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }
 /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
@@ -34067,16 +33900,14 @@ pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) {
-    asm!(
-        vps!("vmovapd", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(ymm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm256_set1_epi64x(-1).as_i64x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }

 /// Store packed 32-bit integers from a into memory using writemask k.
@@ -34084,16 +33915,14 @@ pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) {
-    asm!(
-        vps!("vmovdqa32", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(xmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm_set1_epi32(-1).as_i32x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i32x4())
 }

 /// Store packed 64-bit integers from a into memory using writemask k.
@@ -34101,16 +33930,14 @@ pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) {
-    asm!(
-        vps!("vmovdqa64", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(xmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm_set1_epi64x(-1).as_i64x2();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i64x2())
 }
 /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
@@ -34118,16 +33945,14 @@ pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) {
-    asm!(
-        vps!("vmovaps", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(xmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm_set1_epi32(-1).as_i32x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }

 /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
@@ -34135,16 +33960,14 @@ pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) {
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) {
-    asm!(
-        vps!("vmovapd", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(xmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm_set1_epi64x(-1).as_i64x2();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }
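// A minimal scalar sketch (not part of the patch) of the recurring
// `simd_select_bitmask(k, ones, zero)` idiom in the load/store rewrites
// above: it expands a k-register bitmask into a per-lane all-ones/all-zeros
// vector, the mask shape that `simd_masked_load`/`simd_masked_store` expect.
// The helper name is hypothetical.
fn expand_bitmask_scalar(k: u8) -> [i32; 8] {
    let mut mask = [0i32; 8];
    for i in 0..8 {
        if k & (1 << i) != 0 {
            mask[i] = -1; // every bit set in this lane
        }
    }
    mask
}

fn demo_expand() {
    assert_eq!(expand_bitmask_scalar(0b1000_0001), [-1, 0, 0, 0, 0, 0, 0, -1]);
}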
@@ -34164,7 +33987,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi32(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34182,7 +34005,7 @@ pub unsafe fn _mm512_maskz_expandloadu_epi32(k: __mmask16, mem_addr: *const i32)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34204,7 +34027,7 @@ pub unsafe fn _mm256_mask_expandloadu_epi32(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34222,7 +34045,7 @@ pub unsafe fn _mm256_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34244,7 +34067,7 @@ pub unsafe fn _mm_mask_expandloadu_epi32(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34262,7 +34085,7 @@ pub unsafe fn _mm_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34284,7 +34107,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi64(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34302,7 +34125,7 @@ pub unsafe fn _mm512_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34324,7 +34147,7 @@ pub unsafe fn _mm256_mask_expandloadu_epi64(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34342,7 +34165,7 @@ pub unsafe fn _mm256_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34364,7 +34187,7 @@ pub unsafe fn _mm_mask_expandloadu_epi64(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
    );
     dst
 }
@@ -34382,7 +34205,7 @@ pub unsafe fn _mm_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34404,7 +34227,7 @@ pub unsafe fn _mm512_mask_expandloadu_ps(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34422,7 +34245,7 @@ pub unsafe fn _mm512_maskz_expandloadu_ps(k: __mmask16, mem_addr: *const f32) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34440,7 +34263,7 @@ pub unsafe fn _mm256_mask_expandloadu_ps(src: __m256, k: __mmask8, mem_addr: *co
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34458,7 +34281,7 @@ pub unsafe fn _mm256_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34476,7 +34299,7 @@ pub unsafe fn _mm_mask_expandloadu_ps(src: __m128, k: __mmask8, mem_addr: *const
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34494,7 +34317,7 @@ pub unsafe fn _mm_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34516,7 +34339,7 @@ pub unsafe fn _mm512_mask_expandloadu_pd(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34534,7 +34357,7 @@ pub unsafe fn _mm512_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34556,7 +34379,7 @@ pub unsafe fn _mm256_mask_expandloadu_pd(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34574,7 +34397,7 @@ pub unsafe fn _mm256_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34592,7 +34415,7 @@ pub unsafe fn _mm_mask_expandloadu_pd(src: __m128d, k: __mmask8, mem_addr: *cons
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34610,7 +34433,7 @@ pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -54387,6 +54210,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512f")]
+    #[cfg_attr(miri, ignore)]
     unsafe fn test_mm512_stream_ps() {
         #[repr(align(64))]
         struct Memory {
@@ -54401,6 +54225,38 @@
         }
     }
 
+    #[simd_test(enable = "avx512f")]
+    #[cfg_attr(miri, ignore)]
+    unsafe fn test_mm512_stream_pd() {
+        #[repr(align(64))]
+        struct Memory {
+            pub data: [f64; 8],
+        }
+        let a = _mm512_set1_pd(7.0);
+        let mut mem = Memory { data: [-1.0; 8] };
+
+        _mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
+        for i in 0..8 {
+            assert_eq!(mem.data[i], get_m512d(a, i));
+        }
+    }
+
+    #[simd_test(enable = "avx512f")]
+    #[cfg_attr(miri, ignore)]
+    unsafe fn test_mm512_stream_si512() {
+        #[repr(align(64))]
+        struct Memory {
+            pub data: [i64; 8],
+        }
+        let a = _mm512_set1_epi32(7);
+        let mut mem = Memory { data: [-1; 8] };
+
+        _mm512_stream_si512(mem.data.as_mut_ptr().cast(), a);
+        for i in 0..8 {
+            assert_eq!(mem.data[i], get_m512i(a, i));
+        }
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_reduce_add_epi32() {
         let a = _mm512_set1_epi32(1);
diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
index 979cad7284..4bbb6f44c2 100644
--- a/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -24,7 +24,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi16(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -42,7 +42,7 @@ pub unsafe fn _mm512_maskz_expandloadu_epi16(k: __mmask32, mem_addr: *const i16)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -64,7 +64,7 @@ pub unsafe fn _mm256_mask_expandloadu_epi16(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -82,7 +82,7 @@ pub unsafe fn _mm256_maskz_expandloadu_epi16(k: __mmask16, mem_addr: *const i16)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -104,7 +104,7 @@ pub unsafe fn _mm_mask_expandloadu_epi16(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -122,7 +122,7 @@ pub unsafe fn _mm_maskz_expandloadu_epi16(k: __mmask8, mem_addr: *const i16) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -144,7 +144,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi8(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -162,7 +162,7 @@ pub unsafe fn _mm512_maskz_expandloadu_epi8(k: __mmask64, mem_addr: *const i8) -
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -184,7 +184,7 @@ pub unsafe fn _mm256_mask_expandloadu_epi8(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -202,7 +202,7 @@ pub unsafe fn _mm256_maskz_expandloadu_epi8(k: __mmask32, mem_addr: *const i8) -
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -224,7 +224,7 @@ pub unsafe fn _mm_mask_expandloadu_epi8(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -242,7 +242,7 @@ pub unsafe fn _mm_maskz_expandloadu_epi8(k: __mmask16, mem_addr: *const i8) -> _
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index ce6bc89f5b..aa501bec3d 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -12012,36 +12012,6 @@ mod tests {
         assert_eq!(r, e);
     }
 
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_stream_pd() {
-        #[repr(align(64))]
-        struct Memory {
-            pub data: [f64; 8],
-        }
-        let a = _mm512_set1_pd(7.0);
-        let mut mem = Memory { data: [-1.0; 8] };
-
-        _mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
-        for i in 0..8 {
-            assert_eq!(mem.data[i], get_m512d(a, i));
-        }
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_stream_si512() {
-        #[repr(align(64))]
-        struct Memory {
-            pub data: [i64; 8],
-        }
-        let a = _mm512_set1_epi64(7);
-        let mut mem = Memory { data: [-1; 8] };
-
-        _mm512_stream_si512(&mut mem.data[0] as *mut i64, a);
-        for i in 0..8 {
-            assert_eq!(mem.data[i], get_m512i(a, i));
-        }
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_mask_set1_epi64() {
         let src = _mm512_set1_epi64(2);
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index a69d17e71c..c5773a0984 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -346,7 +346,10 @@ fn verify_all_signatures() {
             // unsigned integer, but all other _mm_shuffle_.. intrinsics
            // take a signed-integer. This breaks `_MM_SHUFFLE` for
            // `_mm_shuffle_ps`:
-            "_mm_shuffle_ps" => continue,
+            name@"_mm_shuffle_ps" => {
+                map.remove(name);
+                continue;
+            },
             _ => {}
         }
 
@@ -391,13 +394,19 @@ fn verify_all_signatures() {
 fn print_missing(map: &HashMap<&str, Vec<&Intrinsic>>, mut f: impl Write) -> io::Result<()> {
     let mut missing = BTreeMap::new(); // BTreeMap to keep the cpuids ordered
-    // we cannot use SVML and MMX, and MPX is not in LLVM, and intrinsics without any cpuid requirement
-    // are accessible from safe rust
+
+    // we cannot use SVML and MMX, and MPX is not in LLVM, and intrinsics without any cpuid requirement
+    // are accessible from safe rust
     for intrinsic in map.values().flatten().filter(|intrinsic| {
         intrinsic.tech != "SVML"
             && intrinsic.tech != "MMX"
             && !intrinsic.cpuid.is_empty()
             && !intrinsic.cpuid.contains(&"MPX".to_string())
+            && intrinsic.return_.type_ != "__m64"
+            && !intrinsic
+                .parameters
+                .iter()
+                .any(|param| param.type_.contains("__m64"))
     }) {
         missing
             .entry(&intrinsic.cpuid)
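The new filter in `print_missing` drops every intrinsic whose signature mentions the MMX `__m64` type, so MMX-typed intrinsics no longer appear in the generated missing list. A pared-down model of the predicate (the `Parameter`/`Intrinsic` structs here are simplified stand-ins, not the real stdarch-verify types):

```rust
// Simplified mirror of the added filter clauses: an intrinsic is
// skipped when its return type or any parameter type mentions __m64.
struct Parameter {
    type_: String,
}
struct Intrinsic {
    return_type: String,
    parameters: Vec<Parameter>,
}

fn uses_m64(i: &Intrinsic) -> bool {
    i.return_type == "__m64"
        || i.parameters.iter().any(|p| p.type_.contains("__m64"))
}

fn main() {
    let mmx_intrinsic = Intrinsic {
        return_type: "void".into(),
        parameters: vec![Parameter { type_: "__m64".into() }],
    };
    assert!(uses_m64(&mmx_intrinsic)); // would be filtered from the list
}
```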