From 7387776db5a4f90556eef604c8c52f91e343855a Mon Sep 17 00:00:00 2001
From: sayantn
Date: Tue, 25 Jun 2024 01:07:52 +0530
Subject: [PATCH] Fixed many intrinsics

Fixed avx512-fma, mask-load/store, stream, reduce-add and reduce-mul,
and load/store of mask32 and mask64. Added preserves-flags to the load
and store asm. Fixed the missing list.
---
 crates/core_arch/missing-x86.md          |  80 +-
 crates/core_arch/src/x86/avx.rs          |   8 +-
 crates/core_arch/src/x86/avx2.rs         |   2 -
 crates/core_arch/src/x86/avx512bw.rs     |  44 +-
 crates/core_arch/src/x86/avx512f.rs      | 982 ++++++++++-------
 crates/core_arch/src/x86/avx512vbmi2.rs  |  24 +-
 crates/core_arch/src/x86_64/avx512f.rs   |  30 -
 crates/stdarch-verify/tests/x86-intel.rs |  15 +-
 8 files changed, 475 insertions(+), 710 deletions(-)

diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md
index 7c1057fc1a..f1fa8cbd16 100644
--- a/crates/core_arch/missing-x86.md
+++ b/crates/core_arch/missing-x86.md
@@ -53,6 +53,7 @@
["AVX2"]

+ * [ ] [`_mm256_stream_load_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_load_si256)
 * [ ] [`_mm_broadcastsi128_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsi128_si256)

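[Reviewer note, not part of the patch: the stream-load intrinsics added to this
list replace the `TODO _mm256_stream_load_si256` comment removed from avx2.rs
below. A plain pointer read does not lower to `vmovntdqa`, so these will likely
need an asm-based implementation in the style of the masked loads touched by
this patch. A purely illustrative sketch — the name `stream_load_si256` is
hypothetical, not the crate's API:

    use core::arch::x86_64::__m256i;

    /// Non-temporal (streaming) 32-byte load; `mem_addr` must be 32-byte aligned.
    #[target_feature(enable = "avx2")]
    unsafe fn stream_load_si256(mem_addr: *const __m256i) -> __m256i {
        let dst: __m256i;
        core::arch::asm!(
            "vmovntdqa {dst}, [{p}]",
            p = in(reg) mem_addr,
            dst = out(ymm_reg) dst,
            // same option set the crate's masked-load asm uses
            options(pure, readonly, nostack, preserves_flags),
        );
        dst
    }
]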
@@ -174,6 +175,7 @@
 * [ ] [`_mm512_mask_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
 * [ ] [`_mm512_mask_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
 * [ ] [`_mm512_mask_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd)
+ * [ ] [`_mm512_stream_load_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_load_si512)
 * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd)
 * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss)
 * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd)
@@ -1539,95 +1541,25 @@
["SSE"]

- * [ ] [`_m_maskmovq`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq)
- * [ ] [`_m_pavgb`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb)
- * [ ] [`_m_pavgw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw)
- * [ ] [`_m_pextrw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw)
- * [ ] [`_m_pinsrw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pinsrw)
- * [ ] [`_m_pmaxsw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw)
- * [ ] [`_m_pmaxub`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub)
- * [ ] [`_m_pminsw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw)
- * [ ] [`_m_pminub`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub)
- * [ ] [`_m_pmovmskb`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb)
- * [ ] [`_m_pmulhuw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw)
- * [ ] [`_m_psadbw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw)
- * [ ] [`_m_pshufw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw)
- * [ ] [`_mm_avg_pu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16)
- * [ ] [`_mm_avg_pu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8)
- * [ ] [`_mm_cvt_pi2ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps)
- * [ ] [`_mm_cvt_ps2pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi)
- * [ ] [`_mm_cvtpi16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps)
- * [ ] [`_mm_cvtpi32_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps)
- * [ ] [`_mm_cvtpi32x2_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps)
- * [ ] [`_mm_cvtpi8_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps)
- * [ ] [`_mm_cvtps_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16)
- * [ ] [`_mm_cvtps_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32)
- * [ ] [`_mm_cvtps_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8)
- * [ ] [`_mm_cvtpu16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps)
- * [ ] [`_mm_cvtpu8_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps)
- * [ ] [`_mm_cvtt_ps2pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi)
- * [ ] [`_mm_cvttps_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32)
- * [ ] [`_mm_extract_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16)
 * [ ] [`_mm_free`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free)
- * [ ] [`_mm_insert_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16)
- * [ ] [`_mm_loadh_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pi)
- * [ ] [`_mm_loadl_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pi)
 * [ ] [`_mm_malloc`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_malloc)
- * [ ] [`_mm_maskmove_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64)
- * [ ] [`_mm_max_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16)
- * [ ] [`_mm_max_pu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8)
- * [ ] [`_mm_min_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16)
- * [ ] [`_mm_min_pu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8)
- * [ ] [`_mm_movemask_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8)
- * [ ] [`_mm_mulhi_pu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16)
- * [ ] [`_mm_sad_pu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8)
- * [ ] [`_mm_shuffle_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16)
- * [ ] [`_mm_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps)
- * [ ] [`_mm_storeh_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pi)
- * [ ] [`_mm_storel_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pi)
- * [ ] [`_mm_stream_pi`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi)

["SSE2"]

- * [ ] [`_mm_add_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64)
- * [ ] [`_mm_cvtpd_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32)
- * [ ] [`_mm_cvtpi32_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd)
- * [ ] [`_mm_cvttpd_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32)
 * [ ] [`_mm_loadu_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16)
 * [ ] [`_mm_loadu_si32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32)
- * [ ] [`_mm_movepi64_pi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64)
- * [ ] [`_mm_movpi64_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64)
- * [ ] [`_mm_mul_su32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32)
- * [ ] [`_mm_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64)
- * [ ] [`_mm_set_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi64)
- * [ ] [`_mm_setr_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64)
+ * [ ] [`_mm_loadu_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64)
 * [ ] [`_mm_storeu_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16)
 * [ ] [`_mm_storeu_si32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32)
 * [ ] [`_mm_storeu_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64)
- * [ ] [`_mm_sub_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64)

-
["SSSE3"]

-
- * [ ] [`_mm_abs_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16)
- * [ ] [`_mm_abs_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32)
- * [ ] [`_mm_abs_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8)
- * [ ] [`_mm_alignr_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8)
- * [ ] [`_mm_hadd_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16)
- * [ ] [`_mm_hadd_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32)
- * [ ] [`_mm_hadds_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16)
- * [ ] [`_mm_hsub_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16)
- * [ ] [`_mm_hsub_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi32)
- * [ ] [`_mm_hsubs_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16)
- * [ ] [`_mm_maddubs_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16)
- * [ ] [`_mm_mulhrs_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16)
- * [ ] [`_mm_shuffle_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8)
- * [ ] [`_mm_sign_pi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16)
- * [ ] [`_mm_sign_pi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32)
- * [ ] [`_mm_sign_pi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8)
+

["SSE4.1"]

+
+ * [ ] [`_mm_stream_load_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128)

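[Reviewer note, not part of the patch: the list removals above are the MMX
(`__m64`) intrinsics, which stdarch no longer supports, plus entries that are
already implemented (e.g. `_mm_shuffle_ps`). In the hunks that follow, the
avx.rs changes correct the stream-store mnemonics (`vmovntps` had been used as
a stand-in for `vmovntdq`/`vmovntpd`), and the avx512bw.rs changes add
`preserves_flags` to the masked load/store asm so the compiler knows EFLAGS
are not clobbered. A minimal caller-side sketch of the corrected stream store;
`stream_store`, `dst`, and `data` are hypothetical names:

    use core::arch::x86_64::{__m256i, _mm256_stream_si256, _mm_sfence};

    /// Stream `data` to `dst`, bypassing the cache.
    /// Safety: `dst` must be valid for writes and 32-byte aligned.
    #[target_feature(enable = "avx")]
    unsafe fn stream_store(dst: *mut __m256i, data: __m256i) {
        _mm256_stream_si256(dst, data); // lowers to vmovntdq after this fix
        _mm_sfence(); // order the non-temporal store before later stores
    }
]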
diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 3a993bf785..db17be9f21 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1715,11 +1715,11 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { /// See [`_mm_sfence`] for details. #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq +#[cfg_attr(test, assert_instr(vmovntdq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { crate::arch::asm!( - "vmovntps [{mem_addr}], {a}", + "vmovntdq [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), @@ -1742,12 +1742,12 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { /// See [`_mm_sfence`] for details. #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd +#[cfg_attr(test, assert_instr(vmovntpd))] #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { crate::arch::asm!( - "vmovntps [{mem_addr}], {a}", + "vmovntpd [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 5227b549dd..f3dd0c8e4a 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -3124,8 +3124,6 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } -// TODO _mm256_stream_load_si256 (__m256i const* mem_addr) - /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 936d4a0041..2e3d2cc913 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4583,7 +4583,7 @@ pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *con p = in(reg) mem_addr, k = in(kreg) k, dst = inout(zmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4603,7 +4603,7 @@ pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __ p = in(reg) mem_addr, k = in(kreg) k, dst = out(zmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4623,7 +4623,7 @@ pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *cons p = in(reg) mem_addr, k = in(kreg) k, dst = inout(zmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4643,7 +4643,7 @@ pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m5 p = in(reg) mem_addr, k = in(kreg) k, dst = out(zmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4663,7 +4663,7 @@ pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *con p = in(reg) mem_addr, k = in(kreg) k, dst = inout(ymm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4683,7 +4683,7 @@ pub unsafe fn 
_mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __ p = in(reg) mem_addr, k = in(kreg) k, dst = out(ymm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4703,7 +4703,7 @@ pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *cons p = in(reg) mem_addr, k = in(kreg) k, dst = inout(ymm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4723,7 +4723,7 @@ pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m2 p = in(reg) mem_addr, k = in(kreg) k, dst = out(ymm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4743,7 +4743,7 @@ pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i p = in(reg) mem_addr, k = in(kreg) k, dst = inout(xmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4763,7 +4763,7 @@ pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128 p = in(reg) mem_addr, k = in(kreg) k, dst = out(xmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4783,7 +4783,7 @@ pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i p = in(reg) mem_addr, k = in(kreg) k, dst = inout(xmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4803,7 +4803,7 @@ pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i p = in(reg) mem_addr, k = in(kreg) k, dst = out(xmm_reg) dst, - options(pure, readonly, nostack) + options(pure, readonly, nostack, preserves_flags) ); dst } @@ -4821,7 +4821,7 @@ pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: _ p = in(reg) mem_addr, mask = in(kreg) mask, a = in(zmm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4838,7 +4838,7 @@ pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m p = in(reg) mem_addr, mask = in(kreg) mask, a = in(zmm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4855,7 +4855,7 @@ pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: _ p = in(reg) mem_addr, mask = in(kreg) mask, a = in(ymm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4872,7 +4872,7 @@ pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m p = in(reg) mem_addr, mask = in(kreg) mask, a = in(ymm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4889,7 +4889,7 @@ pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m12 p = in(reg) mem_addr, mask = in(kreg) mask, a = in(xmm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -4906,7 +4906,7 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128 p = in(reg) mem_addr, mask = in(kreg) mask, a = in(xmm_reg) a, - options(nostack) + options(nostack, preserves_flags) ); } @@ -8761,7 +8761,7 @@ pub unsafe fn _mm_mask_testn_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] //should be kmovq -pub unsafe fn _store_mask64(mem_addr: *mut u64, a: __mmask64) { +pub unsafe fn _store_mask64(mem_addr: *mut __mmask64, a: __mmask64) { 
ptr::write(mem_addr as *mut __mmask64, a); } @@ -8772,7 +8772,7 @@ pub unsafe fn _store_mask64(mem_addr: *mut u64, a: __mmask64) { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] //should be kmovd -pub unsafe fn _store_mask32(mem_addr: *mut u32, a: __mmask32) { +pub unsafe fn _store_mask32(mem_addr: *mut __mmask32, a: __mmask32) { ptr::write(mem_addr as *mut __mmask32, a); } @@ -8783,7 +8783,7 @@ pub unsafe fn _store_mask32(mem_addr: *mut u32, a: __mmask32) { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] //should be kmovq -pub unsafe fn _load_mask64(mem_addr: *const u64) -> __mmask64 { +pub unsafe fn _load_mask64(mem_addr: *const __mmask64) -> __mmask64 { ptr::read(mem_addr as *const __mmask64) } @@ -8794,7 +8794,7 @@ pub unsafe fn _load_mask64(mem_addr: *const u64) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] //should be kmovd -pub unsafe fn _load_mask32(mem_addr: *const u32) -> __mmask32 { +pub unsafe fn _load_mask32(mem_addr: *const __mmask32) -> __mmask32 { ptr::read(mem_addr as *const __mmask32) } diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 99d63734fd..20ca9d2a6d 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -50,7 +50,7 @@ pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i { let a = a.as_i32x16(); // all-0 is a properly initialized i32x16 let zero: i32x16 = mem::zeroed(); - let sub = simd_sub(zero, a); + let sub = simd_neg(a); let cmp: i32x16 = simd_gt(a, zero); transmute(simd_select(cmp, a, sub)) } @@ -145,7 +145,7 @@ pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i { let a = a.as_i64x8(); // all-0 is a properly initialized i64x8 let zero: i64x8 = mem::zeroed(); - let sub = simd_sub(zero, a); + let sub = simd_neg(a); let cmp: i64x8 = simd_gt(a, zero); transmute(simd_select(cmp, a, sub)) } @@ -186,7 +186,7 @@ pub unsafe fn _mm256_abs_epi64(a: __m256i) -> __m256i { let a = a.as_i64x4(); // all-0 is a properly initialized i64x4 let zero: i64x4 = mem::zeroed(); - let sub = simd_sub(zero, a); + let sub = simd_neg(a); let cmp: i64x4 = simd_gt(a, zero); transmute(simd_select(cmp, a, sub)) } @@ -3414,11 +3414,13 @@ pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - transmute(vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub)) + transmute(vfmadd132ps( + a.as_f32x16(), + b.as_f32x16(), + simd_neg(c.as_f32x16()), + )) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3427,7 +3429,7 @@ pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fmsub, a.as_f32x16())) @@ -3439,7 +3441,7 @@ pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -3452,7 +3454,7 @@ pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fmsub, c.as_f32x16())) @@ -3538,10 +3540,9 @@ pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); transmute(vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub)) } @@ -3551,7 +3552,7 @@ pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fmsub, a.as_f64x8())) @@ -3563,7 +3564,7 @@ pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -3576,7 +3577,7 @@ pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fmsub, c.as_f64x8())) @@ -3916,10 +3917,9 @@ pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); transmute(vfmaddsub213ps( a.as_f32x16(), b.as_f32x16(), @@ -3934,7 +3934,7 @@ pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x16())) @@ -3946,7 +3946,7 @@ pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -3959,7 +3959,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", 
issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x16())) @@ -4045,10 +4045,9 @@ pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); transmute(vfmaddsub213pd( a.as_f64x8(), b.as_f64x8(), @@ -4063,7 +4062,7 @@ pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x8())) @@ -4075,7 +4074,7 @@ pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -4088,7 +4087,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: _ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x8())) @@ -4174,10 +4173,9 @@ pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); + let sub = simd_neg(a.as_f32x16()); transmute(vfmadd132ps(sub, b.as_f32x16(), 
c.as_f32x16())) } @@ -4187,7 +4185,7 @@ pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fnmadd, a.as_f32x16())) @@ -4199,7 +4197,7 @@ pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -4212,7 +4210,7 @@ pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fnmadd, c.as_f32x16())) @@ -4298,10 +4296,9 @@ pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); + let sub = simd_neg(a.as_f64x8()); transmute(vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8())) } @@ -4311,7 +4308,7 @@ pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fnmadd, a.as_f64x8())) @@ -4323,7 +4320,7 @@ pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { let fnmadd = _mm512_fnmadd_pd(a, b, 
c).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -4336,7 +4333,7 @@ pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fnmadd, c.as_f64x8())) @@ -4422,11 +4419,10 @@ pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let subc = simd_sub(zero, c.as_f32x16()); + let suba = simd_neg(a.as_f32x16()); + let subc = simd_neg(c.as_f32x16()); transmute(vfmadd132ps(suba, b.as_f32x16(), subc)) } @@ -4436,7 +4432,7 @@ pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fnmsub, a.as_f32x16())) @@ -4448,7 +4444,7 @@ pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -4461,7 +4457,7 @@ pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); transmute(simd_select_bitmask(k, fnmsub, c.as_f32x16())) @@ -4547,11 +4543,10 @@ pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn 
_mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let subc = simd_sub(zero, c.as_f64x8()); + let suba = simd_neg(a.as_f64x8()); + let subc = simd_neg(c.as_f64x8()); transmute(vfmadd132pd(suba, b.as_f64x8(), subc)) } @@ -4561,7 +4556,7 @@ pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fnmsub, a.as_f64x8())) @@ -4573,7 +4568,7 @@ pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -4586,7 +4581,7 @@ pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); transmute(simd_select_bitmask(k, fnmsub, c.as_f64x8())) @@ -8377,7 +8372,7 @@ pub unsafe fn _mm512_mask3_fmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsub_round_ps( a: __m512, @@ -8385,8 +8380,7 @@ pub unsafe fn _mm512_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmadd132psround(a, b, sub, ROUNDING); @@ -8406,7 +8400,7 @@ pub unsafe fn _mm512_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsub_round_ps( a: __m512, @@ -8415,8 +8409,7 @@ pub unsafe fn 
_mm512_mask_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmadd132psround(a, b, sub, ROUNDING); @@ -8436,7 +8429,7 @@ pub unsafe fn _mm512_mask_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsub_round_ps( k: __mmask16, @@ -8446,7 +8439,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( ) -> __m512 { static_assert_rounding!(ROUNDING); let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmadd132psround(a, b, sub, ROUNDING); @@ -8466,7 +8459,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsub_round_ps( a: __m512, @@ -8475,9 +8468,8 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); let c = c.as_f32x16(); - let sub = simd_sub(zero, c); + let sub = simd_neg(c); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmadd132psround(a, b, sub, ROUNDING); @@ -8497,7 +8489,7 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsub_round_pd( a: __m512d, @@ -8505,8 +8497,7 @@ pub unsafe fn _mm512_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmadd132pdround(a, b, sub, ROUNDING); @@ -8526,7 +8517,7 @@ pub unsafe fn _mm512_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsub_round_pd( a: __m512d, @@ -8535,8 +8526,7 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmadd132pdround(a, b, sub, ROUNDING); @@ -8556,7 +8546,7 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsub_round_pd( k: __mmask8, @@ -8566,7 +8556,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( ) -> __m512d { static_assert_rounding!(ROUNDING); let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmadd132pdround(a, b, sub, ROUNDING); @@ -8586,7 +8576,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsub_round_pd( a: __m512d, @@ -8595,9 +8585,8 @@ pub unsafe fn _mm512_mask3_fmsub_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); let c = c.as_f64x8(); - let sub = simd_sub(zero, c); + let sub = simd_neg(c); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmadd132pdround(a, b, sub, ROUNDING); @@ -8849,7 +8838,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsubadd_round_ps( a: __m512, @@ -8857,8 +8846,7 @@ pub unsafe fn _mm512_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmaddsub213ps(a, b, sub, ROUNDING); @@ -8878,7 +8866,7 @@ pub unsafe fn _mm512_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsubadd_round_ps( a: __m512, @@ -8887,8 +8875,7 @@ pub unsafe fn 
_mm512_mask_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmaddsub213ps(a, b, sub, ROUNDING); @@ -8908,7 +8895,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsubadd_round_ps( k: __mmask16, @@ -8918,7 +8905,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( ) -> __m512 { static_assert_rounding!(ROUNDING); let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); + let sub = simd_neg(c.as_f32x16()); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmaddsub213ps(a, b, sub, ROUNDING); @@ -8938,7 +8925,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsubadd_round_ps( a: __m512, @@ -8947,9 +8934,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); let c = c.as_f32x16(); - let sub = simd_sub(zero, c); + let sub = simd_neg(c); let a = a.as_f32x16(); let b = b.as_f32x16(); let r = vfmaddsub213ps(a, b, sub, ROUNDING); @@ -8969,7 +8955,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsubadd_round_pd( a: __m512d, @@ -8977,8 +8963,7 @@ pub unsafe fn _mm512_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmaddsub213pd(a, b, sub, ROUNDING); @@ -8998,7 +8983,7 @@ pub unsafe fn _mm512_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsubadd_round_pd( a: __m512d, @@ -9007,8 +8992,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmaddsub213pd(a, b, 
sub, ROUNDING); @@ -9028,7 +9012,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsubadd_round_pd( k: __mmask8, @@ -9038,7 +9022,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( ) -> __m512d { static_assert_rounding!(ROUNDING); let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); + let sub = simd_neg(c.as_f64x8()); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmaddsub213pd(a, b, sub, ROUNDING); @@ -9058,7 +9042,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsubadd_round_pd( a: __m512d, @@ -9067,9 +9051,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); let c = c.as_f64x8(); - let sub = simd_sub(zero, c); + let sub = simd_neg(c); let a = a.as_f64x8(); let b = b.as_f64x8(); let r = vfmaddsub213pd(a, b, sub, ROUNDING); @@ -9089,7 +9072,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmadd_round_ps( a: __m512, @@ -9097,8 +9080,7 @@ pub unsafe fn _mm512_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); + let sub = simd_neg(a.as_f32x16()); let b = b.as_f32x16(); let c = c.as_f32x16(); let r = vfmadd132psround(sub, b, c, ROUNDING); @@ -9118,7 +9100,7 @@ pub unsafe fn _mm512_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmadd_round_ps( a: __m512, @@ -9127,8 +9109,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); + let sub = simd_neg(a.as_f32x16()); let b = b.as_f32x16(); let c = c.as_f32x16(); let r = vfmadd132psround(sub, b, c, ROUNDING); @@ -9148,7 +9129,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, 
assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_maskz_fnmadd_round_ps(
     k: __mmask16,
@@ -9158,7 +9139,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps(
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
     let zero: f32x16 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f32x16());
+    let sub = simd_neg(a.as_f32x16());
     let b = b.as_f32x16();
     let c = c.as_f32x16();
     let r = vfmadd132psround(sub, b, c, ROUNDING);
@@ -9178,7 +9159,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask3_fnmadd_round_ps(
     a: __m512,
@@ -9187,8 +9168,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps(
     k: __mmask16,
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
-    let zero: f32x16 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f32x16());
+    let sub = simd_neg(a.as_f32x16());
     let b = b.as_f32x16();
     let c = c.as_f32x16();
     let r = vfmadd132psround(sub, b, c, ROUNDING);
@@ -9208,7 +9188,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm512_fnmadd_round_pd(
     a: __m512d,
@@ -9216,8 +9196,7 @@ pub unsafe fn _mm512_fnmadd_round_pd(
     c: __m512d,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f64x8());
+    let sub = simd_neg(a.as_f64x8());
     let b = b.as_f64x8();
     let c = c.as_f64x8();
     let r = vfmadd132pdround(sub, b, c, ROUNDING);
@@ -9237,7 +9216,7 @@ pub unsafe fn _mm512_fnmadd_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask_fnmadd_round_pd(
     a: __m512d,
@@ -9246,9 +9225,8 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd(
     c: __m512d,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
     let a = a.as_f64x8();
-    let sub = simd_sub(zero, a);
+    let sub = simd_neg(a);
     let b = b.as_f64x8();
     let c = c.as_f64x8();
     let r = vfmadd132pdround(sub, b, c, ROUNDING);
@@ -9268,7 +9246,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_maskz_fnmadd_round_pd(
     k: __mmask8,
@@ -9278,7 +9256,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd(
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
     let zero: f64x8 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f64x8());
+    let sub = simd_neg(a.as_f64x8());
     let b = b.as_f64x8();
     let c = c.as_f64x8();
     let r = vfmadd132pdround(sub, b, c, ROUNDING);
@@ -9298,7 +9276,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
+#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask3_fnmadd_round_pd(
     a: __m512d,
@@ -9307,8 +9285,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd(
     k: __mmask8,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
-    let sub = simd_sub(zero, a.as_f64x8());
+    let sub = simd_neg(a.as_f64x8());
     let b = b.as_f64x8();
     let c = c.as_f64x8();
     let r = vfmadd132pdround(sub, b, c, ROUNDING);
@@ -9328,7 +9305,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm512_fnmsub_round_ps(
     a: __m512,
@@ -9336,9 +9313,8 @@ pub unsafe fn _mm512_fnmsub_round_ps(
     c: __m512,
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
-    let zero: f32x16 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f32x16());
-    let subc = simd_sub(zero, c.as_f32x16());
+    let suba = simd_neg(a.as_f32x16());
+    let subc = simd_neg(c.as_f32x16());
     let b = b.as_f32x16();
     let r = vfmadd132psround(suba, b, subc, ROUNDING);
     transmute(r)
@@ -9357,7 +9333,7 @@ pub unsafe fn _mm512_fnmsub_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask_fnmsub_round_ps(
     a: __m512,
@@ -9366,10 +9342,9 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps(
     c: __m512,
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
-    let zero: f32x16 = mem::zeroed();
     let a = a.as_f32x16();
-    let suba = simd_sub(zero, a);
-    let subc = simd_sub(zero, c.as_f32x16());
+    let suba = simd_neg(a);
+    let subc = simd_neg(c.as_f32x16());
     let b = b.as_f32x16();
     let r = vfmadd132psround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, a))
@@ -9388,7 +9363,7 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_maskz_fnmsub_round_ps(
     k: __mmask16,
@@ -9398,8 +9373,8 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps(
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
     let zero: f32x16 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f32x16());
-    let subc = simd_sub(zero, c.as_f32x16());
+    let suba = simd_neg(a.as_f32x16());
+    let subc = simd_neg(c.as_f32x16());
     let b = b.as_f32x16();
     let r = vfmadd132psround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, zero))
@@ -9418,7 +9393,7 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask3_fnmsub_round_ps(
     a: __m512,
@@ -9427,10 +9402,9 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps(
     k: __mmask16,
 ) -> __m512 {
     static_assert_rounding!(ROUNDING);
-    let zero: f32x16 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f32x16());
+    let suba = simd_neg(a.as_f32x16());
     let c = c.as_f32x16();
-    let subc = simd_sub(zero, c);
+    let subc = simd_neg(c);
     let b = b.as_f32x16();
     let r = vfmadd132psround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, c))
@@ -9449,7 +9423,7 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn _mm512_fnmsub_round_pd(
     a: __m512d,
@@ -9457,9 +9431,8 @@ pub unsafe fn _mm512_fnmsub_round_pd(
     c: __m512d,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f64x8());
-    let subc = simd_sub(zero, c.as_f64x8());
+    let suba = simd_neg(a.as_f64x8());
+    let subc = simd_neg(c.as_f64x8());
     let b = b.as_f64x8();
     let r = vfmadd132pdround(suba, b, subc, ROUNDING);
     transmute(r)
@@ -9478,7 +9451,7 @@ pub unsafe fn _mm512_fnmsub_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask_fnmsub_round_pd(
     a: __m512d,
@@ -9487,10 +9460,9 @@ pub unsafe fn _mm512_mask_fnmsub_round_pd(
     c: __m512d,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
     let a = a.as_f64x8();
-    let suba = simd_sub(zero, a);
-    let subc = simd_sub(zero, c.as_f64x8());
+    let suba = simd_neg(a);
+    let subc = simd_neg(c.as_f64x8());
     let b = b.as_f64x8();
     let r = vfmadd132pdround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, a))
@@ -9509,7 +9481,7 @@ pub unsafe fn _mm512_mask_fnmsub_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_maskz_fnmsub_round_pd(
     k: __mmask8,
@@ -9519,8 +9491,8 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd(
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
     let zero: f64x8 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f64x8());
-    let subc = simd_sub(zero, c.as_f64x8());
+    let suba = simd_neg(a.as_f64x8());
+    let subc = simd_neg(c.as_f64x8());
     let b = b.as_f64x8();
     let r = vfmadd132pdround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, zero))
@@ -9539,7 +9511,7 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd(
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
+#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
 #[rustc_legacy_const_generics(4)]
 pub unsafe fn _mm512_mask3_fnmsub_round_pd(
     a: __m512d,
@@ -9548,10 +9520,9 @@ pub unsafe fn _mm512_mask3_fnmsub_round_pd(
     k: __mmask8,
 ) -> __m512d {
     static_assert_rounding!(ROUNDING);
-    let zero: f64x8 = mem::zeroed();
-    let suba = simd_sub(zero, a.as_f64x8());
+    let suba = simd_neg(a.as_f64x8());
     let c = c.as_f64x8();
-    let subc = simd_sub(zero, c);
+    let subc = simd_neg(c);
     let b = b.as_f64x8();
     let r = vfmadd132pdround(suba, b, subc, ROUNDING);
     transmute(simd_select_bitmask(k, r, c))
@@ -28037,11 +28008,11 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd
+#[cfg_attr(test, assert_instr(vmovntpd))]
 #[allow(clippy::cast_ptr_alignment)]
 pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
     crate::arch::asm!(
-        "vmovntps [{mem_addr}], {a}",
+        "vmovntpd [{mem_addr}], {a}",
         mem_addr = in(reg) mem_addr,
         a = in(zmm_reg) a,
         options(nostack, preserves_flags),
@@ -28063,11 +28034,11 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq
+#[cfg_attr(test, assert_instr(vmovntdq))]
 #[allow(clippy::cast_ptr_alignment)]
-pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
+pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) {
     crate::arch::asm!(
-        "vmovntps [{mem_addr}], {a}",
+        "vmovntdq [{mem_addr}], {a}",
         mem_addr = in(reg) mem_addr,
         a = in(zmm_reg) a,
         options(nostack, preserves_flags),
@@ -31306,7 +31277,14 @@ pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 {
-    simd_reduce_add_unordered(a.as_f32x16())
+    // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ
+    let a = _mm256_add_ps(
+        simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
+        simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
+    );
+    let a = _mm_add_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
+    let a = _mm_add_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
+    simd_extract::<_, f32>(a, 0) + simd_extract::<_, f32>(a, 1)
 }
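// A minimal scalar sketch (not part of the patch) of why the hunks above
// switch from `simd_sub(zero, x)` to `simd_neg(x)`: for IEEE floats the two
// disagree on signed zero (`0.0 - 0.0` is `+0.0`, while negating `0.0`
// yields `-0.0`), and FNMADD is specified as `-(a * b) + c`, a true negation
// of the multiplicand. The function names below are hypothetical.
fn fnmadd_scalar(a: f32, b: f32, c: f32) -> f32 {
    // Negate `a` itself rather than computing `0.0 - a`.
    (-a).mul_add(b, c)
}

fn demo_fnmadd() {
    assert_eq!(fnmadd_scalar(2.0, 3.0, 10.0), 4.0); // -(2 * 3) + 10
    // The signed-zero corner case that `simd_sub(zero, x)` gets wrong:
    assert!((0.0f32 - 0.0f32).is_sign_positive());
    assert!((-0.0f32).is_sign_negative());
}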
 /// Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
@@ -31316,11 +31294,7 @@ pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 {
-    simd_reduce_add_unordered(simd_select_bitmask(
-        k,
-        a.as_f32x16(),
-        _mm512_setzero_ps().as_f32x16(),
-    ))
+    _mm512_reduce_add_ps(simd_select_bitmask(k, a, _mm512_setzero_ps()))
 }

 /// Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
@@ -31330,7 +31304,12 @@ pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 {
-    simd_reduce_add_unordered(a.as_f64x8())
+    let a = _mm256_add_pd(
+        _mm512_extractf64x4_pd::<0>(a),
+        _mm512_extractf64x4_pd::<1>(a),
+    );
+    let a = _mm_add_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
+    simd_extract::<_, f64>(a, 0) + simd_extract::<_, f64>(a, 1)
 }

 /// Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
@@ -31340,11 +31319,7 @@ pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 {
-    simd_reduce_add_unordered(simd_select_bitmask(
-        k,
-        a.as_f64x8(),
-        _mm512_setzero_pd().as_f64x8(),
-    ))
+    _mm512_reduce_add_pd(simd_select_bitmask(k, a, _mm512_setzero_pd()))
 }

 /// Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a.
@@ -31402,7 +31377,14 @@ pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 {
-    simd_reduce_mul_unordered(a.as_f32x16())
+    // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ
+    let a = _mm256_mul_ps(
+        simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
+        simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
+    );
+    let a = _mm_mul_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
+    let a = _mm_mul_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
+    simd_extract::<_, f32>(a, 0) * simd_extract::<_, f32>(a, 1)
 }

 /// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -31412,11 +31394,7 @@ pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 {
-    simd_reduce_mul_unordered(simd_select_bitmask(
-        k,
-        a.as_f32x16(),
-        _mm512_set1_ps(1.).as_f32x16(),
-    ))
+    _mm512_reduce_mul_ps(simd_select_bitmask(k, a, _mm512_set1_ps(1.)))
 }
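// A minimal scalar sketch (not part of the patch) of the masked-reduction
// contract used above: inactive lanes are replaced with the identity element
// of the operation (0.0 for addition, 1.0 for multiplication) before the
// fixed pairwise reduction tree runs, so masked-off lanes cannot affect the
// result. Both helper names are hypothetical.
fn mask_reduce_add_scalar(k: u8, lanes: [f32; 4]) -> f32 {
    lanes
        .iter()
        .enumerate()
        .map(|(i, &x)| if k & (1 << i) != 0 { x } else { 0.0 })
        .sum()
}

fn mask_reduce_mul_scalar(k: u8, lanes: [f32; 4]) -> f32 {
    lanes
        .iter()
        .enumerate()
        .map(|(i, &x)| if k & (1 << i) != 0 { x } else { 1.0 })
        .product()
}

fn demo_reduce() {
    let v = [2.0, 4.0, 8.0, 16.0];
    assert_eq!(mask_reduce_add_scalar(0b0101, v), 10.0); // 2.0 + 8.0
    assert_eq!(mask_reduce_mul_scalar(0b0101, v), 16.0); // 2.0 * 8.0
}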
 /// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
@@ -31426,7 +31404,12 @@ pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 {
-    simd_reduce_mul_unordered(a.as_f64x8())
+    let a = _mm256_mul_pd(
+        _mm512_extractf64x4_pd::<0>(a),
+        _mm512_extractf64x4_pd::<1>(a),
+    );
+    let a = _mm_mul_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
+    simd_extract::<_, f64>(a, 0) * simd_extract::<_, f64>(a, 1)
 }

 /// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -31436,11 +31419,7 @@ pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 {
-    simd_reduce_mul_unordered(simd_select_bitmask(
-        k,
-        a.as_f64x8(),
-        _mm512_set1_pd(1.).as_f64x8(),
-    ))
+    _mm512_reduce_mul_pd(simd_select_bitmask(k, a, _mm512_set1_pd(1.)))
 }

 /// Reduce the packed signed 32-bit integers in a by maximum. Returns the maximum of all elements in a.
@@ -32794,7 +32773,7 @@ pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *con
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32814,7 +32793,7 @@ pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32834,7 +32813,7 @@ pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *cons
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32854,7 +32833,7 @@ pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32874,7 +32853,7 @@ pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32894,7 +32873,7 @@ pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m51
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32914,7 +32893,7 @@ pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32934,7 +32913,7 @@ pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32954,7 +32933,7 @@ pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *cons
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32974,7 +32953,7 @@ pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -32994,7 +32973,7 @@ pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *cons
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33014,7 +32993,7 @@ pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33034,7 +33013,7 @@ pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f3
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33054,7 +33033,7 @@ pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33074,7 +33053,7 @@ pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33094,7 +33073,7 @@ pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33114,7 +33093,7 @@ pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33134,7 +33113,7 @@ pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33154,7 +33133,7 @@ pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33174,7 +33153,7 @@ pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33194,7 +33173,7 @@ pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33214,7 +33193,7 @@ pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33234,7 +33213,7 @@ pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33254,7 +33233,7 @@ pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -33266,17 +33245,13 @@ pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi32)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i {
-    let mut dst: __m512i = src;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm512_set1_epi32(-1).as_i32x16();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i32x16()))
 }

 /// Load packed 32-bit integers from memory into dst using zeromask k
@@ -33286,17 +33261,10 @@ pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *cons
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi32)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i {
-    let mut dst: __m512i;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm512_mask_load_epi32(_mm512_setzero_epi32(), k, mem_addr)
 }

 /// Load packed 64-bit integers from memory into dst using writemask k
@@ -33306,17 +33274,13 @@ pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi64)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i {
-    let mut dst: __m512i = src;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm512_set1_epi64(-1).as_i64x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i64x8()))
 }

 /// Load packed 64-bit integers from memory into dst using zeromask k
@@ -33326,17 +33290,10 @@ pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi64)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i {
-    let mut dst: __m512i;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm512_mask_load_epi64(_mm512_setzero_si512(), k, mem_addr)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
@@ -33346,17 +33303,13 @@ pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m5
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 {
-    let mut dst: __m512 = src;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm512_set1_epi32(-1).as_i32x16();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
@@ -33366,17 +33319,10 @@ pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f3
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 {
-    let mut dst: __m512;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm512_mask_load_ps(_mm512_setzero_ps(), k, mem_addr)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
@@ -33386,17 +33332,13 @@ pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d {
-    let mut dst: __m512d = src;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm512_set1_epi64(-1).as_i64x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
@@ -33406,17 +33348,10 @@ pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f6
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d {
-    let mut dst: __m512d;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm512_mask_load_pd(_mm512_setzero_pd(), k, mem_addr)
 }

 /// Load packed 32-bit integers from memory into dst using writemask k
@@ -33425,18 +33360,14 @@ pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i {
-    let mut dst: __m256i = src;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm256_set1_epi32(-1).as_i32x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i32x8()))
 }

 /// Load packed 32-bit integers from memory into dst using zeromask k
@@ -33445,18 +33376,11 @@ pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i {
-    let mut dst: __m256i;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm256_mask_load_epi32(_mm256_setzero_si256(), k, mem_addr)
 }

 /// Load packed 64-bit integers from memory into dst using writemask k
@@ -33465,18 +33389,14 @@ pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m2
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i {
-    let mut dst: __m256i = src;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm256_set1_epi64x(-1).as_i64x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i64x4()))
 }

 /// Load packed 64-bit integers from memory into dst using zeromask k
@@ -33485,18 +33405,11 @@ pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i {
-    let mut dst: __m256i;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm256_mask_load_epi64(_mm256_setzero_si256(), k, mem_addr)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
@@ -33505,18 +33418,14 @@ pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m2
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 {
-    let mut dst: __m256 = src;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm256_set1_epi32(-1).as_i32x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
@@ -33525,18 +33434,11 @@ pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 {
-    let mut dst: __m256;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm256_mask_load_ps(_mm256_setzero_ps(), k, mem_addr)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
@@ -33545,18 +33447,14 @@ pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d {
-    let mut dst: __m256d = src;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm256_set1_epi64x(-1).as_i64x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
@@ -33565,18 +33463,11 @@ pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f6
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d {
-    let mut dst: __m256d;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm256_mask_load_pd(_mm256_setzero_pd(), k, mem_addr)
 }

 /// Load packed 32-bit integers from memory into dst using writemask k
@@ -33585,18 +33476,14 @@ pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i {
-    let mut dst: __m128i = src;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm_set1_epi32(-1).as_i32x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i32x4()))
 }

 /// Load packed 32-bit integers from memory into dst using zeromask k
@@ -33605,18 +33492,11 @@ pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i3
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i {
-    let mut dst: __m128i;
-    asm!(
-        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm_mask_load_epi32(_mm_setzero_si128(), k, mem_addr)
 }

 /// Load packed 64-bit integers from memory into dst using writemask k
@@ -33625,18 +33505,14 @@ pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i {
-    let mut dst: __m128i = src;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm_set1_epi64x(-1).as_i64x2();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    transmute(simd_masked_load(mask, mem_addr, src.as_i64x2()))
 }

 /// Load packed 64-bit integers from memory into dst using zeromask k
@@ -33645,18 +33521,11 @@ pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i6
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i {
-    let mut dst: __m128i;
-    asm!(
-        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm_mask_load_epi64(_mm_setzero_si128(), k, mem_addr)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k
@@ -33665,18 +33534,14 @@ pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
-    let mut dst: __m128 = src;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm_set1_epi32(-1).as_i32x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k
@@ -33685,18 +33550,11 @@ pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
-    let mut dst: __m128;
-    asm!(
-        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm_mask_load_ps(_mm_setzero_ps(), k, mem_addr)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k
@@ -33705,18 +33563,14 @@ pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
-    let mut dst: __m128d = src;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    let ones = _mm_set1_epi64x(-1).as_i64x2();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(k, ones, zero);
+    simd_masked_load(mask, mem_addr, src)
 }

 /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k
@@ -33725,18 +33579,11 @@ pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64)
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
-    let mut dst: __m128d;
-    asm!(
-        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
-        p = in(reg) mem_addr,
-        k = in(kreg) k,
-        dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
-    );
-    dst
+    _mm_mask_load_pd(_mm_setzero_pd(), k, mem_addr)
 }
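// A minimal scalar sketch (not part of the patch) of the writemask-load
// semantics that `simd_masked_load` implements above: lane i is read from
// memory when bit i of k is set, and copied from `src` otherwise. The helper
// name is hypothetical and the array stands in for a suitably aligned
// allocation.
fn mask_load_scalar(src: [i32; 4], k: u8, mem: &[i32; 4]) -> [i32; 4] {
    let mut dst = src;
    for i in 0..4 {
        if k & (1 << i) != 0 {
            dst[i] = mem[i];
        }
    }
    dst
}

fn demo_mask_load() {
    let mem = [10, 20, 30, 40];
    assert_eq!(mask_load_scalar([-1; 4], 0b0110, &mem), [-1, 20, 30, -1]);
}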
 /// Store packed 32-bit integers from a into memory using writemask k.
@@ -33752,7 +33599,7 @@ pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: _
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(zmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33769,7 +33616,7 @@ pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(zmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33786,7 +33633,7 @@ pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m5
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(zmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33803,7 +33650,7 @@ pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m51
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(zmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33820,7 +33667,7 @@ pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(ymm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33837,7 +33684,7 @@ pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(ymm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33854,7 +33701,7 @@ pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m25
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(ymm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33871,7 +33718,7 @@ pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m25
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(ymm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33888,7 +33735,7 @@ pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m12
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(xmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33905,7 +33752,7 @@ pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m12
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(xmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33922,7 +33769,7 @@ pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128)
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(xmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33939,7 +33786,7 @@ pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d)
         p = in(reg) mem_addr,
         mask = in(kreg) mask,
         a = in(xmm_reg) a,
-        options(nostack)
+        options(nostack, preserves_flags)
     );
 }

@@ -33949,15 +33796,13 @@ pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d)
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi32)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) {
-    asm!(
-        vps!("vmovdqa32", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(zmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm512_set1_epi32(-1).as_i32x16();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i32x16())
 }

 /// Store packed 64-bit integers from a into memory using writemask k.
@@ -33966,15 +33811,13 @@ pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi64)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) {
-    asm!(
-        vps!("vmovdqa64", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(zmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm512_set1_epi64(-1).as_i64x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i64x8())
 }

 /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
@@ -33983,15 +33826,13 @@ pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) {
-    asm!(
-        vps!("vmovaps", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(zmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm512_set1_epi32(-1).as_i32x16();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }

 /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
@@ -34000,15 +33841,13 @@ pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m51
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) {
-    asm!(
-        vps!("vmovapd", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(zmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm512_set1_epi64(-1).as_i64x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }
 /// Store packed 32-bit integers from a into memory using writemask k.
@@ -34016,16 +33855,14 @@ pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) {
-    asm!(
-        vps!("vmovdqa32", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(ymm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm256_set1_epi32(-1).as_i32x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i32x8())
 }

 /// Store packed 64-bit integers from a into memory using writemask k.
@@ -34033,16 +33870,14 @@ pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) {
-    asm!(
-        vps!("vmovdqa64", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(ymm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm256_set1_epi64x(-1).as_i64x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i64x4())
 }

 /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
@@ -34050,16 +33885,14 @@ pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) {
-    asm!(
-        vps!("vmovaps", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(ymm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm256_set1_epi32(-1).as_i32x8();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }
 /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
@@ -34067,16 +33900,14 @@ pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) {
-    asm!(
-        vps!("vmovapd", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(ymm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm256_set1_epi64x(-1).as_i64x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }

 /// Store packed 32-bit integers from a into memory using writemask k.
@@ -34084,16 +33915,14 @@ pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi32)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu32))] // FIXME: should be vmovdqa32
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) {
-    asm!(
-        vps!("vmovdqa32", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(xmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm_set1_epi32(-1).as_i32x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i32x4())
 }

 /// Store packed 64-bit integers from a into memory using writemask k.
@@ -34101,16 +33930,14 @@ pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi64)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovdqu64))] // FIXME: should be vmovdqa64
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) {
-    asm!(
-        vps!("vmovdqa64", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(xmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm_set1_epi64x(-1).as_i64x2();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a.as_i64x2())
 }
 /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
@@ -34118,16 +33945,14 @@ pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_ps)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovups))] // FIXME: should be vmovaps
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) {
-    asm!(
-        vps!("vmovaps", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(xmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm_set1_epi32(-1).as_i32x4();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }

 /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
@@ -34135,16 +33960,14 @@ pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) {
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_pd)
 #[inline]
-#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vmovupd))] // FIXME: should be vmovapd
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) {
-    asm!(
-        vps!("vmovapd", "{{{mask}}}, {a}"),
-        p = in(reg) mem_addr,
-        mask = in(kreg) mask,
-        a = in(xmm_reg) a,
-        options(nostack)
-    );
+    let ones = _mm_set1_epi64x(-1).as_i64x2();
+    let zero = mem::zeroed();
+    let mask = simd_select_bitmask(mask, ones, zero);
+    simd_masked_store(mask, mem_addr, a)
 }
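// A minimal scalar sketch (not part of the patch) of the recurring
// `simd_select_bitmask(k, ones, zero)` idiom in the load/store rewrites
// above: it expands a k-register bitmask into a per-lane all-ones/all-zeros
// vector, the mask shape that `simd_masked_load`/`simd_masked_store` expect.
// The helper name is hypothetical.
fn expand_bitmask_scalar(k: u8) -> [i32; 8] {
    let mut mask = [0i32; 8];
    for i in 0..8 {
        if k & (1 << i) != 0 {
            mask[i] = -1; // every bit set in this lane
        }
    }
    mask
}

fn demo_expand() {
    assert_eq!(expand_bitmask_scalar(0b1000_0001), [-1, 0, 0, 0, 0, 0, 0, -1]);
}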
@@ -34164,7 +33987,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi32(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34182,7 +34005,7 @@ pub unsafe fn _mm512_maskz_expandloadu_epi32(k: __mmask16, mem_addr: *const i32)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34204,7 +34027,7 @@ pub unsafe fn _mm256_mask_expandloadu_epi32(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34222,7 +34045,7 @@ pub unsafe fn _mm256_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34244,7 +34067,7 @@ pub unsafe fn _mm_mask_expandloadu_epi32(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34262,7 +34085,7 @@ pub unsafe fn _mm_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34284,7 +34107,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi64(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34302,7 +34125,7 @@ pub unsafe fn _mm512_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34324,7 +34147,7 @@ pub unsafe fn _mm256_mask_expandloadu_epi64(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34342,7 +34165,7 @@ pub unsafe fn _mm256_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34364,7 +34187,7 @@ pub unsafe fn _mm_mask_expandloadu_epi64(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
    );
     dst
 }
@@ -34382,7 +34205,7 @@ pub unsafe fn _mm_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34404,7 +34227,7 @@ pub unsafe fn _mm512_mask_expandloadu_ps(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34422,7 +34245,7 @@ pub unsafe fn _mm512_maskz_expandloadu_ps(k: __mmask16, mem_addr: *const f32) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34440,7 +34263,7 @@ pub unsafe fn _mm256_mask_expandloadu_ps(src: __m256, k: __mmask8, mem_addr: *co
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34458,7 +34281,7 @@ pub unsafe fn _mm256_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34476,7 +34299,7 @@ pub unsafe fn _mm_mask_expandloadu_ps(src: __m128, k: __mmask8, mem_addr: *const
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34494,7 +34317,7 @@ pub unsafe fn _mm_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34516,7 +34339,7 @@ pub unsafe fn _mm512_mask_expandloadu_pd(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34534,7 +34357,7 @@ pub unsafe fn _mm512_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34556,7 +34379,7 @@ pub unsafe fn _mm256_mask_expandloadu_pd(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34574,7 +34397,7 @@ pub unsafe fn _mm256_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34592,7 +34415,7 @@ pub unsafe fn _mm_mask_expandloadu_pd(src: __m128d, k: __mmask8, mem_addr: *cons
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -34610,7 +34433,7 @@ pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -54387,6 +54210,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512f")]
+    #[cfg_attr(miri, ignore)]
     unsafe fn test_mm512_stream_ps() {
         #[repr(align(64))]
         struct Memory {
@@ -54401,6 +54225,38 @@
         }
     }
 
+    #[simd_test(enable = "avx512f")]
+    #[cfg_attr(miri, ignore)]
+    unsafe fn test_mm512_stream_pd() {
+        #[repr(align(64))]
+        struct Memory {
+            pub data: [f64; 8],
+        }
+        let a = _mm512_set1_pd(7.0);
+        let mut mem = Memory { data: [-1.0; 8] };
+
+        _mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
+        for i in 0..8 {
+            assert_eq!(mem.data[i], get_m512d(a, i));
+        }
+    }
+
+    #[simd_test(enable = "avx512f")]
+    #[cfg_attr(miri, ignore)]
+    unsafe fn test_mm512_stream_si512() {
+        #[repr(align(64))]
+        struct Memory {
+            pub data: [i64; 8],
+        }
+        let a = _mm512_set1_epi32(7);
+        let mut mem = Memory { data: [-1; 8] };
+
+        _mm512_stream_si512(mem.data.as_mut_ptr().cast(), a);
+        for i in 0..8 {
+            assert_eq!(mem.data[i], get_m512i(a, i));
+        }
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_reduce_add_epi32() {
         let a = _mm512_set1_epi32(1);
diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
index 979cad7284..4bbb6f44c2 100644
--- a/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -24,7 +24,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi16(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -42,7 +42,7 @@ pub unsafe fn _mm512_maskz_expandloadu_epi16(k: __mmask32, mem_addr: *const i16)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -64,7 +64,7 @@ pub unsafe fn _mm256_mask_expandloadu_epi16(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -82,7 +82,7 @@ pub unsafe fn _mm256_maskz_expandloadu_epi16(k: __mmask16, mem_addr: *const i16)
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -104,7 +104,7 @@ pub unsafe fn _mm_mask_expandloadu_epi16(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -122,7 +122,7 @@ pub unsafe fn _mm_maskz_expandloadu_epi16(k: __mmask8, mem_addr: *const i16) ->
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -144,7 +144,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi8(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -162,7 +162,7 @@ pub unsafe fn _mm512_maskz_expandloadu_epi8(k: __mmask64, mem_addr: *const i8) -
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(zmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -184,7 +184,7 @@ pub unsafe fn _mm256_mask_expandloadu_epi8(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -202,7 +202,7 @@ pub unsafe fn _mm256_maskz_expandloadu_epi8(k: __mmask32, mem_addr: *const i8) -
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(ymm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -224,7 +224,7 @@ pub unsafe fn _mm_mask_expandloadu_epi8(
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = inout(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
@@ -242,7 +242,7 @@ pub unsafe fn _mm_maskz_expandloadu_epi8(k: __mmask16, mem_addr: *const i8) -> _
         p = in(reg) mem_addr,
         k = in(kreg) k,
         dst = out(xmm_reg) dst,
-        options(pure, readonly, nostack)
+        options(pure, readonly, nostack, preserves_flags)
     );
     dst
 }
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index ce6bc89f5b..aa501bec3d 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -12012,36 +12012,6 @@ mod tests {
         assert_eq!(r, e);
     }
 
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_stream_pd() {
-        #[repr(align(64))]
-        struct Memory {
-            pub data: [f64; 8],
-        }
-        let a = _mm512_set1_pd(7.0);
-        let mut mem = Memory { data: [-1.0; 8] };
-
-        _mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
-        for i in 0..8 {
-            assert_eq!(mem.data[i], get_m512d(a, i));
-        }
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_stream_si512() {
-        #[repr(align(64))]
-        struct Memory {
-            pub data: [i64; 8],
-        }
-        let a = _mm512_set1_epi64(7);
-        let mut mem = Memory { data: [-1; 8] };
-
-        _mm512_stream_si512(&mut mem.data[0] as *mut i64, a);
-        for i in 0..8 {
-            assert_eq!(mem.data[i], get_m512i(a, i));
-        }
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_mask_set1_epi64() {
         let src = _mm512_set1_epi64(2);
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index a69d17e71c..c5773a0984 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -346,7 +346,10 @@ fn verify_all_signatures() {
             // unsigned integer, but all other _mm_shuffle_.. intrinsics
            // take a signed-integer. This breaks `_MM_SHUFFLE` for
            // `_mm_shuffle_ps`:
-            "_mm_shuffle_ps" => continue,
+            name@"_mm_shuffle_ps" => {
+                map.remove(name);
+                continue;
+            },
             _ => {}
         }
 
@@ -391,13 +394,19 @@ fn verify_all_signatures() {
 fn print_missing(map: &HashMap<&str, Vec<&Intrinsic>>, mut f: impl Write) -> io::Result<()> {
     let mut missing = BTreeMap::new(); // BTreeMap to keep the cpuids ordered
-    // we cannot use SVML and MMX, and MPX is not in LLVM, and intrinsics without any cpuid requirement
-    // are accessible from safe rust
+
+    // we cannot use SVML and MMX, and MPX is not in LLVM, and intrinsics without any cpuid requirement
+    // are accessible from safe rust
     for intrinsic in map.values().flatten().filter(|intrinsic| {
         intrinsic.tech != "SVML"
             && intrinsic.tech != "MMX"
             && !intrinsic.cpuid.is_empty()
             && !intrinsic.cpuid.contains(&"MPX".to_string())
+            && intrinsic.return_.type_ != "__m64"
+            && !intrinsic
+                .parameters
+                .iter()
+                .any(|param| param.type_.contains("__m64"))
     }) {
         missing
             .entry(&intrinsic.cpuid)
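The new filter in `print_missing` drops every intrinsic whose signature mentions the MMX `__m64` type, so MMX-typed intrinsics no longer appear in the generated missing list. A pared-down model of the predicate (the `Parameter`/`Intrinsic` structs here are simplified stand-ins, not the real stdarch-verify types):

```rust
// Simplified mirror of the added filter clauses: an intrinsic is
// skipped when its return type or any parameter type mentions __m64.
struct Parameter {
    type_: String,
}
struct Intrinsic {
    return_type: String,
    parameters: Vec<Parameter>,
}

fn uses_m64(i: &Intrinsic) -> bool {
    i.return_type == "__m64"
        || i.parameters.iter().any(|p| p.type_.contains("__m64"))
}

fn main() {
    let mmx_intrinsic = Intrinsic {
        return_type: "void".into(),
        parameters: vec![Parameter { type_: "__m64".into() }],
    };
    assert!(uses_m64(&mmx_intrinsic)); // would be filtered from the list
}
```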