From 3ec8d7aa4a05c3286189441abef0ce757d03b4b5 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sun, 12 Nov 2023 17:57:35 +0000
Subject: [PATCH 1/7] Implement _mm_cmpestri and _mm_cmpestrm using inline asm

---
 src/intrinsics/llvm_x86.rs | 98 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 78a0a347c..0fa46b1da 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -720,6 +720,104 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             }
         }
 
+        "llvm.x86.sse42.pcmpestri128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri&ig_expand=939
+            intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let la = la.load_scalar(fx);
+            let b = b.load_scalar(fx);
+            let lb = lb.load_scalar(fx);
+
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(span, "Index argument for `_mm_cmpestri` is not a constant");
+            };
+
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("pcmpestri xmm0, xmm1, {imm8}"))],
+                &[
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        value: a,
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: b,
+                    },
+                    // Implicit argument to the pcmpestri intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+                        value: la,
+                    },
+                    // Implicit argument to the pcmpestri intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+                        value: lb,
+                    },
+                    // Implicit result of the pcmpestri intrinsic
+                    CInlineAsmOperand::Out {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
+                        late: true,
+                        place: Some(ret),
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.sse42.pcmpestrm128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm&ig_expand=940
+            intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let la = la.load_scalar(fx);
+            let b = b.load_scalar(fx);
+            let lb = lb.load_scalar(fx);
+
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(span, "Index argument for `_mm_cmpestrm` is not a constant");
+            };
+
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("pcmpestrm xmm0, xmm1, {imm8}"))],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: b,
+                    },
+                    // Implicit argument to the pcmpestrm intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+                        value: la,
+                    },
+                    // Implicit argument to the pcmpestrm intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+                        value: lb,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
         "llvm.x86.pclmulqdq" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
             intrinsic_args!(fx, args => (a, b, _imm8); intrinsic);
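Note on the lowering above (an illustrative sketch, not part of the patch): _mm_cmpestri takes its two string lengths in eax and edx and returns the match index in ecx, which is exactly the register wiring the asm operands set up. The helper below is hypothetical; _mm_cmpestri and the _SIDD_* mode constants are the real std::arch items. The imm8 mode byte must be a compile-time constant, which is why the lowering rejects non-constant values with span_fatal.

    use std::arch::x86_64::*;

    // Hypothetical helper: returns the index of the first occurrence of
    // `needle` in `haystack`, or 16 (the element count) when there is no
    // match. Lengths are passed explicitly, matching the eax/edx operands.
    #[target_feature(enable = "sse4.2")]
    unsafe fn find_substring(
        needle: __m128i,
        needle_len: i32,
        haystack: __m128i,
        hay_len: i32,
    ) -> i32 {
        _mm_cmpestri::<{ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED }>(
            needle, needle_len, haystack, hay_len,
        )
    }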
From 705031d0177af0b492a3224c01bda257c85505e6 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Fri, 24 Nov 2023 13:57:21 +0000
Subject: [PATCH 2/7] Implement _mm_cvttps_epi32

---
 src/intrinsics/llvm_x86.rs | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 0fa46b1da..85ce94099 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -273,16 +273,31 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             );
         }
         "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
-            let a = match args {
-                [a] => a,
-                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
-            };
-            let a = codegen_operand(fx, a);
+            intrinsic_args!(fx, args => (a); intrinsic);
 
             simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| {
                 fx.bcx.ins().iabs(lane)
            });
         }
+        "llvm.x86.sse2.cvttps2dq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32&ig_expand=2429
+            intrinsic_args!(fx, args => (a); intrinsic);
+            let a = a.load_scalar(fx);
+
+            // Use inline asm instead of fcvt_to_sint_sat, as unrepresentable values are turned
+            // into 0x80000000, for which Cranelift doesn't have a native instruction.
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("cvttps2dq xmm0, xmm0"))],
+                &[CInlineAsmOperand::InOut {
+                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                    _late: true,
+                    in_value: a,
+                    out_place: Some(ret),
+                }],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
         "llvm.x86.addcarry.32" | "llvm.x86.addcarry.64" => {
             intrinsic_args!(fx, args => (c_in, a, b); intrinsic);
             let c_in = c_in.load_scalar(fx);
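The comment in the cvttps2dq arm is worth spelling out (the sketch below is mine, not part of the patch): on x86, cvttps2dq truncates toward zero and returns the integer indefinite value 0x8000_0000 (i32::MIN) for NaN and out-of-range lanes, whereas Cranelift's fcvt_to_sint_sat saturates large positive inputs to i32::MAX, so the two disagree exactly on the corner cases.

    use std::arch::x86_64::*;

    // Assumed demonstration of the corner cases; note that lane 0 is the
    // last argument of _mm_set_ps.
    unsafe fn cvttps2dq_corner_cases() {
        let a = _mm_set_ps(f32::NAN, 3.0e10, -1.5, 2.9);
        let r = _mm_cvttps_epi32(a);
        let mut out = [0i32; 4];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
        // Truncation toward zero for the in-range lanes; 0x8000_0000 for
        // the overflowing and NaN lanes.
        assert_eq!(out, [2, -1, i32::MIN, i32::MIN]);
    }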
From 65da67169427a3b09726b64389d827aa7b2bfbe8 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Fri, 24 Nov 2023 15:30:53 +0000
Subject: [PATCH 3/7] Implement *fmaddsub_p*, *fmsubadd_p* and *fnmadd_p*
 vendor intrinsics

---
 src/intrinsics/llvm_x86.rs | 111 +++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 85ce94099..142ca1cf5 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -735,6 +735,117 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             }
         }
 
+        "llvm.x86.fma.vfmaddsub.ps"
+        | "llvm.x86.fma.vfmaddsub.pd"
+        | "llvm.x86.fma.vfmaddsub.ps.256"
+        | "llvm.x86.fma.vfmaddsub.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps&ig_expand=3205
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd&ig_expand=3181
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps&ig_expand=3209
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd&ig_expand=3185
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let res = if idx & 1 == 0 {
+                    fx.bcx.ins().fsub(mul, c_lane)
+                } else {
+                    fx.bcx.ins().fadd(mul, c_lane)
+                };
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.fma.vfmsubadd.ps"
+        | "llvm.x86.fma.vfmsubadd.pd"
+        | "llvm.x86.fma.vfmsubadd.ps.256"
+        | "llvm.x86.fma.vfmsubadd.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps&ig_expand=3325
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd&ig_expand=3301
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps&ig_expand=3329
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd&ig_expand=3305
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let res = if idx & 1 == 0 {
+                    fx.bcx.ins().fadd(mul, c_lane)
+                } else {
+                    fx.bcx.ins().fsub(mul, c_lane)
+                };
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.fma.vfnmadd.ps"
+        | "llvm.x86.fma.vfnmadd.pd"
+        | "llvm.x86.fma.vfnmadd.ps.256"
+        | "llvm.x86.fma.vfnmadd.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps&ig_expand=3391
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd&ig_expand=3367
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps&ig_expand=3395
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd&ig_expand=3371
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let neg_mul = fx.bcx.ins().fneg(mul);
+                let res = fx.bcx.ins().fadd(neg_mul, c_lane);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
+
         "llvm.x86.sse42.pcmpestri128" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri&ig_expand=939
             intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);
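For reference (a scalar model I am adding, not code from the patch), the three arms above implement these lane patterns: vfmaddsub subtracts c in even lanes and adds it in odd lanes, vfmsubadd is the mirror image, and vfnmadd computes -(a * b) + c in every lane. Note that the lowering multiplies and then adds as two separate Cranelift instructions, so it rounds twice instead of performing a single fused multiply-add.

    // Scalar model of the vfmaddsub lane pattern; swap the two branches
    // to get vfmsubadd.
    fn fmaddsub_ref(a: &[f64], b: &[f64], c: &[f64]) -> Vec<f64> {
        a.iter()
            .zip(b)
            .zip(c)
            .enumerate()
            .map(|(idx, ((a, b), c))| {
                let mul = a * b;
                if idx & 1 == 0 { mul - c } else { mul + c }
            })
            .collect()
    }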
From c8729e9216bda2b51d8781d95cae0b907e71c6b8 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Fri, 24 Nov 2023 15:31:14 +0000
Subject: [PATCH 4/7] Implement _mm256_zeroupper vendor intrinsic

---
 src/intrinsics/llvm_x86.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 142ca1cf5..f85f9b891 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -22,6 +22,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             // Spin loop hint
         }
 
+        "llvm.x86.avx.vzeroupper" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper&ig_expand=7218
+            // Do nothing. It is a perf hint anyway.
+        }
+
         // Used by is_x86_feature_detected!();
         "llvm.x86.xgetbv" => {
             intrinsic_args!(fx, args => (xcr_no); intrinsic);

From d5a7ae7976ec02196ec9893f79fa06612059dfbc Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Fri, 24 Nov 2023 19:26:46 +0000
Subject: [PATCH 5/7] Implement the float part of the gather family vendor
 intrinsics

---
 src/intrinsics/llvm_x86.rs | 87 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index f85f9b891..6cccc8b83 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -74,6 +74,93 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             ret.write_cvalue(fx, val);
         }
 
+        "llvm.x86.avx2.gather.d.ps"
+        | "llvm.x86.avx2.gather.d.pd"
+        | "llvm.x86.avx2.gather.d.ps.256"
+        | "llvm.x86.avx2.gather.d.pd.256"
+        | "llvm.x86.avx2.gather.q.ps"
+        | "llvm.x86.avx2.gather.q.pd"
+        | "llvm.x86.avx2.gather.q.ps.256"
+        | "llvm.x86.avx2.gather.q.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd&ig_expand=3818
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd&ig_expand=3819
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd&ig_expand=3821
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd&ig_expand=3822
+            // ...
+
+            intrinsic_args!(fx, args => (src, ptr, index, mask, scale); intrinsic);
+
+            let (src_lane_count, src_lane_ty) = src.layout().ty.simd_size_and_type(fx.tcx);
+            let (index_lane_count, index_lane_ty) = index.layout().ty.simd_size_and_type(fx.tcx);
+            let (mask_lane_count, mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(src_lane_ty.is_floating_point());
+            assert!(index_lane_ty.is_integral());
+            assert!(mask_lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(src_lane_count, mask_lane_count);
+            assert_eq!(src_lane_count, ret_lane_count);
+
+            let lane_clif_ty = fx.clif_type(ret_lane_ty).unwrap();
+            let index_lane_clif_ty = fx.clif_type(index_lane_ty).unwrap();
+            let mask_lane_clif_ty = fx.clif_type(mask_lane_ty).unwrap();
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            let ptr = ptr.load_scalar(fx);
+            let scale = scale.load_scalar(fx);
+            let scale = fx.bcx.ins().uextend(types::I64, scale);
+            for lane_idx in 0..std::cmp::min(src_lane_count, index_lane_count) {
+                let src_lane = src.value_lane(fx, lane_idx).load_scalar(fx);
+                let index_lane = index.value_lane(fx, lane_idx).load_scalar(fx);
+                let mask_lane = mask.value_lane(fx, lane_idx).load_scalar(fx);
+                let mask_lane =
+                    fx.bcx.ins().bitcast(mask_lane_clif_ty.as_int(), MemFlags::new(), mask_lane);
+
+                let if_enabled = fx.bcx.create_block();
+                let if_disabled = fx.bcx.create_block();
+                let next = fx.bcx.create_block();
+                let res_lane = fx.bcx.append_block_param(next, lane_clif_ty);
+
+                let mask_lane = match mask_lane_clif_ty {
+                    types::F32 => fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64),
+                    types::F64 => fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64),
+                    _ => unreachable!(),
+                };
+                fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]);
+                fx.bcx.seal_block(if_enabled);
+                fx.bcx.seal_block(if_disabled);
+
+                fx.bcx.switch_to_block(if_enabled);
+                let index_lane = if index_lane_clif_ty != types::I64 {
+                    fx.bcx.ins().sextend(types::I64, index_lane)
+                } else {
+                    index_lane
+                };
+                let offset = fx.bcx.ins().imul(index_lane, scale);
+                let lane_ptr = fx.bcx.ins().iadd(ptr, offset);
+                let res = fx.bcx.ins().load(lane_clif_ty, MemFlags::trusted(), lane_ptr, 0);
+                fx.bcx.ins().jump(next, &[res]);
+
+                fx.bcx.switch_to_block(if_disabled);
+                fx.bcx.ins().jump(next, &[src_lane]);
+
+                fx.bcx.seal_block(next);
+                fx.bcx.switch_to_block(next);
+
+                fx.bcx.ins().nop();
+
+                ret.place_lane(fx, lane_idx)
+                    .write_cvalue(fx, CValue::by_val(res_lane, ret_lane_layout));
+            }
+
+            for lane_idx in std::cmp::min(src_lane_count, index_lane_count)..ret_lane_count {
+                let zero_lane = fx.bcx.ins().iconst(mask_lane_clif_ty.as_int(), 0);
+                let zero_lane = fx.bcx.ins().bitcast(mask_lane_clif_ty, MemFlags::new(), zero_lane);
+                ret.place_lane(fx, lane_idx)
+                    .write_cvalue(fx, CValue::by_val(zero_lane, ret_lane_layout));
+            }
+        }
+
         "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
             let (x, y, kind) = match args {
                 [x, y, kind] => (x, y, kind),
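The per-lane control flow generated above amounts to the following scalar model (my sketch, not part of the patch): a lane is loaded from ptr + index * scale only when the sign bit of its mask lane is set, and the src lane passes through otherwise; that is the if_enabled/if_disabled diamond with res_lane as the block parameter.

    // Scalar model of one 4-lane f32 gather; `base`, `scale`, and the
    // fixed lane count are assumptions for illustration.
    unsafe fn gather_ps_ref(
        src: &[f32; 4],
        base: *const u8,
        index: &[i32; 4],
        mask: &[f32; 4],
        scale: isize,
    ) -> [f32; 4] {
        let mut out = [0.0f32; 4];
        for lane in 0..4 {
            out[lane] = if mask[lane].to_bits() & 0x8000_0000 != 0 {
                // Sign bit set: load from base + index * scale bytes.
                *(base.offset(index[lane] as isize * scale) as *const f32)
            } else {
                // Masked off: keep the src lane.
                src[lane]
            };
        }
        out
    }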
From 3b49b9efd5e91b3104bc377ab64ee88aaa3ce31d Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Fri, 24 Nov 2023 19:38:13 +0000
Subject: [PATCH 6/7] Implement the int part of the gather family vendor
 intrinsics

---
 src/intrinsics/llvm_x86.rs | 22 ++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 6cccc8b83..2108b4bb9 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -74,12 +74,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             ret.write_cvalue(fx, val);
         }
 
-        "llvm.x86.avx2.gather.d.ps"
+        "llvm.x86.avx2.gather.d.d"
+        | "llvm.x86.avx2.gather.d.q"
+        | "llvm.x86.avx2.gather.d.ps"
         | "llvm.x86.avx2.gather.d.pd"
+        | "llvm.x86.avx2.gather.d.d.256"
+        | "llvm.x86.avx2.gather.d.q.256"
         | "llvm.x86.avx2.gather.d.ps.256"
         | "llvm.x86.avx2.gather.d.pd.256"
+        | "llvm.x86.avx2.gather.q.d"
+        | "llvm.x86.avx2.gather.q.q"
         | "llvm.x86.avx2.gather.q.ps"
         | "llvm.x86.avx2.gather.q.pd"
+        | "llvm.x86.avx2.gather.q.d.256"
+        | "llvm.x86.avx2.gather.q.q.256"
         | "llvm.x86.avx2.gather.q.ps.256"
         | "llvm.x86.avx2.gather.q.pd.256" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd&ig_expand=3818
@@ -94,10 +102,8 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             let (index_lane_count, index_lane_ty) = index.layout().ty.simd_size_and_type(fx.tcx);
             let (mask_lane_count, mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx);
             let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert!(src_lane_ty.is_floating_point());
+            assert_eq!(src_lane_ty, ret_lane_ty);
             assert!(index_lane_ty.is_integral());
-            assert!(mask_lane_ty.is_floating_point());
-            assert!(ret_lane_ty.is_floating_point());
             assert_eq!(src_lane_count, mask_lane_count);
             assert_eq!(src_lane_count, ret_lane_count);
 
@@ -122,8 +128,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 let res_lane = fx.bcx.append_block_param(next, lane_clif_ty);
 
                 let mask_lane = match mask_lane_clif_ty {
-                    types::F32 => fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64),
-                    types::F64 => fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64),
+                    types::I32 | types::F32 => {
+                        fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64)
+                    }
+                    types::I64 | types::F64 => {
+                        fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64)
+                    }
                     _ => unreachable!(),
                 };
                 fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]);
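With the integer variants added, the whole _mm*_i32gather_*/_mm*_i64gather_* family now lowers through this arm; only the MSB of each mask lane matters, so the same sign-bit test works for both integer and float masks. A usage sketch (assumed, not from the patch); the non-masked intrinsics pass an all-ones src and mask, so every lane takes the load path:

    use std::arch::x86_64::*;

    // Hypothetical helper gathering table[0..4]; SCALE is in bytes, so 4
    // selects consecutive i32 elements.
    #[target_feature(enable = "avx2")]
    unsafe fn gather_first_four(table: &[i32]) -> __m128i {
        let idx = _mm_set_epi32(3, 2, 1, 0);
        _mm_i32gather_epi32::<4>(table.as_ptr(), idx)
    }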
From e6e2f00d2196281bdd92c3dbfb466687708807c3 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sat, 25 Nov 2023 09:14:19 +0000
Subject: [PATCH 7/7] Fix incorrect implementation of several vendor
 intrinsics

---
 src/intrinsics/llvm_x86.rs | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 2108b4bb9..07b95b793 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -556,12 +556,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i32);
             for out_lane_idx in 0..lane_count / 2 {
                 let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
-                let a_lane0 = fx.bcx.ins().uextend(types::I32, a_lane0);
+                let a_lane0 = fx.bcx.ins().sextend(types::I32, a_lane0);
                 let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
                 let b_lane0 = fx.bcx.ins().sextend(types::I32, b_lane0);
 
                 let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
-                let a_lane1 = fx.bcx.ins().uextend(types::I32, a_lane1);
+                let a_lane1 = fx.bcx.ins().sextend(types::I32, a_lane1);
                 let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
                 let b_lane1 = fx.bcx.ins().sextend(types::I32, b_lane1);
 
@@ -716,14 +716,14 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             assert_eq!(ret_lane_ty, fx.tcx.types.i16);
             assert_eq!(lane_count * 2, ret_lane_count);
 
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
 
             for idx in 0..lane_count {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -733,7 +733,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -760,8 +760,8 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
             for idx in 0..lane_count {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().umax(lane, min_u16);
-                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let sat = fx.bcx.ins().smax(lane, min_u16);
+                let sat = fx.bcx.ins().smin(sat, max_u16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -770,8 +770,8 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
             for idx in 0..lane_count {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().umax(lane, min_u16);
-                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let sat = fx.bcx.ins().smax(lane, min_u16);
+                let sat = fx.bcx.ins().smin(sat, max_u16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -792,14 +792,14 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             assert_eq!(ret_lane_ty, fx.tcx.types.i16);
             assert_eq!(lane_count * 2, ret_lane_count);
 
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
 
             for idx in 0..lane_count / 2 {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -809,7 +809,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -819,7 +819,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -829,7 +829,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
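To see why the umin/uextend variants were wrong (a worked illustration with plain integer ops standing in for the Cranelift instructions; not part of the patch): i64::from(i16::MIN as u16) yields +32768 rather than -32768, and umin compares a negative lane as a huge unsigned value, so every negative input saturated to +32767 instead of passing through.

    // Buggy lane saturation as previously generated for packssdw.
    fn packssdw_lane_buggy(lane: i32) -> i16 {
        let min = i16::MIN as u16 as i32; // +32768, not sign-extended
        let max = i16::MAX as u16 as i32; // +32767
        let sat = lane.max(min); // smax
        let sat = ((sat as u32).min(max as u32)) as i32; // umin
        sat as i16
    }

    // Fixed lane saturation matching the patch.
    fn packssdw_lane_fixed(lane: i32) -> i16 {
        let sat = lane.max(i32::from(i16::MIN)); // smax against -32768
        let sat = sat.min(i32::from(i16::MAX)); // smin against +32767
        sat as i16
    }

    fn main() {
        assert_eq!(packssdw_lane_fixed(-1), -1);
        // The old code first clamped -1 up to +32768, then umin pulled it
        // down to +32767: the sign was lost entirely.
        assert_eq!(packssdw_lane_buggy(-1), i16::MAX);
    }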