diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs index a04039c0ed68..6faf0334980e 100644 --- a/crates/wast-util/src/lib.rs +++ b/crates/wast-util/src/lib.rs @@ -430,12 +430,9 @@ impl WastTest { "spec_testsuite/simd_f64x2_pmin_pmax.wast", "spec_testsuite/simd_f64x2_rounding.wast", "spec_testsuite/simd_i16x8_arith2.wast", - "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast", - "spec_testsuite/simd_i16x8_extmul_i8x16.wast", "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast", "spec_testsuite/simd_i32x4_arith2.wast", "spec_testsuite/simd_i32x4_dot_i16x8.wast", - "spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast", "spec_testsuite/simd_i32x4_trunc_sat_f32x4.wast", "spec_testsuite/simd_i32x4_trunc_sat_f64x2.wast", "spec_testsuite/simd_i64x2_extmul_i32x4.wast", @@ -503,6 +500,8 @@ impl WastTest { "spec_testsuite/simd_i16x8_extmul_i8x16.wast", "spec_testsuite/simd_i32x4_extmul_i16x8.wast", "spec_testsuite/simd_i64x2_extmul_i32x4.wast", + "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast", + "spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast", ]; if unsupported.iter().any(|part| self.path.ends_with(part)) { diff --git a/tests/disas/winch/x64/i16x8/extadd/extadd_s.wat b/tests/disas/winch/x64/i16x8/extadd/extadd_s.wat new file mode 100644 index 000000000000..2d7579c7b5c5 --- /dev/null +++ b/tests/disas/winch/x64/i16x8/extadd/extadd_s.wat @@ -0,0 +1,31 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (func (param v128) (result v128) + (local.get 0) + (i16x8.extadd_pairwise_i8x16_s) + )) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x20, %r11 +;; cmpq %rsp, %r11 +;; ja 0x51 +;; 1c: movq %rdi, %r14 +;; subq $0x20, %rsp +;; movq %rdi, 0x18(%rsp) +;; movq %rsi, 0x10(%rsp) +;; movdqu %xmm0, (%rsp) +;; movdqu (%rsp), %xmm0 +;; vpmovsxbw %xmm0, %xmm15 +;; vpalignr $8, %xmm0, %xmm0, %xmm0 +;; vpmovsxbw %xmm0, %xmm0 +;; vpaddw %xmm0, %xmm0, %xmm0 +;; addq $0x20, %rsp +;; popq %rbp +;; retq +;; 51: ud2 diff --git a/tests/disas/winch/x64/i16x8/extadd/extadd_u.wat b/tests/disas/winch/x64/i16x8/extadd/extadd_u.wat new file mode 100644 index 000000000000..2880ea2b773b --- /dev/null +++ b/tests/disas/winch/x64/i16x8/extadd/extadd_u.wat @@ -0,0 +1,31 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (param v128) (result v128) + (local.get 0) + (i16x8.extadd_pairwise_i8x16_u) + )) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x20, %r11 +;; cmpq %rsp, %r11 +;; ja 0x50 +;; 1c: movq %rdi, %r14 +;; subq $0x20, %rsp +;; movq %rdi, 0x18(%rsp) +;; movq %rsi, 0x10(%rsp) +;; movdqu %xmm0, (%rsp) +;; movdqu (%rsp), %xmm0 +;; vpmovzxbw %xmm0, %xmm15 +;; vpxor %xmm15, %xmm15, %xmm15 +;; vpunpckhbw %xmm15, %xmm0, %xmm0 +;; vpaddw %xmm0, %xmm0, %xmm0 +;; addq $0x20, %rsp +;; popq %rbp +;; retq +;; 50: ud2 diff --git a/tests/disas/winch/x64/i32x4/extadd/extadd_s.wat b/tests/disas/winch/x64/i32x4/extadd/extadd_s.wat new file mode 100644 index 000000000000..c9c682111d5d --- /dev/null +++ b/tests/disas/winch/x64/i32x4/extadd/extadd_s.wat @@ -0,0 +1,31 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (func (param v128) (result v128) + (local.get 0) + (i32x4.extadd_pairwise_i16x8_s) + )) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x20, %r11 +;; cmpq %rsp, %r11 +;; ja 0x51 +;; 1c: movq %rdi, %r14 +;; subq $0x20, %rsp +;; movq %rdi, 0x18(%rsp) +;; movq %rsi, 0x10(%rsp) +;; movdqu %xmm0, (%rsp) +;; movdqu (%rsp), %xmm0 +;; vpmovsxwd %xmm0, %xmm15 +;; vpalignr $8, %xmm0, %xmm0, %xmm0 +;; vpmovsxwd %xmm0, %xmm0 +;; vpaddd %xmm0, %xmm0, %xmm0 +;; addq $0x20, %rsp +;; popq %rbp +;; retq +;; 51: ud2 diff --git a/tests/disas/winch/x64/i32x4/extadd/extadd_u.wat b/tests/disas/winch/x64/i32x4/extadd/extadd_u.wat new file mode 100644 index 000000000000..9d3171c86014 --- /dev/null +++ b/tests/disas/winch/x64/i32x4/extadd/extadd_u.wat @@ -0,0 +1,31 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (param v128) (result v128) + (local.get 0) + (i32x4.extadd_pairwise_i16x8_u) + )) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x20, %r11 +;; cmpq %rsp, %r11 +;; ja 0x50 +;; 1c: movq %rdi, %r14 +;; subq $0x20, %rsp +;; movq %rdi, 0x18(%rsp) +;; movq %rsi, 0x10(%rsp) +;; movdqu %xmm0, (%rsp) +;; movdqu (%rsp), %xmm0 +;; vpmovzxwd %xmm0, %xmm15 +;; vpxor %xmm15, %xmm15, %xmm15 +;; vpunpckhwd %xmm15, %xmm0, %xmm0 +;; vpaddd %xmm0, %xmm0, %xmm0 +;; addq $0x20, %rsp +;; popq %rbp +;; retq +;; 50: ud2 diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index 632a9bd185b6..1ad077999260 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -1224,6 +1224,16 @@ impl Masm for MacroAssembler { ) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) } + + fn v128_extadd_pairwise( + &mut self, + _src: Reg, + _dst: WritableReg, + _lane_width: 
OperandSize, + _kind: crate::masm::ExtAddKind, + ) -> Result<()> { + Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) + } } impl MacroAssembler { diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index d8b0c916cece..3330912164b5 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -7,11 +7,7 @@ use super::{ use anyhow::{anyhow, bail, Result}; use crate::masm::{ - DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, Imm as I, - IntCmpKind, LaneSelector, LoadKind, MacroAssembler as Masm, MaxKind, MinKind, MulWideKind, - OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, ShiftKind, SplatKind, - StoreKind, TrapCode, TruncKind, V128AbsKind, V128ConvertKind, V128ExtendKind, V128NarrowKind, - VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS, + DivKind, ExtAddKind, ExtMulKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, Imm as I, IntCmpKind, LaneSelector, LoadKind, MacroAssembler as Masm, MaxKind, MinKind, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, ShiftKind, SplatKind, StoreKind, TrapCode, TruncKind, V128AbsKind, V128ConvertKind, V128ExtendKind, V128NarrowKind, VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS }; use crate::{ abi::{self, align_to, calculate_frame_adjustment, LocalSlot}, @@ -2540,6 +2536,8 @@ impl Masm for MacroAssembler { OperandSize::S32 | OperandSize::S64 => self.asm.xmm_vmovskp_rr(src, dst, size, size), _ => unimplemented!(), } + + Ok(()) } fn v128_min( @@ -2602,33 +2600,35 @@ impl Masm for MacroAssembler { lane_width: OperandSize, kind: ExtMulKind, ) -> Result<()> { + use V128ExtendKind::*; + self.ensure_has_avx()?; // The implementation for extmul is not optimized; for simplicity's sake, we simply perform - // an extention followed by a multiplication using already implemented primitives. 
+ // an extension followed by a multiplication using already implemented primitives. let src1 = context.pop_to_reg(self, None)?; let src2 = context.pop_to_reg(self, None)?; let ext_kind = match (lane_width, kind) { - (OperandSize::S16, ExtMulKind::HighSigned) => V128ExtendKind::HighI8x16S, - (OperandSize::S32, ExtMulKind::HighSigned) => V128ExtendKind::HighI16x8S, - (OperandSize::S64, ExtMulKind::HighSigned) => V128ExtendKind::HighI32x4S, + (OperandSize::S16, ExtMulKind::HighSigned) => HighI8x16S, + (OperandSize::S32, ExtMulKind::HighSigned) => HighI16x8S, + (OperandSize::S64, ExtMulKind::HighSigned) => HighI32x4S, (_, ExtMulKind::HighSigned) => bail!(CodeGenError::unexpected_operand_size()), - (OperandSize::S16, ExtMulKind::LowSigned) => V128ExtendKind::LowI8x16S, - (OperandSize::S32, ExtMulKind::LowSigned) => V128ExtendKind::LowI16x8S, - (OperandSize::S64, ExtMulKind::LowSigned) => V128ExtendKind::LowI32x4S, + (OperandSize::S16, ExtMulKind::LowSigned) => LowI8x16S, + (OperandSize::S32, ExtMulKind::LowSigned) => LowI16x8S, + (OperandSize::S64, ExtMulKind::LowSigned) => LowI32x4S, (_, ExtMulKind::LowSigned) => bail!(CodeGenError::unexpected_operand_size()), - (OperandSize::S16, ExtMulKind::HighUnsigned) => V128ExtendKind::HighI8x16U, - (OperandSize::S32, ExtMulKind::HighUnsigned) => V128ExtendKind::HighI16x8U, - (OperandSize::S64, ExtMulKind::HighUnsigned) => V128ExtendKind::HighI32x4U, + (OperandSize::S16, ExtMulKind::HighUnsigned) => HighI8x16U, + (OperandSize::S32, ExtMulKind::HighUnsigned) => HighI16x8U, + (OperandSize::S64, ExtMulKind::HighUnsigned) => HighI32x4U, (_, ExtMulKind::HighUnsigned) => bail!(CodeGenError::unexpected_operand_size()), - (OperandSize::S16, ExtMulKind::LowUnsigned) => V128ExtendKind::LowI8x16U, - (OperandSize::S32, ExtMulKind::LowUnsigned) => V128ExtendKind::LowI16x8U, - (OperandSize::S64, ExtMulKind::LowUnsigned) => V128ExtendKind::LowI32x4U, + (OperandSize::S16, ExtMulKind::LowUnsigned) => LowI8x16U, + (OperandSize::S32, 
ExtMulKind::LowUnsigned) => LowI16x8U,
+            (OperandSize::S64, ExtMulKind::LowUnsigned) => LowI32x4U,
             (_, ExtMulKind::LowUnsigned) => bail!(CodeGenError::unexpected_operand_size()),
         };
@@ -2640,6 +2640,38 @@ impl Masm for MacroAssembler {
 
         self.v128_mul(context, lane_width)
     }
+
+    fn v128_extadd_pairwise(
+        &mut self,
+        src: Reg,
+        dst: WritableReg,
+        lane_width: OperandSize,
+        kind: ExtAddKind,
+    ) -> Result<()> {
+        use V128ExtendKind::*;
+
+        self.ensure_has_avx()?;
+
+        // The implementation for extadd is not optimized; for simplicity's sake, we simply perform
+        // an extension followed by an addition using already implemented primitives.
+        let (low_kind, high_kind) = match (lane_width, kind) {
+            (OperandSize::S16, ExtAddKind::Signed) => (LowI8x16S, HighI8x16S),
+            (OperandSize::S16, ExtAddKind::Unsigned) => (LowI8x16U, HighI8x16U),
+            (OperandSize::S32, ExtAddKind::Signed) => (LowI16x8S, HighI16x8S),
+            (OperandSize::S32, ExtAddKind::Unsigned) => (LowI16x8U, HighI16x8U),
+            _ => bail!(CodeGenError::unexpected_operand_size()),
+        };
+
+        let tmp = regs::scratch_xmm();
+
+        // Extend the high half into the scratch register first: the unsigned
+        // high extension clobbers the scratch register internally, and `src`
+        // may alias `dst`, so the low half is extended into `dst` last.
+        self.v128_extend(src, writable!(tmp), high_kind)?;
+        self.v128_extend(src, dst, low_kind)?;
+
+        self.v128_add(tmp, dst.to_reg(), dst, lane_width, HandleOverflowKind::None)
+    }
 }
 
 impl MacroAssembler {
diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
index 602844564e89..aa529371bdce 100644
--- a/winch/codegen/src/masm.rs
+++ b/winch/codegen/src/masm.rs
@@ -54,7 +54,7 @@ pub(crate) enum MaxKind {
     Unsigned,
 }
 
-/// Kind of extend-multiply
+/// Kind of extend-multiply.
 pub(crate) enum ExtMulKind {
     // Sign-extend higher-half of each lane.
     HighSigned,
@@ -66,6 +66,14 @@ pub(crate) enum ExtMulKind {
     LowUnsigned,
 }
 
+/// Kind of pairwise extend-add.
+pub(crate) enum ExtAddKind {
+    /// Signed pairwise extend add.
+    Signed,
+    /// Unsigned pairwise extend add.
+ Unsigned, +} + #[derive(Eq, PartialEq)] pub(crate) enum MulWideKind { Signed, @@ -1936,10 +1944,10 @@ pub(crate) trait MacroAssembler { kind: MaxKind, ) -> Result<()>; - /// Perform lane-wise integer extended multiplication producing twice wider result than the inputs. - /// This is equivalent to a an extend followed by a multiply. + /// Perform the lane-wise integer extended multiplication producing twice wider result than the + /// inputs. This is equivalent to an extend followed by a multiply. /// - /// The extention to be performed is infered from the `lane_width` and the `kind` of extmul, + /// The extension to be performed is inferred from the `lane_width` and the `kind` of extmul, /// e.g, if `lane_width` is `S16`, and `kind` is `LowSigned`, then we sign-extend the lower /// 8bits of the 16bits lanes. fn v128_extmul( @@ -1948,4 +1956,14 @@ pub(crate) trait MacroAssembler { lane_width: OperandSize, kind: ExtMulKind, ) -> Result<()>; + + /// Perform the lane-wise integer extended pairwise addition producing extended results (twice + /// wider results than the inputs). 
+ fn v128_extadd_pairwise( + &mut self, + src: Reg, + dst: WritableReg, + lane_width: OperandSize, + kind: ExtAddKind, + ) -> Result<()>; } diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index 4d44b54a95b5..8402190900fc 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -10,11 +10,7 @@ use crate::codegen::{ FnCall, }; use crate::masm::{ - DivKind, Extend, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, IntCmpKind, LoadKind, - MacroAssembler, MaxKind, MemMoveDirection, MinKind, MulWideKind, OperandSize, RegImm, RemKind, - ReplaceLaneKind, RmwOp, RoundingMode, SPOffset, ShiftKind, Signed, SplatKind, SplatLoadKind, - StoreKind, TruncKind, V128AbsKind, V128ConvertKind, V128ExtendKind, V128LoadExtendKind, - V128NarrowKind, VectorCompareKind, VectorEqualityKind, Zero, + DivKind, ExtAddKind, ExtMulKind, Extend, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, IntCmpKind, LoadKind, MacroAssembler, MaxKind, MemMoveDirection, MinKind, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, SPOffset, ShiftKind, Signed, SplatKind, SplatLoadKind, StoreKind, TruncKind, V128AbsKind, V128ConvertKind, V128ExtendKind, V128LoadExtendKind, V128NarrowKind, VectorCompareKind, VectorEqualityKind, Zero }; use crate::reg::{writable, Reg}; @@ -515,6 +511,10 @@ macro_rules! 
def_unsupported { (emit I16x8ExtMulHighI8x16U $($rest:tt)*) => {}; (emit I32x4ExtMulHighI16x8U $($rest:tt)*) => {}; (emit I64x2ExtMulHighI32x4U $($rest:tt)*) => {}; + (emit I16x8ExtAddPairwiseI8x16U $($rest:tt)*) => {}; + (emit I16x8ExtAddPairwiseI8x16S $($rest:tt)*) => {}; + (emit I32x4ExtAddPairwiseI16x8U $($rest:tt)*) => {}; + (emit I32x4ExtAddPairwiseI16x8S $($rest:tt)*) => {}; (emit $unsupported:tt $($rest:tt)*) => {$($rest)*}; } @@ -4097,6 +4097,7 @@ where self.context .binop(self.masm, OperandSize::S16, |masm, dst, src, size| { masm.v128_q15mulr_sat_s(dst, src, writable!(dst), size)?; + Ok(TypedReg::v128(dst)) }) } @@ -4313,6 +4314,34 @@ where .v128_extmul(&mut self.context, OperandSize::S64, ExtMulKind::HighSigned) } + fn visit_i16x8_extadd_pairwise_i8x16_s(&mut self) -> Self::Output { + self.context.unop(self.masm, |masm, op| { + masm.v128_extadd_pairwise(op, writable!(op), OperandSize::S16, ExtAddKind::Signed)?; + Ok(TypedReg::v128(op)) + }) + } + + fn visit_i16x8_extadd_pairwise_i8x16_u(&mut self) -> Self::Output { + self.context.unop(self.masm, |masm, op| { + masm.v128_extadd_pairwise(op, writable!(op), OperandSize::S16, ExtAddKind::Unsigned)?; + Ok(TypedReg::v128(op)) + }) + } + + fn visit_i32x4_extadd_pairwise_i16x8_s(&mut self) -> Self::Output { + self.context.unop(self.masm, |masm, op| { + masm.v128_extadd_pairwise(op, writable!(op), OperandSize::S32, ExtAddKind::Signed)?; + Ok(TypedReg::v128(op)) + }) + } + + fn visit_i32x4_extadd_pairwise_i16x8_u(&mut self) -> Self::Output { + self.context.unop(self.masm, |masm, op| { + masm.v128_extadd_pairwise(op, writable!(op), OperandSize::S32, ExtAddKind::Unsigned)?; + Ok(TypedReg::v128(op)) + }) + } + wasmparser::for_each_visit_simd_operator!(def_unsupported); }