Skip to content

Commit

Permalink
v128 extadd
Browse files Browse the repository at this point in the history
  • Loading branch information
MarinPostma committed Feb 11, 2025
1 parent 65f9b42 commit 448c221
Show file tree
Hide file tree
Showing 9 changed files with 239 additions and 30 deletions.
5 changes: 2 additions & 3 deletions crates/wast-util/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,12 +430,9 @@ impl WastTest {
"spec_testsuite/simd_f64x2_pmin_pmax.wast",
"spec_testsuite/simd_f64x2_rounding.wast",
"spec_testsuite/simd_i16x8_arith2.wast",
"spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
"spec_testsuite/simd_i16x8_extmul_i8x16.wast",
"spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
"spec_testsuite/simd_i32x4_arith2.wast",
"spec_testsuite/simd_i32x4_dot_i16x8.wast",
"spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast",
"spec_testsuite/simd_i32x4_trunc_sat_f32x4.wast",
"spec_testsuite/simd_i32x4_trunc_sat_f64x2.wast",
"spec_testsuite/simd_i64x2_extmul_i32x4.wast",
Expand Down Expand Up @@ -503,6 +500,8 @@ impl WastTest {
"spec_testsuite/simd_i16x8_extmul_i8x16.wast",
"spec_testsuite/simd_i32x4_extmul_i16x8.wast",
"spec_testsuite/simd_i64x2_extmul_i32x4.wast",
"spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
"spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast",
];

if unsupported.iter().any(|part| self.path.ends_with(part)) {
Expand Down
31 changes: 31 additions & 0 deletions tests/disas/winch/x64/i16x8/extadd/extadd_s.wat
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
;;! target = "x86_64"
;;! test = "winch"
;;! flags = [ "-Ccranelift-has-avx" ]

;; Winch x64 disassembly test for `i16x8.extadd_pairwise_i8x16_s` (AVX enabled via the flags above).
;; NOTE(review): the expected output in this file ends with `vpaddw %xmm0, %xmm0, %xmm0`, i.e.
;; the high-half extension is added to itself and the low-half extension written to %xmm15 is
;; never read. Re-bless this file once the lowering is corrected.
(module
(func (param v128) (result v128)
(local.get 0)
(i16x8.extadd_pairwise_i8x16_s)
))
;; wasm[0]::function[0]:
;; pushq %rbp
;; movq %rsp, %rbp
;; movq 8(%rdi), %r11
;; movq 0x10(%r11), %r11
;; addq $0x20, %r11
;; cmpq %rsp, %r11
;; ja 0x51
;; 1c: movq %rdi, %r14
;; subq $0x20, %rsp
;; movq %rdi, 0x18(%rsp)
;; movq %rsi, 0x10(%rsp)
;; movdqu %xmm0, (%rsp)
;; movdqu (%rsp), %xmm0
;; vpmovsxbw %xmm0, %xmm15
;; vpalignr $8, %xmm0, %xmm0, %xmm0
;; vpmovsxbw %xmm0, %xmm0
;; vpaddw %xmm0, %xmm0, %xmm0
;; addq $0x20, %rsp
;; popq %rbp
;; retq
;; 51: ud2
31 changes: 31 additions & 0 deletions tests/disas/winch/x64/i16x8/extadd/extadd_u.wat
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
;;! target = "x86_64"
;;! test = "winch"
;;! flags = [ "-Ccranelift-has-avx" ]

;; Winch x64 disassembly test for `i16x8.extadd_pairwise_i8x16_u` (AVX enabled via the flags above).
;; NOTE(review): the expected output in this file zeroes %xmm15 with `vpxor` right after the
;; low-half extension was written to it, and ends with `vpaddw %xmm0, %xmm0, %xmm0`, so the
;; low-half extension never contributes to the result. Re-bless once the lowering is corrected.
(module
(func (param v128) (result v128)
(local.get 0)
(i16x8.extadd_pairwise_i8x16_u)
))
;; wasm[0]::function[0]:
;; pushq %rbp
;; movq %rsp, %rbp
;; movq 8(%rdi), %r11
;; movq 0x10(%r11), %r11
;; addq $0x20, %r11
;; cmpq %rsp, %r11
;; ja 0x50
;; 1c: movq %rdi, %r14
;; subq $0x20, %rsp
;; movq %rdi, 0x18(%rsp)
;; movq %rsi, 0x10(%rsp)
;; movdqu %xmm0, (%rsp)
;; movdqu (%rsp), %xmm0
;; vpmovzxbw %xmm0, %xmm15
;; vpxor %xmm15, %xmm15, %xmm15
;; vpunpckhbw %xmm15, %xmm0, %xmm0
;; vpaddw %xmm0, %xmm0, %xmm0
;; addq $0x20, %rsp
;; popq %rbp
;; retq
;; 50: ud2
31 changes: 31 additions & 0 deletions tests/disas/winch/x64/i32x4/extadd/extadd_s.wat
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
;;! target = "x86_64"
;;! test = "winch"
;;! flags = [ "-Ccranelift-has-avx" ]

;; Winch x64 disassembly test for `i32x4.extadd_pairwise_i16x8_s` (AVX enabled via the flags above).
;; NOTE(review): the expected output in this file ends with `vpaddd %xmm0, %xmm0, %xmm0`, i.e.
;; the high-half extension is added to itself and the low-half extension written to %xmm15 is
;; never read. Re-bless this file once the lowering is corrected.
(module
(func (param v128) (result v128)
(local.get 0)
(i32x4.extadd_pairwise_i16x8_s)
))
;; wasm[0]::function[0]:
;; pushq %rbp
;; movq %rsp, %rbp
;; movq 8(%rdi), %r11
;; movq 0x10(%r11), %r11
;; addq $0x20, %r11
;; cmpq %rsp, %r11
;; ja 0x51
;; 1c: movq %rdi, %r14
;; subq $0x20, %rsp
;; movq %rdi, 0x18(%rsp)
;; movq %rsi, 0x10(%rsp)
;; movdqu %xmm0, (%rsp)
;; movdqu (%rsp), %xmm0
;; vpmovsxwd %xmm0, %xmm15
;; vpalignr $8, %xmm0, %xmm0, %xmm0
;; vpmovsxwd %xmm0, %xmm0
;; vpaddd %xmm0, %xmm0, %xmm0
;; addq $0x20, %rsp
;; popq %rbp
;; retq
;; 51: ud2
31 changes: 31 additions & 0 deletions tests/disas/winch/x64/i32x4/extadd/extadd_u.wat
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
;;! target = "x86_64"
;;! test = "winch"
;;! flags = [ "-Ccranelift-has-avx" ]

;; Winch x64 disassembly test for `i32x4.extadd_pairwise_i16x8_u` (AVX enabled via the flags above).
;; NOTE(review): the expected output in this file zeroes %xmm15 with `vpxor` right after the
;; low-half extension was written to it, and ends with `vpaddd %xmm0, %xmm0, %xmm0`, so the
;; low-half extension never contributes to the result. Re-bless once the lowering is corrected.
(module
(func (param v128) (result v128)
(local.get 0)
(i32x4.extadd_pairwise_i16x8_u)
))
;; wasm[0]::function[0]:
;; pushq %rbp
;; movq %rsp, %rbp
;; movq 8(%rdi), %r11
;; movq 0x10(%r11), %r11
;; addq $0x20, %r11
;; cmpq %rsp, %r11
;; ja 0x50
;; 1c: movq %rdi, %r14
;; subq $0x20, %rsp
;; movq %rdi, 0x18(%rsp)
;; movq %rsi, 0x10(%rsp)
;; movdqu %xmm0, (%rsp)
;; movdqu (%rsp), %xmm0
;; vpmovzxwd %xmm0, %xmm15
;; vpxor %xmm15, %xmm15, %xmm15
;; vpunpckhwd %xmm15, %xmm0, %xmm0
;; vpaddd %xmm0, %xmm0, %xmm0
;; addq $0x20, %rsp
;; popq %rbp
;; retq
;; 50: ud2
10 changes: 10 additions & 0 deletions winch/codegen/src/isa/aarch64/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1224,6 +1224,16 @@ impl Masm for MacroAssembler {
) -> Result<()> {
Err(anyhow!(CodeGenError::unimplemented_masm_instruction()))
}

/// Pairwise extending integer add (`extadd_pairwise`).
///
/// Not implemented in this backend yet: every argument is ignored and the call always
/// returns the `unimplemented_masm_instruction` codegen error.
fn v128_extadd_pairwise(
&mut self,
_src: Reg,
_dst: WritableReg,
_lane_width: OperandSize,
_kind: crate::masm::ExtAddKind,
) -> Result<()> {
Err(anyhow!(CodeGenError::unimplemented_masm_instruction()))
}
}

impl MacroAssembler {
Expand Down
65 changes: 47 additions & 18 deletions winch/codegen/src/isa/x64/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@ use super::{
use anyhow::{anyhow, bail, Result};

use crate::masm::{
DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, Imm as I,
IntCmpKind, LaneSelector, LoadKind, MacroAssembler as Masm, MaxKind, MinKind, MulWideKind,
OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, ShiftKind, SplatKind,
StoreKind, TrapCode, TruncKind, V128AbsKind, V128ConvertKind, V128ExtendKind, V128NarrowKind,
VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS,
DivKind, ExtAddKind, ExtMulKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, Imm as I, IntCmpKind, LaneSelector, LoadKind, MacroAssembler as Masm, MaxKind, MinKind, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, ShiftKind, SplatKind, StoreKind, TrapCode, TruncKind, V128AbsKind, V128ConvertKind, V128ExtendKind, V128NarrowKind, VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS
};
use crate::{
abi::{self, align_to, calculate_frame_adjustment, LocalSlot},
Expand Down Expand Up @@ -2540,6 +2536,8 @@ impl Masm for MacroAssembler {
OperandSize::S32 | OperandSize::S64 => self.asm.xmm_vmovskp_rr(src, dst, size, size),
_ => unimplemented!(),
}

Ok(())
}

fn v128_min(
Expand Down Expand Up @@ -2602,33 +2600,35 @@ impl Masm for MacroAssembler {
lane_width: OperandSize,
kind: ExtMulKind,
) -> Result<()> {
use V128ExtendKind::*;

self.ensure_has_avx()?;

// The implementation for extmul is not optimized; for simplicity's sake, we simply perform
// an extention followed by a multiplication using already implemented primitives.
// an extension followed by a multiplication using already implemented primitives.

let src1 = context.pop_to_reg(self, None)?;
let src2 = context.pop_to_reg(self, None)?;

let ext_kind = match (lane_width, kind) {
(OperandSize::S16, ExtMulKind::HighSigned) => V128ExtendKind::HighI8x16S,
(OperandSize::S32, ExtMulKind::HighSigned) => V128ExtendKind::HighI16x8S,
(OperandSize::S64, ExtMulKind::HighSigned) => V128ExtendKind::HighI32x4S,
(OperandSize::S16, ExtMulKind::HighSigned) => HighI8x16S,
(OperandSize::S32, ExtMulKind::HighSigned) => HighI16x8S,
(OperandSize::S64, ExtMulKind::HighSigned) => HighI32x4S,
(_, ExtMulKind::HighSigned) => bail!(CodeGenError::unexpected_operand_size()),

(OperandSize::S16, ExtMulKind::LowSigned) => V128ExtendKind::LowI8x16S,
(OperandSize::S32, ExtMulKind::LowSigned) => V128ExtendKind::LowI16x8S,
(OperandSize::S64, ExtMulKind::LowSigned) => V128ExtendKind::LowI32x4S,
(OperandSize::S16, ExtMulKind::LowSigned) => LowI8x16S,
(OperandSize::S32, ExtMulKind::LowSigned) => LowI16x8S,
(OperandSize::S64, ExtMulKind::LowSigned) => LowI32x4S,
(_, ExtMulKind::LowSigned) => bail!(CodeGenError::unexpected_operand_size()),

(OperandSize::S16, ExtMulKind::HighUnsigned) => V128ExtendKind::HighI8x16U,
(OperandSize::S32, ExtMulKind::HighUnsigned) => V128ExtendKind::HighI16x8U,
(OperandSize::S64, ExtMulKind::HighUnsigned) => V128ExtendKind::HighI32x4U,
(OperandSize::S16, ExtMulKind::HighUnsigned) => HighI8x16U,
(OperandSize::S32, ExtMulKind::HighUnsigned) => HighI16x8U,
(OperandSize::S64, ExtMulKind::HighUnsigned) => HighI32x4U,
(_, ExtMulKind::HighUnsigned) => bail!(CodeGenError::unexpected_operand_size()),

(OperandSize::S16, ExtMulKind::LowUnsigned) => V128ExtendKind::LowI8x16U,
(OperandSize::S32, ExtMulKind::LowUnsigned) => V128ExtendKind::LowI16x8U,
(OperandSize::S64, ExtMulKind::LowUnsigned) => V128ExtendKind::LowI32x4U,
(OperandSize::S16, ExtMulKind::LowUnsigned) => LowI8x16U,
(OperandSize::S32, ExtMulKind::LowUnsigned) => LowI16x8U,
(OperandSize::S64, ExtMulKind::LowUnsigned) => LowI32x4U,
(_, ExtMulKind::LowUnsigned) => bail!(CodeGenError::unexpected_operand_size()),
};

Expand All @@ -2640,6 +2640,35 @@ impl Masm for MacroAssembler {

self.v128_mul(context, lane_width)
}

/// Emits the x64 lowering for the `extadd_pairwise` family of SIMD instructions.
///
/// * `src` - register holding the input vector.
/// * `dst` - register receiving the result.
/// * `lane_width` - width of the *result* lanes; only `S16` (8-bit inputs) and `S32`
///   (16-bit inputs) are accepted.
/// * `kind` - whether the input lanes are extended as signed or unsigned values.
///
/// # Errors
///
/// Fails if AVX is not available (`ensure_has_avx`), or with `unexpected_operand_size` for
/// any `lane_width` other than `S16`/`S32`.
///
/// NOTE(review): this lowering looks incorrect and should be revisited before relying on it:
/// 1. `tmp` receives the low-half extension but is never read; the final add uses `src` and
///    `dst` instead.
/// 2. The visitor passes the same register for `src` and `dst`. In that case the second
///    extend overwrites `src` with the high-half extension, so the add computes
///    `high + high` (the expected disassembly ends in `vpaddw %xmm0, %xmm0, %xmm0`).
/// 3. Even `low + high` would add lane `i` to lane `i + N/2`, whereas the Wasm instruction
///    sums *adjacent* input lanes (`2i` and `2i + 1`) — TODO confirm against the SIMD spec
///    and the `simd_*_extadd_pairwise_*` spec tests.
/// 4. The unsigned high-half extend appears to zero the scratch XMM register itself (see the
///    `vpxor %xmm15, %xmm15, %xmm15` in the expected disassembly), which would clobber `tmp`
///    — TODO confirm in `v128_extend`.
fn v128_extadd_pairwise(
&mut self,
src: Reg,
dst: WritableReg,
lane_width: OperandSize,
kind: ExtAddKind,
) -> Result<()> {
use V128ExtendKind::*;

self.ensure_has_avx()?;

// The implementation for extadd is not optimized; for simplicity's sake, we simply perform
// an extension followed by an addition using already implemented primitives.
// Pick the (low-half, high-half) extension kinds matching the result lane width and
// signedness; any other lane width is rejected.
let (low_kind, high_kind) = match (lane_width, kind) {
(OperandSize::S16, ExtAddKind::Signed) => (LowI8x16S, HighI8x16S),
(OperandSize::S16, ExtAddKind::Unsigned) => (LowI8x16U, HighI8x16U),
(OperandSize::S32, ExtAddKind::Signed) => (LowI16x8S, HighI16x8S),
(OperandSize::S32, ExtAddKind::Unsigned) => (LowI16x8U, HighI16x8U),
_ => bail!(CodeGenError::unexpected_operand_size()),
};

// Scratch XMM register intended to hold the extended low half.
let tmp = regs::scratch_xmm();

// Low half -> `tmp`, high half -> `dst`.
self.v128_extend(src, writable!(tmp), low_kind)?;
self.v128_extend(src, dst, high_kind)?;

// NOTE(review): adds `src` and `dst`, not `tmp` and `dst` — see the notes above.
self.v128_add(src, dst.to_reg(), dst, lane_width, HandleOverflowKind::None)
}
}

impl MacroAssembler {
Expand Down
26 changes: 22 additions & 4 deletions winch/codegen/src/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ pub(crate) enum MaxKind {
Unsigned,
}

/// Kind of extend-multiply
/// Kind of extend-multiply.
pub(crate) enum ExtMulKind {
// Sign-extend higher-half of each lane.
HighSigned,
Expand All @@ -66,6 +66,14 @@ pub(crate) enum ExtMulKind {
LowUnsigned,
}

/// Kind of pairwise extend-add.
///
/// Selects the signedness used when widening the input lanes of a
/// `v128_extadd_pairwise` operation.
pub(crate) enum ExtAddKind {
/// Signed pairwise extend add: input lanes are widened with a signed extension.
Signed,
/// Unsigned pairwise extend add: input lanes are widened with an unsigned extension.
Unsigned,
}

#[derive(Eq, PartialEq)]
pub(crate) enum MulWideKind {
Signed,
Expand Down Expand Up @@ -1936,10 +1944,10 @@ pub(crate) trait MacroAssembler {
kind: MaxKind,
) -> Result<()>;

/// Perform lane-wise integer extended multiplication producing twice wider result than the inputs.
/// This is equivalent to a an extend followed by a multiply.
/// Perform the lane-wise integer extended multiplication producing twice wider result than the
/// inputs. This is equivalent to an extend followed by a multiply.
///
/// The extention to be performed is infered from the `lane_width` and the `kind` of extmul,
/// The extension to be performed is inferred from the `lane_width` and the `kind` of extmul,
/// e.g, if `lane_width` is `S16`, and `kind` is `LowSigned`, then we sign-extend the lower
/// 8bits of the 16bits lanes.
fn v128_extmul(
Expand All @@ -1948,4 +1956,14 @@ pub(crate) trait MacroAssembler {
lane_width: OperandSize,
kind: ExtMulKind,
) -> Result<()>;

/// Perform the lane-wise integer extended pairwise addition producing extended results (twice
/// wider results than the inputs).
///
/// * `src` - register holding the input vector.
/// * `dst` - register that receives the result.
/// * `lane_width` - width of the result lanes, e.g. `S16` for
///   `i16x8.extadd_pairwise_i8x16_{s,u}` and `S32` for `i32x4.extadd_pairwise_i16x8_{s,u}`.
/// * `kind` - whether the input lanes are extended as signed or unsigned values.
fn v128_extadd_pairwise(
&mut self,
src: Reg,
dst: WritableReg,
lane_width: OperandSize,
kind: ExtAddKind,
) -> Result<()>;
}
39 changes: 34 additions & 5 deletions winch/codegen/src/visitor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,7 @@ use crate::codegen::{
FnCall,
};
use crate::masm::{
DivKind, Extend, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, IntCmpKind, LoadKind,
MacroAssembler, MaxKind, MemMoveDirection, MinKind, MulWideKind, OperandSize, RegImm, RemKind,
ReplaceLaneKind, RmwOp, RoundingMode, SPOffset, ShiftKind, Signed, SplatKind, SplatLoadKind,
StoreKind, TruncKind, V128AbsKind, V128ConvertKind, V128ExtendKind, V128LoadExtendKind,
V128NarrowKind, VectorCompareKind, VectorEqualityKind, Zero,
DivKind, ExtAddKind, ExtMulKind, Extend, ExtractLaneKind, FloatCmpKind, HandleOverflowKind, IntCmpKind, LoadKind, MacroAssembler, MaxKind, MemMoveDirection, MinKind, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, SPOffset, ShiftKind, Signed, SplatKind, SplatLoadKind, StoreKind, TruncKind, V128AbsKind, V128ConvertKind, V128ExtendKind, V128LoadExtendKind, V128NarrowKind, VectorCompareKind, VectorEqualityKind, Zero
};

use crate::reg::{writable, Reg};
Expand Down Expand Up @@ -515,6 +511,10 @@ macro_rules! def_unsupported {
(emit I16x8ExtMulHighI8x16U $($rest:tt)*) => {};
(emit I32x4ExtMulHighI16x8U $($rest:tt)*) => {};
(emit I64x2ExtMulHighI32x4U $($rest:tt)*) => {};
(emit I16x8ExtAddPairwiseI8x16U $($rest:tt)*) => {};
(emit I16x8ExtAddPairwiseI8x16S $($rest:tt)*) => {};
(emit I32x4ExtAddPairwiseI16x8U $($rest:tt)*) => {};
(emit I32x4ExtAddPairwiseI16x8S $($rest:tt)*) => {};

(emit $unsupported:tt $($rest:tt)*) => {$($rest)*};
}
Expand Down Expand Up @@ -4097,6 +4097,7 @@ where
self.context
.binop(self.masm, OperandSize::S16, |masm, dst, src, size| {
masm.v128_q15mulr_sat_s(dst, src, writable!(dst), size)?;
Ok(TypedReg::v128(dst))
})
}

Expand Down Expand Up @@ -4313,6 +4314,34 @@ where
.v128_extmul(&mut self.context, OperandSize::S64, ExtMulKind::HighSigned)
}

/// Emits `i16x8.extadd_pairwise_i8x16_s`: a signed pairwise extend-add with 16-bit result
/// lanes, using the operand register as both source and destination.
fn visit_i16x8_extadd_pairwise_i8x16_s(&mut self) -> Self::Output {
    self.context.unop(self.masm, |masm, reg| {
        // The operation is performed in place on the operand register.
        let dst = writable!(reg);
        masm.v128_extadd_pairwise(reg, dst, OperandSize::S16, ExtAddKind::Signed)?;
        Ok(TypedReg::v128(reg))
    })
}

/// Emits `i16x8.extadd_pairwise_i8x16_u`: an unsigned pairwise extend-add with 16-bit result
/// lanes, using the operand register as both source and destination.
fn visit_i16x8_extadd_pairwise_i8x16_u(&mut self) -> Self::Output {
    self.context.unop(self.masm, |masm, reg| {
        // The operation is performed in place on the operand register.
        let dst = writable!(reg);
        masm.v128_extadd_pairwise(reg, dst, OperandSize::S16, ExtAddKind::Unsigned)?;
        Ok(TypedReg::v128(reg))
    })
}

/// Emits `i32x4.extadd_pairwise_i16x8_s`: a signed pairwise extend-add with 32-bit result
/// lanes, using the operand register as both source and destination.
fn visit_i32x4_extadd_pairwise_i16x8_s(&mut self) -> Self::Output {
    self.context.unop(self.masm, |masm, reg| {
        // The operation is performed in place on the operand register.
        let dst = writable!(reg);
        masm.v128_extadd_pairwise(reg, dst, OperandSize::S32, ExtAddKind::Signed)?;
        Ok(TypedReg::v128(reg))
    })
}

/// Emits `i32x4.extadd_pairwise_i16x8_u`: an unsigned pairwise extend-add with 32-bit result
/// lanes, using the operand register as both source and destination.
fn visit_i32x4_extadd_pairwise_i16x8_u(&mut self) -> Self::Output {
    self.context.unop(self.masm, |masm, reg| {
        // The operation is performed in place on the operand register.
        let dst = writable!(reg);
        masm.v128_extadd_pairwise(reg, dst, OperandSize::S32, ExtAddKind::Unsigned)?;
        Ok(TypedReg::v128(reg))
    })
}

wasmparser::for_each_visit_simd_operator!(def_unsupported);
}

Expand Down

0 comments on commit 448c221

Please sign in to comment.