Skip to content

Commit

Permalink
[WASM SIMD] Implement horizontal add for float and integer types.
Browse files Browse the repository at this point in the history
- Adds new F32x4AddHoriz, I32x4AddHoriz, etc. to WASM opcodes.
- Implements them for ARM.

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2804883008
Cr-Commit-Position: refs/heads/master@{#44812}
  • Loading branch information
bbudge authored and Commit bot committed Apr 24, 2017
1 parent 6c0e81b commit a71c338
Show file tree
Hide file tree
Showing 19 changed files with 369 additions and 192 deletions.
29 changes: 28 additions & 1 deletion src/arm/assembler-arm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4486,13 +4486,16 @@ void Assembler::vrsqrts(QwNeonRegister dst, QwNeonRegister src1,
emit(EncodeNeonBinOp(VRSQRTS, dst, src1, src2));
}

enum NeonPairwiseOp { VPMIN, VPMAX };
enum NeonPairwiseOp { VPADD, VPMIN, VPMAX };

static Instr EncodeNeonPairwiseOp(NeonPairwiseOp op, NeonDataType dt,
DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
int op_encoding = 0;
switch (op) {
case VPADD:
op_encoding = 0xB * B8 | B4;
break;
case VPMIN:
op_encoding = 0xA * B8 | B4;
break;
Expand All @@ -4515,6 +4518,30 @@ static Instr EncodeNeonPairwiseOp(NeonPairwiseOp op, NeonDataType dt,
n * B7 | m * B5 | vm | op_encoding;
}

// Emits the NEON floating point pairwise add on D registers:
// Dd = vpadd(Dn, Dm), with dst as Dd, src1 as Dn and src2 as Dm.
// This overload takes no size argument; the integer forms go through the
// vpadd(NeonSize, ...) overload instead.
void Assembler::vpadd(DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
DCHECK(IsEnabled(NEON));
// Dd = vpadd(Dn, Dm) SIMD floating point pairwise ADD.
// Instruction details available in ARM DDI 0406C.b, A8-982.
// split_code() yields the two parts of each register code; they are placed
// in separate fields of the instruction word below.
int vd, d;
dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m;
src2.split_code(&vm, &m);

emit(0x1E6U * B23 | d * B22 | vn * B16 | vd * B12 | 0xD * B8 | n * B7 |
m * B5 | vm);
}

// Emits the NEON integer pairwise add on D registers:
// Dd = vpadd.i<size>(Dn, Dm), with dst as Dd, src1 as Dn and src2 as Dm.
// |size| selects the lane width and must be Neon8, Neon16 or Neon32.
void Assembler::vpadd(NeonSize size, DwVfpRegister dst, DwVfpRegister src1,
                      DwVfpRegister src2) {
  DCHECK(IsEnabled(NEON));
  // A size field of 0b11 (Neon64) is UNDEFINED for the integer VPADD
  // encoding, so reject it here instead of emitting an invalid instruction.
  DCHECK(size != Neon64);
  // Dd = vpadd(Dn, Dm) SIMD integer pairwise ADD.
  // Instruction details available in ARM DDI 0406C.b, A8-980.
  emit(EncodeNeonPairwiseOp(VPADD, NeonSizeToDatatype(size), dst, src1, src2));
}

void Assembler::vpmin(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
DCHECK(IsEnabled(NEON));
Expand Down
3 changes: 3 additions & 0 deletions src/arm/assembler-arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -1371,6 +1371,9 @@ class Assembler : public AssemblerBase {
void vmax(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vmax(NeonDataType dt, QwNeonRegister dst,
QwNeonRegister src1, QwNeonRegister src2);
void vpadd(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2);
void vpadd(NeonSize size, DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2);
void vpmin(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2);
void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
Expand Down
14 changes: 7 additions & 7 deletions src/arm/constants-arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,8 @@ enum LFlag {
Short = 0 << 22 // Short load/store coprocessor.
};

// Neon sizes.
enum NeonSize { Neon8 = 0x0, Neon16 = 0x1, Neon32 = 0x2, Neon64 = 0x3 };

// NEON data type
enum NeonDataType {
Expand All @@ -339,20 +341,18 @@ enum NeonDataType {
// Returns the U bit of |dt|, i.e. everything above the two low size bits.
inline int NeonU(NeonDataType dt) {
  const int encoding = static_cast<int>(dt);
  return encoding >> 2;
}
// Returns the two-bit size field held in the low bits of |dt|.
inline int NeonSz(NeonDataType dt) {
  const int encoding = static_cast<int>(dt);
  return encoding & 0x3;
}

// Maps a NEON element size onto the data type with the same size bits.
// The resulting data type always has its U bit clear.
inline NeonDataType NeonSizeToDatatype(NeonSize size) {
  const int size_bits = static_cast<int>(size);
  return static_cast<NeonDataType>(size_bits);
}

enum NeonListType {
nlt_1 = 0x7,
nlt_2 = 0xA,
nlt_3 = 0x6,
nlt_4 = 0x2
};

enum NeonSize {
Neon8 = 0x0,
Neon16 = 0x1,
Neon32 = 0x2,
Neon64 = 0x3
};

// -----------------------------------------------------------------------------
// Supervisor Call (svc) specific support.

Expand Down
17 changes: 15 additions & 2 deletions src/arm/disasm-arm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1950,6 +1950,13 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
op, size, Vd, Vn, Vm);
break;
}
case 0xb: {
// vpadd.i<size> Dd, Dn, Dm.
// Operands are printed in destination, first source (Vn), second source
// (Vm) order.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vpadd.i%d d%d, d%d, d%d",
size, Vd, Vn, Vm);
break;
}
case 0xd: {
if (instr->Bit(4) == 0) {
const char* op = (instr->Bits(21, 20) == 0) ? "vadd" : "vsub";
Expand Down Expand Up @@ -2130,10 +2137,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
break;
}
case 0xd: {
if (instr->Bit(21) == 0 && instr->Bit(6) == 1 && instr->Bit(4) == 1) {
// vmul.f32 Qd, Qn, Qm
if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 1 &&
instr->Bit(4) == 1) {
// vmul.f32 Qd, Qn, Qm
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmul.f32 q%d, q%d, q%d", Vd, Vn, Vm);
} else if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 0 &&
instr->Bit(4) == 0) {
// vpadd.f32 Dd, Dn, Dm.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vpadd.f32 d%d, d%d, d%d", Vd, Vn, Vm);
} else {
Unknown(instr);
}
Expand Down
40 changes: 39 additions & 1 deletion src/arm/simulator-arm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4278,6 +4278,20 @@ void PairwiseMinMax(Simulator* simulator, int Vd, int Vm, int Vn, bool min) {
simulator->set_neon_register<T, kDoubleSize>(Vd, dst);
}

// Simulates a pairwise add on D registers for lane type T.
// Adjacent lanes of Vn are summed into the low half of Vd, and adjacent lanes
// of Vm into the high half. Note the parameter order: Vd, Vm, Vn.
template <typename T>
void PairwiseAdd(Simulator* simulator, int Vd, int Vm, int Vn) {
  static const int kLanes = kDoubleSize / sizeof(T);
  static const int kHalf = kLanes / 2;
  T first[kLanes], second[kLanes], result[kLanes];
  simulator->get_neon_register<T, kDoubleSize>(Vn, first);
  simulator->get_neon_register<T, kDoubleSize>(Vm, second);
  // |lane| walks the sources two elements at a time while |pair| indexes the
  // destination slot within each half.
  for (int pair = 0, lane = 0; pair < kHalf; ++pair, lane += 2) {
    result[pair] = first[lane] + first[lane + 1];
    result[pair + kHalf] = second[lane] + second[lane + 1];
  }
  simulator->set_neon_register<T, kDoubleSize>(Vd, result);
}

void Simulator::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) {
case 4: {
Expand Down Expand Up @@ -4489,6 +4503,25 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
}
break;
}
case 0xb: {
  // vpadd.i<size> Dd, Dn, Dm.
  // The lane width comes from instruction bits 21:20.
  NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
  // Lanes are added as unsigned values: the result bits are the same as for
  // a signed add, but wrap-around is well defined, whereas overflowing a
  // signed 32-bit lane would be undefined behaviour in the simulator.
  switch (size) {
    case Neon8:
      PairwiseAdd<uint8_t>(this, Vd, Vm, Vn);
      break;
    case Neon16:
      PairwiseAdd<uint16_t>(this, Vd, Vm, Vn);
      break;
    case Neon32:
      PairwiseAdd<uint32_t>(this, Vd, Vm, Vn);
      break;
    default:
      // No other lane width is handled for this instruction.
      UNREACHABLE();
      break;
  }
  break;
}
case 0xd: {
if (instr->Bit(4) == 0) {
float src1[4], src2[4];
Expand Down Expand Up @@ -4837,7 +4870,8 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
break;
}
case 0xd: {
if (instr->Bit(21) == 0 && instr->Bit(6) == 1 && instr->Bit(4) == 1) {
if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 1 &&
instr->Bit(4) == 1) {
// vmul.f32 Qd, Qn, Qm
float src1[4], src2[4];
get_neon_register(Vn, src1);
Expand All @@ -4846,6 +4880,10 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
src1[i] = src1[i] * src2[i];
}
set_neon_register(Vd, src1);
} else if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 0 &&
instr->Bit(4) == 0) {
// vpadd.f32 Dd, Dn, Dm
PairwiseAdd<float>(this, Vd, Vm, Vn);
} else {
UNIMPLEMENTED();
}
Expand Down
139 changes: 67 additions & 72 deletions src/compiler/arm/code-generator-arm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,41 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
DCHECK_EQ(LeaveCC, i.OutputSBit()); \
} while (0)

// Applies vqmovn with data type |dt| to both Simd128 inputs and packs the
// results into one output register: input 0 goes to the low half, input 1 to
// the high half. The instruction order depends on which inputs alias the
// output so that no source is overwritten before it has been read:
//  - output aliases both inputs: convert once, then copy low to high;
//  - output aliases input 0 only: convert input 0 first;
//  - otherwise (output may alias input 1): convert input 1 first.
#define ASSEMBLE_NEON_NARROWING_OP(dt) \
do { \
Simd128Register dst = i.OutputSimd128Register(), \
src0 = i.InputSimd128Register(0), \
src1 = i.InputSimd128Register(1); \
if (dst.is(src0) && dst.is(src1)) { \
__ vqmovn(dt, dst.low(), src0); \
__ vmov(dst.high(), dst.low()); \
} else if (dst.is(src0)) { \
__ vqmovn(dt, dst.low(), src0); \
__ vqmovn(dt, dst.high(), src1); \
} else { \
__ vqmovn(dt, dst.high(), src1); \
__ vqmovn(dt, dst.low(), src0); \
} \
} while (0)

// Applies the pairwise assembler instruction |op| with lane size |size| to
// each Simd128 input: the low and high halves of input 0 are combined into
// the low half of the output, and those of input 1 into the high half.
// The instruction order depends on which inputs alias the output so that no
// source is overwritten before it has been read:
//  - output aliases input 0: process input 0 first; if it also aliases
//    input 1, the high half is just a copy of the low half;
//  - otherwise (output may alias input 1): process input 1 first.
#define ASSEMBLE_NEON_PAIRWISE_OP(op, size) \
do { \
Simd128Register dst = i.OutputSimd128Register(), \
src0 = i.InputSimd128Register(0), \
src1 = i.InputSimd128Register(1); \
if (dst.is(src0)) { \
__ op(size, dst.low(), src0.low(), src0.high()); \
if (dst.is(src1)) { \
__ vmov(dst.high(), dst.low()); \
} else { \
__ op(size, dst.high(), src1.low(), src1.high()); \
} \
} else { \
__ op(size, dst.high(), src1.low(), src1.high()); \
__ op(size, dst.low(), src0.low(), src0.high()); \
} \
} while (0)

void CodeGenerator::AssembleDeconstructFrame() {
__ LeaveFrame(StackFrame::MANUAL);
unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset());
Expand Down Expand Up @@ -1611,6 +1646,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1));
break;
}
case kArmF32x4AddHoriz: {
// Horizontal add: the low half of the output is the pairwise add of input
// 0's two halves, and the high half is the pairwise add of input 1's halves.
Simd128Register dst = i.OutputSimd128Register(),
src0 = i.InputSimd128Register(0),
src1 = i.InputSimd128Register(1);
// Make sure we don't overwrite source data before it's used.
if (dst.is(src0)) {
// dst is src0, so consume src0 first.
__ vpadd(dst.low(), src0.low(), src0.high());
if (dst.is(src1)) {
// Both inputs are the same register: the high result equals the low one.
__ vmov(dst.high(), dst.low());
} else {
__ vpadd(dst.high(), src1.low(), src1.high());
}
} else {
// dst may alias src1, so consume src1 first.
__ vpadd(dst.high(), src1.low(), src1.high());
__ vpadd(dst.low(), src0.low(), src0.high());
}
break;
}
case kArmF32x4Sub: {
__ vsub(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
Expand Down Expand Up @@ -1699,6 +1752,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1));
break;
}
case kArmI32x4AddHoriz:
ASSEMBLE_NEON_PAIRWISE_OP(vpadd, Neon32);
break;
case kArmI32x4Sub: {
__ vsub(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
Expand Down Expand Up @@ -1818,25 +1874,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputInt4(1));
break;
}
case kArmI16x8SConvertI32x4: {
Simd128Register dst = i.OutputSimd128Register(),
src0 = i.InputSimd128Register(0),
src1 = i.InputSimd128Register(1);
// Take care not to overwrite a source register before it's used.
if (dst.is(src0) && dst.is(src1)) {
__ vqmovn(NeonS16, dst.low(), src0);
__ vmov(dst.high(), dst.low());
} else if (dst.is(src0)) {
// dst is src0, so narrow src0 first.
__ vqmovn(NeonS16, dst.low(), src0);
__ vqmovn(NeonS16, dst.high(), src1);
} else {
// dst may alias src1, so narrow src1 first.
__ vqmovn(NeonS16, dst.high(), src1);
__ vqmovn(NeonS16, dst.low(), src0);
}
case kArmI16x8SConvertI32x4:
ASSEMBLE_NEON_NARROWING_OP(NeonS16);
break;
}
case kArmI16x8Add: {
__ vadd(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
Expand All @@ -1847,6 +1887,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1));
break;
}
case kArmI16x8AddHoriz:
ASSEMBLE_NEON_PAIRWISE_OP(vpadd, Neon16);
break;
case kArmI16x8Sub: {
__ vsub(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
Expand Down Expand Up @@ -1909,25 +1952,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputInt4(1));
break;
}
case kArmI16x8UConvertI32x4: {
Simd128Register dst = i.OutputSimd128Register(),
src0 = i.InputSimd128Register(0),
src1 = i.InputSimd128Register(1);
// Take care not to overwrite a source register before it's used.
if (dst.is(src0) && dst.is(src1)) {
__ vqmovn(NeonU16, dst.low(), src0);
__ vmov(dst.high(), dst.low());
} else if (dst.is(src0)) {
// dst is src0, so narrow src0 first.
__ vqmovn(NeonU16, dst.low(), src0);
__ vqmovn(NeonU16, dst.high(), src1);
} else {
// dst may alias src1, so narrow src1 first.
__ vqmovn(NeonU16, dst.high(), src1);
__ vqmovn(NeonU16, dst.low(), src0);
}
case kArmI16x8UConvertI32x4:
ASSEMBLE_NEON_NARROWING_OP(NeonU16);
break;
}
case kArmI16x8AddSaturateU: {
__ vqadd(NeonU16, i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
Expand Down Expand Up @@ -1986,25 +2013,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputInt3(1));
break;
}
case kArmI8x16SConvertI16x8: {
Simd128Register dst = i.OutputSimd128Register(),
src0 = i.InputSimd128Register(0),
src1 = i.InputSimd128Register(1);
// Take care not to overwrite a source register before it's used.
if (dst.is(src0) && dst.is(src1)) {
__ vqmovn(NeonS8, dst.low(), src0);
__ vmov(dst.high(), dst.low());
} else if (dst.is(src0)) {
// dst is src0, so narrow src0 first.
__ vqmovn(NeonS8, dst.low(), src0);
__ vqmovn(NeonS8, dst.high(), src1);
} else {
// dst may alias src1, so narrow src1 first.
__ vqmovn(NeonS8, dst.high(), src1);
__ vqmovn(NeonS8, dst.low(), src0);
}
case kArmI8x16SConvertI16x8:
ASSEMBLE_NEON_NARROWING_OP(NeonS8);
break;
}
case kArmI8x16Add: {
__ vadd(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
Expand Down Expand Up @@ -2066,25 +2077,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputInt3(1));
break;
}
case kArmI8x16UConvertI16x8: {
Simd128Register dst = i.OutputSimd128Register(),
src0 = i.InputSimd128Register(0),
src1 = i.InputSimd128Register(1);
// Take care not to overwrite a source register before it's used.
if (dst.is(src0) && dst.is(src1)) {
__ vqmovn(NeonU8, dst.low(), src0);
__ vmov(dst.high(), dst.low());
} else if (dst.is(src0)) {
// dst is src0, so narrow src0 first.
__ vqmovn(NeonU8, dst.low(), src0);
__ vqmovn(NeonU8, dst.high(), src1);
} else {
// dst may alias src1, so narrow src1 first.
__ vqmovn(NeonU8, dst.high(), src1);
__ vqmovn(NeonU8, dst.low(), src0);
}
case kArmI8x16UConvertI16x8:
ASSEMBLE_NEON_NARROWING_OP(NeonU8);
break;
}
case kArmI8x16AddSaturateU: {
__ vqadd(NeonU8, i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
Expand Down
Loading

0 comments on commit a71c338

Please sign in to comment.