[WASM SIMD] Implement horizontal add for float and integer types.

- Adds new F32x4AddHoriz, I32x4AddHoriz, etc. to WASM opcodes.
- Implements them for ARM.

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2804883008
Cr-Commit-Position: refs/heads/master@{#44812}
WanderWang · Apr 24, 2017 · a71c338 · a71c338
1 parent 6c0e81b
commit a71c338
Show file tree

Hide file tree

Showing 19 changed files with 369 additions and 192 deletions.
diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc
@@ -4486,13 +4486,16 @@ void Assembler::vrsqrts(QwNeonRegister dst, QwNeonRegister src1,
   emit(EncodeNeonBinOp(VRSQRTS, dst, src1, src2));
 }
 
-enum NeonPairwiseOp { VPMIN, VPMAX };
+enum NeonPairwiseOp { VPADD, VPMIN, VPMAX };
 
 static Instr EncodeNeonPairwiseOp(NeonPairwiseOp op, NeonDataType dt,
                                   DwVfpRegister dst, DwVfpRegister src1,
                                   DwVfpRegister src2) {
   int op_encoding = 0;
   switch (op) {
+    case VPADD:
+      op_encoding = 0xB * B8 | B4;
+      break;
     case VPMIN:
       op_encoding = 0xA * B8 | B4;
       break;
@@ -4515,6 +4518,30 @@ static Instr EncodeNeonPairwiseOp(NeonPairwiseOp op, NeonDataType dt,
          n * B7 | m * B5 | vm | op_encoding;
 }
 
+void Assembler::vpadd(DwVfpRegister dst, DwVfpRegister src1,
+                      DwVfpRegister src2) {
+  DCHECK(IsEnabled(NEON));
+  // Dd = vpadd(Dn, Dm) SIMD floating point pairwise ADD.
+  // Instruction details available in ARM DDI 0406C.b, A8-982.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vn, n;
+  src1.split_code(&vn, &n);
+  int vm, m;
+  src2.split_code(&vm, &m);
+
+  emit(0x1E6U * B23 | d * B22 | vn * B16 | vd * B12 | 0xD * B8 | n * B7 |
+       m * B5 | vm);
+}
+
+void Assembler::vpadd(NeonSize size, DwVfpRegister dst, DwVfpRegister src1,
+                      DwVfpRegister src2) {
+  DCHECK(IsEnabled(NEON));
+  // Dd = vpadd(Dn, Dm) SIMD integer pairwise ADD; |size| is the element size.
+  // Instruction details available in ARM DDI 0406C.b, A8-980.
+  emit(EncodeNeonPairwiseOp(VPADD, NeonSizeToDatatype(size), dst, src1, src2));
+}
+
 void Assembler::vpmin(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
                       DwVfpRegister src2) {
   DCHECK(IsEnabled(NEON));

diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h
@@ -1371,6 +1371,9 @@ class Assembler : public AssemblerBase {
   void vmax(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
   void vmax(NeonDataType dt, QwNeonRegister dst,
             QwNeonRegister src1, QwNeonRegister src2);
+  void vpadd(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2);
+  void vpadd(NeonSize size, DwVfpRegister dst, DwVfpRegister src1,
+             DwVfpRegister src2);
   void vpmin(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
              DwVfpRegister src2);
   void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,

diff --git a/src/arm/constants-arm.h b/src/arm/constants-arm.h
@@ -324,6 +324,8 @@ enum LFlag {
   Short = 0 << 22   // Short load/store coprocessor.
 };
 
+// Neon sizes. The value is log2 of the element size in bytes.
+enum NeonSize { Neon8 = 0x0, Neon16 = 0x1, Neon32 = 0x2, Neon64 = 0x3 };
 
 // NEON data type
 enum NeonDataType {
@@ -339,20 +341,18 @@ enum NeonDataType {
 inline int NeonU(NeonDataType dt) { return static_cast<int>(dt) >> 2; }
 inline int NeonSz(NeonDataType dt) { return static_cast<int>(dt) & 0x3; }
 
+// Convert sizes to data types: NeonSz() of the result is |size|, NeonU() is 0.
+inline NeonDataType NeonSizeToDatatype(NeonSize size) {
+  return static_cast<NeonDataType>(size);
+}
+
 enum NeonListType {
   nlt_1 = 0x7,
   nlt_2 = 0xA,
   nlt_3 = 0x6,
   nlt_4 = 0x2
 };
 
-enum NeonSize {
-  Neon8 = 0x0,
-  Neon16 = 0x1,
-  Neon32 = 0x2,
-  Neon64 = 0x3
-};
-
 // -----------------------------------------------------------------------------
 // Supervisor Call (svc) specific support.
 

diff --git a/src/arm/disasm-arm.cc b/src/arm/disasm-arm.cc
@@ -1950,6 +1950,13 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
                        op, size, Vd, Vn, Vm);
           break;
         }
+        case 0xb: {
+          // vpadd.i<size> Dd, Dn, Dm.
+          out_buffer_pos_ +=
+              SNPrintF(out_buffer_ + out_buffer_pos_, "vpadd.i%d d%d, d%d, d%d",
+                       size, Vd, Vn, Vm);
+          break;
+        }
         case 0xd: {
           if (instr->Bit(4) == 0) {
             const char* op = (instr->Bits(21, 20) == 0) ? "vadd" : "vsub";
@@ -2130,10 +2137,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
           break;
         }
         case 0xd: {
-          if (instr->Bit(21) == 0 && instr->Bit(6) == 1 && instr->Bit(4) == 1) {
-            // vmul.f32 Qd, Qn, Qm
+          if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 1 &&
+              instr->Bit(4) == 1) {
+            // vmul.f32 Qd, Qn, Qm
             out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
                                         "vmul.f32 q%d, q%d, q%d", Vd, Vn, Vm);
+          } else if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 0 &&
+                     instr->Bit(4) == 0) {
+            // vpadd.f32 Dd, Dn, Dm.
+            out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                        "vpadd.f32 d%d, d%d, d%d", Vd, Vn, Vm);
           } else {
             Unknown(instr);
           }

diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc
@@ -4278,6 +4278,20 @@ void PairwiseMinMax(Simulator* simulator, int Vd, int Vm, int Vn, bool min) {
   simulator->set_neon_register<T, kDoubleSize>(Vd, dst);
 }
 
+template <typename T>  // Vd = {pair sums of Vn, pair sums of Vm}.
+void PairwiseAdd(Simulator* simulator, int Vd, int Vm, int Vn) {
+  static const int kElems = kDoubleSize / sizeof(T);  // Ts in kDoubleSize bytes.
+  static const int kPairs = kElems / 2;
+  T dst[kElems], src1[kElems], src2[kElems];
+  simulator->get_neon_register<T, kDoubleSize>(Vn, src1);  // src1 holds Vn.
+  simulator->get_neon_register<T, kDoubleSize>(Vm, src2);  // src2 holds Vm.
+  for (int i = 0; i < kPairs; i++) {  // NOTE(review): int32_t add may overflow.
+    dst[i] = src1[i * 2] + src1[i * 2 + 1];  // Low half: adjacent sums of Vn.
+    dst[i + kPairs] = src2[i * 2] + src2[i * 2 + 1];  // High half: from Vm.
+  }
+  simulator->set_neon_register<T, kDoubleSize>(Vd, dst);
+}
+
 void Simulator::DecodeSpecialCondition(Instruction* instr) {
   switch (instr->SpecialValue()) {
     case 4: {
@@ -4489,6 +4503,25 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
           }
           break;
         }
+        case 0xb: {
+          // vpadd.i<size> Dd, Dn, Dm.
+          NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
+          switch (size) {
+            case Neon8:
+              PairwiseAdd<int8_t>(this, Vd, Vm, Vn);
+              break;
+            case Neon16:
+              PairwiseAdd<int16_t>(this, Vd, Vm, Vn);
+              break;
+            case Neon32:
+              PairwiseAdd<int32_t>(this, Vd, Vm, Vn);
+              break;
+            default:
+              UNREACHABLE();
+              break;
+          }
+          break;
+        }
         case 0xd: {
           if (instr->Bit(4) == 0) {
             float src1[4], src2[4];
@@ -4837,7 +4870,8 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
           break;
         }
         case 0xd: {
-          if (instr->Bit(21) == 0 && instr->Bit(6) == 1 && instr->Bit(4) == 1) {
+          if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 1 &&
+              instr->Bit(4) == 1) {
             // vmul.f32 Qd, Qn, Qm
             float src1[4], src2[4];
             get_neon_register(Vn, src1);
@@ -4846,6 +4880,10 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
               src1[i] = src1[i] * src2[i];
             }
             set_neon_register(Vd, src1);
+          } else if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 0 &&
+                     instr->Bit(4) == 0) {
+            // vpadd.f32 Dd, Dn, Dm
+            PairwiseAdd<float>(this, Vd, Vm, Vn);
           } else {
             UNIMPLEMENTED();
           }

diff --git a/src/compiler/arm/code-generator-arm.cc b/src/compiler/arm/code-generator-arm.cc
@@ -496,6 +496,41 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
     DCHECK_EQ(LeaveCC, i.OutputSBit());                                        \
   } while (0)
 
+#define ASSEMBLE_NEON_NARROWING_OP(dt)                \
+  do { /* Never clobber a source before use. */       \
+    Simd128Register dst = i.OutputSimd128Register(),  \
+                    src0 = i.InputSimd128Register(0), \
+                    src1 = i.InputSimd128Register(1); \
+    if (dst.is(src0) && dst.is(src1)) {               \
+      __ vqmovn(dt, dst.low(), src0);                 \
+      __ vmov(dst.high(), dst.low());                 \
+    } else if (dst.is(src0)) { /* src0 first */       \
+      __ vqmovn(dt, dst.low(), src0);                 \
+      __ vqmovn(dt, dst.high(), src1);                \
+    } else { /* dst may alias src1: src1 first */     \
+      __ vqmovn(dt, dst.high(), src1);                \
+      __ vqmovn(dt, dst.low(), src0);                 \
+    }                                                 \
+  } while (0)
+
+#define ASSEMBLE_NEON_PAIRWISE_OP(op, size)               \
+  do { /* Never clobber a source before use. */           \
+    Simd128Register dst = i.OutputSimd128Register(),      \
+                    src0 = i.InputSimd128Register(0),     \
+                    src1 = i.InputSimd128Register(1);     \
+    if (dst.is(src0)) { /* src0 first */                  \
+      __ op(size, dst.low(), src0.low(), src0.high());    \
+      if (dst.is(src1)) { /* src0 == src1: copy */        \
+        __ vmov(dst.high(), dst.low());                   \
+      } else {                                            \
+        __ op(size, dst.high(), src1.low(), src1.high()); \
+      }                                                   \
+    } else { /* dst may alias src1: src1 first */         \
+      __ op(size, dst.high(), src1.low(), src1.high());   \
+      __ op(size, dst.low(), src0.low(), src0.high());    \
+    }                                                     \
+  } while (0)
+
 void CodeGenerator::AssembleDeconstructFrame() {
   __ LeaveFrame(StackFrame::MANUAL);
   unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset());
@@ -1611,6 +1646,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputSimd128Register(1));
       break;
     }
+    case kArmF32x4AddHoriz: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      // Make sure we don't overwrite source data before it's used.
+      if (dst.is(src0)) {
+        __ vpadd(dst.low(), src0.low(), src0.high());
+        if (dst.is(src1)) {
+          __ vmov(dst.high(), dst.low());
+        } else {
+          __ vpadd(dst.high(), src1.low(), src1.high());
+        }
+      } else {
+        __ vpadd(dst.high(), src1.low(), src1.high());
+        __ vpadd(dst.low(), src0.low(), src0.high());
+      }
+      break;
+    }
     case kArmF32x4Sub: {
       __ vsub(i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputSimd128Register(1));
@@ -1699,6 +1752,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputSimd128Register(1));
       break;
     }
+    case kArmI32x4AddHoriz:
+      ASSEMBLE_NEON_PAIRWISE_OP(vpadd, Neon32);
+      break;
     case kArmI32x4Sub: {
       __ vsub(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputSimd128Register(1));
@@ -1818,25 +1874,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputInt4(1));
       break;
     }
-    case kArmI16x8SConvertI32x4: {
-      Simd128Register dst = i.OutputSimd128Register(),
-                      src0 = i.InputSimd128Register(0),
-                      src1 = i.InputSimd128Register(1);
-      // Take care not to overwrite a source register before it's used.
-      if (dst.is(src0) && dst.is(src1)) {
-        __ vqmovn(NeonS16, dst.low(), src0);
-        __ vmov(dst.high(), dst.low());
-      } else if (dst.is(src0)) {
-        // dst is src0, so narrow src0 first.
-        __ vqmovn(NeonS16, dst.low(), src0);
-        __ vqmovn(NeonS16, dst.high(), src1);
-      } else {
-        // dst may alias src1, so narrow src1 first.
-        __ vqmovn(NeonS16, dst.high(), src1);
-        __ vqmovn(NeonS16, dst.low(), src0);
-      }
+    case kArmI16x8SConvertI32x4:
+      ASSEMBLE_NEON_NARROWING_OP(NeonS16);
       break;
-    }
     case kArmI16x8Add: {
       __ vadd(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputSimd128Register(1));
@@ -1847,6 +1887,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                i.InputSimd128Register(1));
       break;
     }
+    case kArmI16x8AddHoriz:
+      ASSEMBLE_NEON_PAIRWISE_OP(vpadd, Neon16);
+      break;
     case kArmI16x8Sub: {
       __ vsub(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputSimd128Register(1));
@@ -1909,25 +1952,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputInt4(1));
       break;
     }
-    case kArmI16x8UConvertI32x4: {
-      Simd128Register dst = i.OutputSimd128Register(),
-                      src0 = i.InputSimd128Register(0),
-                      src1 = i.InputSimd128Register(1);
-      // Take care not to overwrite a source register before it's used.
-      if (dst.is(src0) && dst.is(src1)) {
-        __ vqmovn(NeonU16, dst.low(), src0);
-        __ vmov(dst.high(), dst.low());
-      } else if (dst.is(src0)) {
-        // dst is src0, so narrow src0 first.
-        __ vqmovn(NeonU16, dst.low(), src0);
-        __ vqmovn(NeonU16, dst.high(), src1);
-      } else {
-        // dst may alias src1, so narrow src1 first.
-        __ vqmovn(NeonU16, dst.high(), src1);
-        __ vqmovn(NeonU16, dst.low(), src0);
-      }
+    case kArmI16x8UConvertI32x4:
+      ASSEMBLE_NEON_NARROWING_OP(NeonU16);
       break;
-    }
     case kArmI16x8AddSaturateU: {
       __ vqadd(NeonU16, i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputSimd128Register(1));
@@ -1986,25 +2013,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputInt3(1));
       break;
     }
-    case kArmI8x16SConvertI16x8: {
-      Simd128Register dst = i.OutputSimd128Register(),
-                      src0 = i.InputSimd128Register(0),
-                      src1 = i.InputSimd128Register(1);
-      // Take care not to overwrite a source register before it's used.
-      if (dst.is(src0) && dst.is(src1)) {
-        __ vqmovn(NeonS8, dst.low(), src0);
-        __ vmov(dst.high(), dst.low());
-      } else if (dst.is(src0)) {
-        // dst is src0, so narrow src0 first.
-        __ vqmovn(NeonS8, dst.low(), src0);
-        __ vqmovn(NeonS8, dst.high(), src1);
-      } else {
-        // dst may alias src1, so narrow src1 first.
-        __ vqmovn(NeonS8, dst.high(), src1);
-        __ vqmovn(NeonS8, dst.low(), src0);
-      }
+    case kArmI8x16SConvertI16x8:
+      ASSEMBLE_NEON_NARROWING_OP(NeonS8);
       break;
-    }
     case kArmI8x16Add: {
       __ vadd(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputSimd128Register(1));
@@ -2066,25 +2077,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputInt3(1));
       break;
     }
-    case kArmI8x16UConvertI16x8: {
-      Simd128Register dst = i.OutputSimd128Register(),
-                      src0 = i.InputSimd128Register(0),
-                      src1 = i.InputSimd128Register(1);
-      // Take care not to overwrite a source register before it's used.
-      if (dst.is(src0) && dst.is(src1)) {
-        __ vqmovn(NeonU8, dst.low(), src0);
-        __ vmov(dst.high(), dst.low());
-      } else if (dst.is(src0)) {
-        // dst is src0, so narrow src0 first.
-        __ vqmovn(NeonU8, dst.low(), src0);
-        __ vqmovn(NeonU8, dst.high(), src1);
-      } else {
-        // dst may alias src1, so narrow src1 first.
-        __ vqmovn(NeonU8, dst.high(), src1);
-        __ vqmovn(NeonU8, dst.low(), src0);
-      }
+    case kArmI8x16UConvertI16x8:
+      ASSEMBLE_NEON_NARROWING_OP(NeonU8);
       break;
-    }
     case kArmI8x16AddSaturateU: {
       __ vqadd(NeonU8, i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputSimd128Register(1));