diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.cpp b/llvm/lib/Target/AIE/AIECombinerHelper.cpp
index edf0b4542e06..c506d6bd503d 100644
--- a/llvm/lib/Target/AIE/AIECombinerHelper.cpp
+++ b/llvm/lib/Target/AIE/AIECombinerHelper.cpp
@@ -1874,8 +1874,8 @@ static bool checkExtractSubvectorPrerequisites(const AIEBaseInstrInfo &TII,
       (DstTySize != ScalarRegSize && DstTySize != 2 * ScalarRegSize))
     return false;
 
-  // Currently, we cannot extract vectors of the size less than vector register
-  // size.
+  // Currently, we cannot extract subvectors when the source vector is smaller
+  // than the target's basic vector register size.
   if (SrcTySize < VecRegSize)
     return false;
 
@@ -1944,6 +1944,38 @@ buildExtractSubvector(MachineIRBuilder &B, MachineRegisterInfo &MRI,
   return B.buildInstr(Opc, {DstVecReg}, {NewSrcReg, Cst});
 }
 
+/// Match something like this:
+/// %1:_(<16 x s32>) = COPY $x0
+/// %2:_(<16 x s32>) = COPY $x1
+/// %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1(<16 x s32>), %2(<16 x s32>),
+///                   shufflemask(8, 9, 10, 11, 12, 13, 14, 15)
+/// PseudoRET implicit $lr, implicit %0
+///
+/// To convert to:
+/// %1:_(<16 x s32>) = COPY $x0
+/// %2:_(<8 x s32>), %3:_(<8 x s32>) = G_UNMERGE_VALUES %1(<16 x s32>)
+/// PseudoRET implicit $lr, implicit %3(<8 x s32>)
+static bool matchShuffleToUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                  BuildFnTy &MatchInfo, unsigned SubIdx,
+                                  unsigned NumSubVectors) {
+  const Register DstReg = MI.getOperand(0).getReg();
+  const Register Src1Reg = MI.getOperand(1).getReg();
+  const LLT DstTy = MRI.getType(DstReg);
+
+  // TODO: Select into G_EXTRACT_SUBVECTOR once it is more widely supported.
+  MatchInfo = [=, &MRI](MachineIRBuilder &B) {
+    SmallVector<Register> SubVecs;
+    for (unsigned I = 0; I < NumSubVectors; I++) {
+      if (I == SubIdx)
+        SubVecs.push_back(DstReg);
+      else
+        SubVecs.push_back(MRI.createGenericVirtualRegister(DstTy));
+    }
+    B.buildUnmerge(SubVecs, Src1Reg);
+  };
+  return true;
+}
+
 /// Match something like this:
 /// %1:_(<16 x s16>) = COPY $wl0
 /// %2:_(<16 x s16>) = COPY $wl1
 ///
@@ -1954,6 +1986,81 @@ buildExtractSubvector(MachineIRBuilder &B, MachineRegisterInfo &MRI,
 /// %1:_(<16 x s16>) = COPY $wl0
 /// %2:_(s32) = G_CONSTANT i32 1
 /// %3:_(<4 x s16>) = G_AIE_EXTRACT_SUBVECTOR %1(<16 x s16>), %2(s32)
+/// NOTE: This combine only supports 32- and 64-bit destination vectors.
+static bool matchShuffleToAIEExtractSubvec(
+    MachineInstr &MI, MachineRegisterInfo &MRI, const AIEBaseInstrInfo &TII,
+    BuildFnTy &MatchInfo, unsigned SubIdx, unsigned NumSubVectors) {
+  const unsigned GPRSize = TII.getScalarRegSize();
+  const unsigned ExtractSubvecNativeSrcSize = TII.getBasicVectorBitSize();
+
+  const Register DstReg = MI.getOperand(0).getReg();
+  const Register Src1Reg = MI.getOperand(1).getReg();
+
+  const LLT DstTy = MRI.getType(DstReg);
+  const LLT Src1Ty = MRI.getType(Src1Reg);
+  const unsigned Src1TySize = Src1Ty.getSizeInBits();
+
+  if (!checkExtractSubvectorPrerequisites(TII, DstTy, Src1Ty))
+    return false;
+
+  const unsigned Opc = TII.getGenericExtractSubvectorOpcode();
+
+  // Natively supported source vector type.
+  if (Src1TySize == ExtractSubvecNativeSrcSize) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      auto Cst = B.buildConstant(LLT::scalar(GPRSize), SubIdx);
+      B.buildInstr(Opc, {DstReg}, {Src1Reg, Cst});
+    };
+
+    return true;
+  }
+
+  // Source vectors of a non-native size are converted to vectors of the
+  // native size.
+  const unsigned Src1ElmtSize = Src1Ty.getElementType().getSizeInBits();
+  const unsigned NumNativeElems = ExtractSubvecNativeSrcSize / Src1ElmtSize;
+  const LLT NewSrc1Ty = LLT::fixed_vector(NumNativeElems, Src1ElmtSize);
+  const Register NewSrcReg = MRI.createGenericVirtualRegister(NewSrc1Ty);
+
+  if (Src1TySize < ExtractSubvecNativeSrcSize) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      const Register ImplicitDef = B.buildUndef(Src1Ty).getReg(0);
+      SmallVector<Register> ConcatOps = {Src1Reg};
+      unsigned NumImplicitDef = ExtractSubvecNativeSrcSize / Src1TySize - 1;
+      while (NumImplicitDef-- > 0) {
+        ConcatOps.push_back(ImplicitDef);
+      }
+      B.buildConcatVectors({NewSrcReg}, ConcatOps);
+      auto Cst = B.buildConstant(LLT::scalar(GPRSize), SubIdx);
+      B.buildInstr(Opc, {DstReg}, {NewSrcReg, Cst});
+    };
+    return true;
+  }
+
+  // Source vectors larger than the native size are first split into
+  // native-size pieces. E.g. extracting <8 x s32> subvector 3 from <32 x s32>
+  // with a 512-bit native size: SizeCoefficient = 2, so the source is
+  // unmerged into two <16 x s32> halves, and subvector 3 % 2 = 1 is extracted
+  // from half 3 / 2 = 1.
+  MatchInfo = [=, &MRI](MachineIRBuilder &B) {
+    const unsigned SizeCoefficient = Src1TySize / ExtractSubvecNativeSrcSize;
+    const unsigned NumSubVectorsNativeSize = NumSubVectors / SizeCoefficient;
+    unsigned NewSubIdx = SubIdx % NumSubVectorsNativeSize;
+
+    SmallVector<Register> SubRegs;
+    unsigned NewSrcRegPosition = SubIdx / NumSubVectorsNativeSize;
+    for (unsigned I = 0; I < SizeCoefficient; ++I) {
+      if (I == NewSrcRegPosition)
+        SubRegs.push_back(NewSrcReg);
+      else
+        SubRegs.push_back(MRI.createGenericVirtualRegister(NewSrc1Ty));
+    }
+
+    B.buildUnmerge(SubRegs, Src1Reg);
+    auto Cst = B.buildConstant(LLT::scalar(GPRSize), NewSubIdx);
+    B.buildInstr(Opc, {DstReg}, {NewSrcReg, Cst});
+  };
+  return true;
+}
+
+/// Checks the common preconditions, then tries matchShuffleToAIEExtractSubvec
+/// and matchShuffleToUnmerge, which rewrite the shuffle as a subvector
+/// extraction where possible.
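+/// E.g. a <8 x s32> result taken from lanes 8..15 of a <16 x s32> source is
+/// SubIdx 1 of NumSubVectors 2; its 256-bit destination is wider than two
+/// scalar registers, so the AIE extract path bails out and the unmerge path
+/// handles it (see shuffle_vector_unmerge_hi_256 in the MIR tests below).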
 bool llvm::matchShuffleToExtractSubvec(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        const AIEBaseInstrInfo &TII,
@@ -1962,45 +2069,60 @@ bool llvm::matchShuffleToExtractSubvec(MachineInstr &MI,
   const Register DstReg = MI.getOperand(0).getReg();
   const Register Src1Reg = MI.getOperand(1).getReg();
-  const ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
   const LLT DstTy = MRI.getType(DstReg);
   const LLT Src1Ty = MRI.getType(Src1Reg);
+  const unsigned Src1TySize = Src1Ty.getSizeInBits();
 
-  if (!checkExtractSubvectorPrerequisites(TII, DstTy, Src1Ty))
+  if (!DstTy.isVector() || !Src1Ty.isVector())
+    return false;
+
+  // Boolean vectors are unlikely to select into a subregister copy.
+  if (DstTy.getElementType() == LLT::scalar(1))
+    return false;
+
+  // This should be handled by a separate combine that copies Src1Reg to
+  // DstReg.
+  if (Src1TySize == DstTy.getSizeInBits())
     return false;
 
   const unsigned NumDstElems = DstTy.getNumElements();
   const unsigned NumSrc1Elems = Src1Ty.getNumElements();
-  const unsigned NumSubVectors = NumSrc1Elems / NumDstElems;
 
   // Not an extract pattern
-  if (NumSrc1Elems < NumDstElems)
+  if (NumSrc1Elems <= NumDstElems)
     return false;
 
   // Unlikely to select into a subregister copy
   if (NumSrc1Elems % NumDstElems != 0)
     return false;
 
-  auto GetSubIdx = [=, &Mask]() -> std::optional<unsigned> {
+  const unsigned NumSubVectors = NumSrc1Elems / NumDstElems;
+  auto GetSubvecExtractIdx = [=, &Mask]() -> std::optional<unsigned> {
     for (unsigned SubVecIdx = 0; SubVecIdx < NumSubVectors; ++SubVecIdx) {
       if (checkSequentialMask(Mask, SubVecIdx * NumDstElems, NumDstElems)) {
         return SubVecIdx;
       }
     }
+
     return std::nullopt;
   };
 
-  std::optional<unsigned> SubIdx = GetSubIdx();
+  std::optional<unsigned> SubvecExtractIdx = GetSubvecExtractIdx();
 
   // Not an extract pattern
-  if (!SubIdx)
+  if (!SubvecExtractIdx)
     return false;
 
-  MatchInfo = [=, &MRI, &TII](MachineIRBuilder &B) {
-    buildExtractSubvector(B, MRI, TII, DstReg, Src1Reg, SubIdx.value());
-  };
-  return true;
+  if (matchShuffleToAIEExtractSubvec(MI, MRI, TII, MatchInfo,
+                                     SubvecExtractIdx.value(), NumSubVectors))
+    return true;
+  if (matchShuffleToUnmerge(MI, MRI, MatchInfo, SubvecExtractIdx.value(),
+                            NumSubVectors))
+    return true;
+
+  return false;
 }
 
 /// Match something like this:
diff --git a/llvm/test/CodeGen/AIE/GlobalISel/prelegalizercombiner-shuffle-vector.mir b/llvm/test/CodeGen/AIE/GlobalISel/prelegalizercombiner-shuffle-vector.mir
index e71e86a846ca..52d31cadddb1 100644
--- a/llvm/test/CodeGen/AIE/GlobalISel/prelegalizercombiner-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AIE/GlobalISel/prelegalizercombiner-shuffle-vector.mir
@@ -784,3 +784,147 @@ body: |
     %0:_(<64 x s32>) = G_SHUFFLE_VECTOR %1(<64 x s32>), %2(<64 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)
     PseudoRET implicit $lr, implicit %0
...
+
+# Test G_SHUFFLE_VECTOR to UNMERGE
+# Note: where the low half of a two-way split is taken, the resulting
+# G_UNMERGE_VALUES is itself combined to G_AIE_UNPAD_VECTOR (see the first
+# test below).
+---
+name: shuffle_vector_unmerge_lo
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: shuffle_vector_unmerge_lo
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0
+    ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_UNPAD_VECTOR]](<8 x s32>)
+    %1:_(<16 x s32>) = COPY $x0
+    %2:_(<16 x s32>) = COPY $x1
+    %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1(<16 x s32>), %2(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7)
+    PseudoRET implicit $lr, implicit %0
+...
+---
+name: shuffle_vector_unmerge_lo_4_unmerge_outputs
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $y0, $y1
+    ; CHECK-LABEL: name: shuffle_vector_unmerge_lo_4_unmerge_outputs
+    ; CHECK: liveins: $y0, $y1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y0
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>), [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<8 x s32>)
+    %1:_(<32 x s32>) = COPY $y0
+    %2:_(<32 x s32>) = COPY $y1
+    %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1(<32 x s32>), %2(<32 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7)
+    PseudoRET implicit $lr, implicit %0
+...
+---
+name: shuffle_vector_unmerge_hi_128
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $wl0, $wl1
+    ; CHECK-LABEL: name: shuffle_vector_unmerge_hi_128
+    ; CHECK: liveins: $wl0, $wl1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[COPY]](<8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<4 x s32>)
+    %1:_(<8 x s32>) = COPY $wl0
+    %2:_(<8 x s32>) = COPY $wl1
+    %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1(<8 x s32>), %2(<8 x s32>), shufflemask(4, 5, 6, 7)
+    PseudoRET implicit $lr, implicit %0
+...
+---
+name: shuffle_vector_unmerge_hi_256
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: shuffle_vector_unmerge_hi_256
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<8 x s32>)
+    %1:_(<16 x s32>) = COPY $x0
+    %2:_(<16 x s32>) = COPY $x1
+    %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1(<16 x s32>), %2(<16 x s32>), shufflemask(8, 9, 10, 11, 12, 13, 14, 15)
+    PseudoRET implicit $lr, implicit %0
+...
+---
+name: shuffle_vector_unmerge_hi_512
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $y0, $y1
+    ; CHECK-LABEL: name: shuffle_vector_unmerge_hi_512
+    ; CHECK: liveins: $y0, $y1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y0
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<16 x s32>)
+    %1:_(<32 x s32>) = COPY $y0
+    %2:_(<32 x s32>) = COPY $y1
+    %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1(<32 x s32>), %2(<32 x s32>), shufflemask(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)
+    PseudoRET implicit $lr, implicit %0
+...
+---
+name: shuffle_vector_unmerge_hi_1024
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $dm0, $dm1
+    ; CHECK-LABEL: name: shuffle_vector_unmerge_hi_1024
+    ; CHECK: liveins: $dm0, $dm1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s64>) = COPY $dm0
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s64>), [[UV1:%[0-9]+]]:_(<16 x s64>) = G_UNMERGE_VALUES [[COPY]](<32 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<16 x s64>)
+    %1:_(<32 x s64>) = COPY $dm0
+    %2:_(<32 x s64>) = COPY $dm1
+    %0:_(<16 x s64>) = G_SHUFFLE_VECTOR %1(<32 x s64>), %2(<32 x s64>), shufflemask(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)
+    PseudoRET implicit $lr, implicit %0
+...
+---
+name: shuffle_vector_subreg_boundary_cross_invalid
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: shuffle_vector_subreg_boundary_cross_invalid
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<16 x s32>), [[COPY1]], shufflemask(4, 5, 6, 7, 8, 9, 10, 11)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[SHUF]](<8 x s32>)
+    %1:_(<16 x s32>) = COPY $x0
+    %2:_(<16 x s32>) = COPY $x1
+    %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1(<16 x s32>), %2(<16 x s32>), shufflemask(4, 5, 6, 7, 8, 9, 10, 11)
+    PseudoRET implicit $lr, implicit %0
+...
+# Note: this is currently combined to G_AIE_VSEL; ideally it would combine to
+# a plain COPY, but that combine is not implemented yet.
+---
+name: shuffle_vector_to_copy
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: shuffle_vector_to_copy
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[AIE_VSEL:%[0-9]+]]:_(<16 x s32>) = G_AIE_VSEL [[COPY]], [[COPY1]], [[C]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_VSEL]](<16 x s32>)
+    %1:_(<16 x s32>) = COPY $x0
+    %2:_(<16 x s32>) = COPY $x1
+    %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1(<16 x s32>), %2(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+    PseudoRET implicit $lr, implicit %0
+...
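A note for readers following the combine logic: both rewrite paths fire only when the shuffle mask selects one aligned, contiguous slice of the first source vector. The sketch below is a standalone illustration of that rule; the function name getExtractedSubvectorIndex is invented here, and the in-tree code uses the checkSequentialMask helper over MIR types, whose handling of undef (-1) lanes may differ.

#include <optional>
#include <vector>

// Illustrative only: returns SubIdx such that Mask selects lanes
// [SubIdx * N, SubIdx * N + N) of a NumSrcElems-wide source, where N is the
// destination element count (Mask.size()); returns std::nullopt otherwise.
static std::optional<unsigned>
getExtractedSubvectorIndex(const std::vector<int> &Mask, unsigned NumSrcElems) {
  const unsigned NumDstElems = Mask.size();
  // The destination must be a strictly smaller, evenly dividing slice.
  if (NumDstElems == 0 || NumSrcElems <= NumDstElems ||
      NumSrcElems % NumDstElems != 0)
    return std::nullopt;
  for (unsigned SubIdx = 0; SubIdx < NumSrcElems / NumDstElems; ++SubIdx) {
    bool Sequential = true;
    for (unsigned I = 0; I < NumDstElems && Sequential; ++I)
      Sequential = Mask[I] == static_cast<int>(SubIdx * NumDstElems + I);
    if (Sequential)
      return SubIdx;
  }
  return std::nullopt;
}

Under this rule, shufflemask(8, 9, 10, 11, 12, 13, 14, 15) over a 16-element source yields SubIdx 1, matching shuffle_vector_unmerge_hi_256 above, while shufflemask(4, 5, 6, 7, 8, 9, 10, 11) crosses a slice boundary and is rejected, matching shuffle_vector_subreg_boundary_cross_invalid.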
diff --git a/llvm/test/CodeGen/AIE/aie2p/shufflevec.ll b/llvm/test/CodeGen/AIE/aie2p/shufflevec.ll
index 234b257ae9f3..bf0176dbde2b 100644
--- a/llvm/test/CodeGen/AIE/aie2p/shufflevec.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/shufflevec.ll
@@ -423,3 +423,101 @@ entry:
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
   ret <16 x i16> %shuffle
 }
+
+; Test G_SHUFFLE_VECTOR to UNMERGE
+
+define <8 x i32> @test_shuffle_vector_vector_unmerge_lo(<16 x i32> noundef %a, <16 x i32> noundef %b) {
+; CHECK-LABEL: test_shuffle_vector_vector_unmerge_lo:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov x0, x2 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @test_shuffle_vector_vector_unmerge_lo_4_unmerge_outputs(<32 x i32> noundef %a, <32 x i32> noundef %b) {
+; CHECK-LABEL: test_shuffle_vector_vector_unmerge_lo_4_unmerge_outputs:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov x0, x4 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %shuffle = shufflevector <32 x i32> %a, <32 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <4 x i32> @test_shuffle_vector_vector_unmerge_hi_128(<8 x i32> noundef %a, <8 x i32> noundef %b) {
+; CHECK-LABEL: test_shuffle_vector_vector_unmerge_hi_128:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv
+; CHECK-NEXT: nopx // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: mova r0, #16 // Delay Slot 3
+; CHECK-NEXT: vshift x0, x2, x0, r0 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <8 x i32> @test_shuffle_vector_vector_unmerge_hi_256(<16 x i32> noundef %a, <16 x i32> noundef %b) {
+; CHECK-LABEL: test_shuffle_vector_vector_unmerge_hi_256:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov wl0, wh2 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+define <16 x i32> @test_shuffle_vector_vector_unmerge_hi_512(<32 x i32> noundef %a, <32 x i32> noundef %b) {
+; CHECK-LABEL: test_shuffle_vector_vector_unmerge_hi_512:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov x0, x5 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %shuffle = shufflevector <32 x i32> %a, <32 x i32> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i32> %shuffle
+}
+
+define <32 x i32> @test_shuffle_vector_vector_unmerge_lo_1024(<64 x i32> noundef %a, <64 x i32> noundef %b) {
+; CHECK-LABEL: test_shuffle_vector_vector_unmerge_lo_1024:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: nopa ; nopb ; nopx ; mov p0, sp; nops
+; CHECK-NEXT: padda [p0], #-256
+; CHECK-NEXT: vlda bmll0, [p0, #0]
+; CHECK-NEXT: vlda bmlh0, [p0, #64]
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: vmov x4, bmll0 // Delay Slot 3
+; CHECK-NEXT: vmov x5, bmlh0 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+  %shuffle = shufflevector <64 x i32> %a, <64 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i32> %shuffle
+}
diff --git a/llvm/test/CodeGen/AIE/aie2p/vmac.ll b/llvm/test/CodeGen/AIE/aie2p/vmac.ll
index d612845f6008..3ae7f104cb79 100644
--- a/llvm/test/CodeGen/AIE/aie2p/vmac.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/vmac.ll
@@ -105,116 +105,26 @@ define dso_local inreg noundef <64 x i32> @_Z27test_addmac_4x16_16x16_confDv64_h
 ; CHECK-LABEL: _Z27test_addmac_4x16_16x16_confDv64_hiDv128_DB8_iDv64_u7__acc32S2_iiiii:
 ; CHECK: .p2align 4
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: nops ; mov crunpacksize, #0
-; CHECK-NEXT: paddxm [sp], #64
-; CHECK-NEXT: vextract.32 r25, x4, #3, vaddsign1
-; CHECK-NEXT: vextract.32 r26, x4, #4, vaddsign1
-; CHECK-NEXT: vextract.32 r27, x4, #5, vaddsign1
-; CHECK-NEXT: vextract.32 r28, x4, #6, vaddsign1
-; CHECK-NEXT: vextract.32 r29, x4, #7, vaddsign1
-; CHECK-NEXT: vextract.32 r30, x4, #8, vaddsign1
-; CHECK-NEXT: mova r7, #50; vextract.32 r31, x4, #9, vaddsign1
-; CHECK-NEXT: vshuffle x2, x0, x0, r7
-; CHECK-NEXT: vextract.32 r8, x4, #10, vaddsign1
-; CHECK-NEXT: st r8, [sp, #-48]; vextract.32 r9, x4, #11, vaddsign1 // 4-byte Folded Spill
-; CHECK-NEXT: st r9, [sp, #-52]; vextract.32 r10, x4, #12, vaddsign1 // 4-byte Folded Spill
-; CHECK-NEXT: st r10, [sp, #-56]; vextract.32 r11, x4, #13, vaddsign1 // 4-byte Folded Spill
-; CHECK-NEXT: st r11, [sp, #-60]; vextract.32 r12, x4, #14, vaddsign1 // 4-byte Folded Spill
-; CHECK-NEXT: st r12, [sp, #-64]; vextract.32 r18, x2, #1, vaddsign1 // 4-byte Folded Spill
-; CHECK-NEXT: vextract.32 r19, x2, #2, vaddsign1
-; CHECK-NEXT: vextract.32 r20, x2, #3, vaddsign1
-; CHECK-NEXT: vextract.32 r21, x2, #4, vaddsign1
-; CHECK-NEXT: vextract.32 r22, x2, #5, vaddsign1
-; CHECK-NEXT: vextract.32 r16, x2, #6, vaddsign1
-; CHECK-NEXT: vextract.32 r7, x2, #7, vaddsign1
-; CHECK-NEXT: vextract.32 r23, x2, #9, vaddsign1
-; CHECK-NEXT: vextract.32 r17, x2, #0, vaddsign1
-; CHECK-NEXT: vextract.32 r24, x2, #10, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x0, x0, r17
-; CHECK-NEXT: vextract.32 r17, x2, #15, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x0, x0, r18
-; CHECK-NEXT: vextract.32 r18, x2, #14, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x0, x0, r19
-; CHECK-NEXT: vextract.32 r19, x2, #13, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x0, x0, r20
-; CHECK-NEXT: vextract.32 r20, x2, #12, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x0, x0, r21
-; CHECK-NEXT: vextract.32 r21, x2, #11, vaddsign1
-; CHECK-NEXT: vextract.32 r22, x2, #8, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x0, x0, r22
-; CHECK-NEXT: vpush.hi.32 x2, x0, r22
-; CHECK-NEXT: vextract.32 r22, x4, #0, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x2, x2, r23
-; CHECK-NEXT: vextract.32 r23, x4, #1, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x2, x2, r24
-; CHECK-NEXT: vpush.hi.32 x6, x0, r22
-; CHECK-NEXT: vextract.32 r24, x4, #2, vaddsign1
-; CHECK-NEXT: vextract.32 r22, x4, #15, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x4, x6, r23
-; CHECK-NEXT: vpush.hi.32 x2, x2, r21
-; CHECK-NEXT: vextract.32 r23, x5, #1, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x4, x4, r24
-; CHECK-NEXT: vpush.hi.32 x2, x2, r20
-; CHECK-NEXT: vextract.32 r24, x5, #2, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x4, x4, r25
-; CHECK-NEXT: vpush.hi.32 x2, x2, r19
-; CHECK-NEXT: vextract.32 r25, x5, #3, vaddsign1
-; CHECK-NEXT: vpush.hi.32 x4,
x4, r26 -; CHECK-NEXT: vpush.hi.32 x2, x2, r18 -; CHECK-NEXT: vextract.32 r26, x5, #4, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r27 -; CHECK-NEXT: vpush.hi.32 x2, x2, r17 -; CHECK-NEXT: vextract.32 r27, x5, #5, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r28 -; CHECK-NEXT: vextract.32 r28, x5, #6, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r29 -; CHECK-NEXT: vextract.32 r29, x5, #7, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r30 -; CHECK-NEXT: vextract.32 r30, x5, #8, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r31 -; CHECK-NEXT: vextract.32 r31, x5, #9, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r8 -; CHECK-NEXT: vextract.32 r8, x5, #10, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r9 -; CHECK-NEXT: vextract.32 r9, x5, #11, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r10 -; CHECK-NEXT: vextract.32 r10, x5, #12, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r11 -; CHECK-NEXT: vextract.32 r11, x5, #13, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r12 -; CHECK-NEXT: vextract.32 r12, x5, #14, vaddsign1 -; CHECK-NEXT: vextract.32 r22, x5, #0, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r22 -; CHECK-NEXT: vpush.hi.32 x6, x0, r22 -; CHECK-NEXT: vpush.hi.32 x0, x0, r16 -; CHECK-NEXT: vextract.32 r22, x5, #15, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x6, x6, r23 -; CHECK-NEXT: nez r23, r1; vpush.hi.32 x0, x0, r7 -; CHECK-NEXT: mov unpacksign0, r23 -; CHECK-NEXT: vpush.hi.32 x6, x6, r24 -; CHECK-NEXT: vmov wl0, wh2 -; CHECK-NEXT: vpush.hi.32 x6, x6, r25 -; CHECK-NEXT: vpush.hi.32 x6, x6, r26 -; CHECK-NEXT: mova r23, #11; vpush.hi.32 x6, x6, r27 -; CHECK-NEXT: mova r24, #10; lshl r4, r4, r23; vpush.hi.32 x6, x6, r28 -; CHECK-NEXT: mova r24, #12; lshl r3, r3, r24; vpush.hi.32 x6, x6, r29 -; CHECK-NEXT: mova r23, #9; lshl r5, r5, r24; vpush.hi.32 x6, x6, r30 -; CHECK-NEXT: mova r24, #8; lshl r0, r0, r23; vpush.hi.32 x6, x6, r31 -; CHECK-NEXT: lshl r1, r1, r24; vpush.hi.32 x6, x6, r8 -; CHECK-NEXT: vunpack y3, x6, unpacksign0; or r0, r1, r0; vpush.hi.32 x6, x6, r9 -; CHECK-NEXT: or r1, r3, r2; vpush.hi.32 x6, x6, r10 -; CHECK-NEXT: mova r3, #13; vunpack y2, x4, unpacksign0; or r1, r1, r4; vpush.hi.32 x6, x6, r11 -; CHECK-NEXT: lshl r3, r6, r3; vpush.hi.32 x6, x6, r12 -; CHECK-NEXT: lda r8, [sp, #-48]; or r1, r1, r5; vpush.hi.32 x6, x6, r22 // 4-byte Folded Reload -; CHECK-NEXT: lda r9, [sp, #-52]; or r1, r1, r0; mov r2, #200 // 4-byte Folded Reload -; CHECK-NEXT: lda r10, [sp, #-56]; or r1, r1, r2 // 4-byte Folded Reload -; CHECK-NEXT: lda r11, [sp, #-60]; or r0, r0, r3; vmov wl0, wh0; vmac dm0, dm1, x0, y3,r1 // 4-byte Folded Reload -; CHECK-NEXT: lda r12, [sp, #-64]; or r0, r0, r2 // 4-byte Folded Reload -; CHECK-NEXT: ret lr; vaddmac dm0, dm0, dm2, x0, y2,r0 +; CHECK-NEXT: mova r17, #11; nopb ; nops ; nez r16, r1; nopm ; nopv +; CHECK-NEXT: mova r7, #10; nopb ; lshl r4, r4, r17 +; CHECK-NEXT: mova r7, #12; lshl r3, r3, r7 +; CHECK-NEXT: mova r17, #9; lshl r5, r5, r7 +; CHECK-NEXT: mova r7, #8; lshl r0, r0, r17 +; CHECK-NEXT: vunpack y3, x5, unpacksign0; lshl r1, r1, r7 +; CHECK-NEXT: vunpack y1, x4, unpacksign0; or r0, r1, r0 +; CHECK-NEXT: or r1, r3, r2; mov crunpacksize, #0 +; CHECK-NEXT: or r1, r1, r4; mov unpacksign0, r16 +; CHECK-NEXT: mova r4, #13; or r1, r1, r5; mov r2, #50 +; CHECK-NEXT: lshl r2, r6, r4; vshuffle x0, x0, x0, r2 +; CHECK-NEXT: mova r3, #200; or r1, r1, r0; vmov wl4, wh0 +; CHECK-NEXT: or r1, r1, r3 +; CHECK-NEXT: or r0, r0, r2; vmac dm0, dm1, x4, y3,r1 +; CHECK-NEXT: or r0, r0, r3 +; CHECK-NEXT: ret lr; vaddmac dm0, dm0, dm2, x0, y1,r0 ; CHECK-NEXT: nop // Delay Slot 
5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: paddxm [sp], #-64 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: mov unpacksign0, #0 // Delay Slot 1 entry: %0 = bitcast <64 x i8> %a to <16 x i32> @@ -259,116 +169,26 @@ define dso_local inreg noundef <64 x i32> @_Z27test_addmsc_4x16_16x16_confDv64_h ; CHECK-LABEL: _Z27test_addmsc_4x16_16x16_confDv64_hiDv128_DB8_iDv64_u7__acc32S2_iiiii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nops ; mov crunpacksize, #0 -; CHECK-NEXT: paddxm [sp], #64 -; CHECK-NEXT: vextract.32 r25, x4, #3, vaddsign1 -; CHECK-NEXT: vextract.32 r26, x4, #4, vaddsign1 -; CHECK-NEXT: vextract.32 r27, x4, #5, vaddsign1 -; CHECK-NEXT: vextract.32 r28, x4, #6, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, #7, vaddsign1 -; CHECK-NEXT: vextract.32 r30, x4, #8, vaddsign1 -; CHECK-NEXT: mova r7, #50; vextract.32 r31, x4, #9, vaddsign1 -; CHECK-NEXT: vshuffle x2, x0, x0, r7 -; CHECK-NEXT: vextract.32 r8, x4, #10, vaddsign1 -; CHECK-NEXT: st r8, [sp, #-48]; vextract.32 r9, x4, #11, vaddsign1 // 4-byte Folded Spill -; CHECK-NEXT: st r9, [sp, #-52]; vextract.32 r10, x4, #12, vaddsign1 // 4-byte Folded Spill -; CHECK-NEXT: st r10, [sp, #-56]; vextract.32 r11, x4, #13, vaddsign1 // 4-byte Folded Spill -; CHECK-NEXT: st r11, [sp, #-60]; vextract.32 r12, x4, #14, vaddsign1 // 4-byte Folded Spill -; CHECK-NEXT: st r12, [sp, #-64]; vextract.32 r18, x2, #1, vaddsign1 // 4-byte Folded Spill -; CHECK-NEXT: vextract.32 r19, x2, #2, vaddsign1 -; CHECK-NEXT: vextract.32 r20, x2, #3, vaddsign1 -; CHECK-NEXT: vextract.32 r21, x2, #4, vaddsign1 -; CHECK-NEXT: vextract.32 r22, x2, #5, vaddsign1 -; CHECK-NEXT: vextract.32 r16, x2, #6, vaddsign1 -; CHECK-NEXT: vextract.32 r7, x2, #7, vaddsign1 -; CHECK-NEXT: vextract.32 r23, x2, #9, vaddsign1 -; CHECK-NEXT: vextract.32 r17, x2, #0, vaddsign1 -; CHECK-NEXT: vextract.32 r24, x2, #10, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x0, x0, r17 -; CHECK-NEXT: vextract.32 r17, x2, #15, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x0, x0, r18 -; CHECK-NEXT: vextract.32 r18, x2, #14, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x0, x0, r19 -; CHECK-NEXT: vextract.32 r19, x2, #13, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x0, x0, r20 -; CHECK-NEXT: vextract.32 r20, x2, #12, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x0, x0, r21 -; CHECK-NEXT: vextract.32 r21, x2, #11, vaddsign1 -; CHECK-NEXT: vextract.32 r22, x2, #8, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x0, x0, r22 -; CHECK-NEXT: vpush.hi.32 x2, x0, r22 -; CHECK-NEXT: vextract.32 r22, x4, #0, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x2, x2, r23 -; CHECK-NEXT: vextract.32 r23, x4, #1, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x2, x2, r24 -; CHECK-NEXT: vpush.hi.32 x6, x0, r22 -; CHECK-NEXT: vextract.32 r24, x4, #2, vaddsign1 -; CHECK-NEXT: vextract.32 r22, x4, #15, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x6, r23 -; CHECK-NEXT: vpush.hi.32 x2, x2, r21 -; CHECK-NEXT: vextract.32 r23, x5, #1, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r24 -; CHECK-NEXT: vpush.hi.32 x2, x2, r20 -; CHECK-NEXT: vextract.32 r24, x5, #2, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r25 -; CHECK-NEXT: vpush.hi.32 x2, x2, r19 -; CHECK-NEXT: vextract.32 r25, x5, #3, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r26 -; CHECK-NEXT: vpush.hi.32 x2, x2, r18 -; CHECK-NEXT: vextract.32 r26, x5, #4, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r27 -; CHECK-NEXT: vpush.hi.32 x2, x2, r17 -; CHECK-NEXT: vextract.32 r27, x5, #5, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r28 -; CHECK-NEXT: vextract.32 r28, x5, 
#6, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r29 -; CHECK-NEXT: vextract.32 r29, x5, #7, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r30 -; CHECK-NEXT: vextract.32 r30, x5, #8, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r31 -; CHECK-NEXT: vextract.32 r31, x5, #9, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r8 -; CHECK-NEXT: vextract.32 r8, x5, #10, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r9 -; CHECK-NEXT: vextract.32 r9, x5, #11, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r10 -; CHECK-NEXT: vextract.32 r10, x5, #12, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r11 -; CHECK-NEXT: vextract.32 r11, x5, #13, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r12 -; CHECK-NEXT: vextract.32 r12, x5, #14, vaddsign1 -; CHECK-NEXT: vextract.32 r22, x5, #0, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x4, r22 -; CHECK-NEXT: vpush.hi.32 x6, x0, r22 -; CHECK-NEXT: vpush.hi.32 x0, x0, r16 -; CHECK-NEXT: vextract.32 r22, x5, #15, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x6, x6, r23 -; CHECK-NEXT: nez r23, r1; vpush.hi.32 x0, x0, r7 -; CHECK-NEXT: mov unpacksign0, r23 -; CHECK-NEXT: vpush.hi.32 x6, x6, r24 -; CHECK-NEXT: vmov wl0, wh2 -; CHECK-NEXT: vpush.hi.32 x6, x6, r25 -; CHECK-NEXT: vpush.hi.32 x6, x6, r26 -; CHECK-NEXT: mova r23, #11; vpush.hi.32 x6, x6, r27 -; CHECK-NEXT: mova r24, #10; lshl r4, r4, r23; vpush.hi.32 x6, x6, r28 -; CHECK-NEXT: mova r24, #12; lshl r3, r3, r24; vpush.hi.32 x6, x6, r29 -; CHECK-NEXT: mova r23, #9; lshl r5, r5, r24; vpush.hi.32 x6, x6, r30 -; CHECK-NEXT: mova r24, #8; lshl r0, r0, r23; vpush.hi.32 x6, x6, r31 -; CHECK-NEXT: lshl r1, r1, r24; vpush.hi.32 x6, x6, r8 -; CHECK-NEXT: vunpack y3, x6, unpacksign0; or r0, r1, r0; vpush.hi.32 x6, x6, r9 -; CHECK-NEXT: or r1, r3, r2; vpush.hi.32 x6, x6, r10 -; CHECK-NEXT: mova r3, #13; vunpack y2, x4, unpacksign0; or r1, r1, r4; vpush.hi.32 x6, x6, r11 -; CHECK-NEXT: lshl r3, r6, r3; vpush.hi.32 x6, x6, r12 -; CHECK-NEXT: lda r8, [sp, #-48]; or r1, r1, r5; vpush.hi.32 x6, x6, r22 // 4-byte Folded Reload -; CHECK-NEXT: lda r9, [sp, #-52]; or r1, r1, r0; mov r2, #200 // 4-byte Folded Reload -; CHECK-NEXT: lda r10, [sp, #-56]; or r1, r1, r2 // 4-byte Folded Reload -; CHECK-NEXT: lda r11, [sp, #-60]; or r0, r0, r3; vmov wl0, wh0; vmsc dm0, dm1, x0, y3,r1 // 4-byte Folded Reload -; CHECK-NEXT: lda r12, [sp, #-64]; or r0, r0, r2 // 4-byte Folded Reload -; CHECK-NEXT: ret lr; vaddmsc dm0, dm0, dm2, x0, y2,r0 +; CHECK-NEXT: mova r17, #11; nopb ; nops ; nez r16, r1; nopm ; nopv +; CHECK-NEXT: mova r7, #10; nopb ; lshl r4, r4, r17 +; CHECK-NEXT: mova r7, #12; lshl r3, r3, r7 +; CHECK-NEXT: mova r17, #9; lshl r5, r5, r7 +; CHECK-NEXT: mova r7, #8; lshl r0, r0, r17 +; CHECK-NEXT: vunpack y3, x5, unpacksign0; lshl r1, r1, r7 +; CHECK-NEXT: vunpack y1, x4, unpacksign0; or r0, r1, r0 +; CHECK-NEXT: or r1, r3, r2; mov crunpacksize, #0 +; CHECK-NEXT: or r1, r1, r4; mov unpacksign0, r16 +; CHECK-NEXT: mova r4, #13; or r1, r1, r5; mov r2, #50 +; CHECK-NEXT: lshl r2, r6, r4; vshuffle x0, x0, x0, r2 +; CHECK-NEXT: mova r3, #200; or r1, r1, r0; vmov wl4, wh0 +; CHECK-NEXT: or r1, r1, r3 +; CHECK-NEXT: or r0, r0, r2; vmsc dm0, dm1, x4, y3,r1 +; CHECK-NEXT: or r0, r0, r3 +; CHECK-NEXT: ret lr; vaddmsc dm0, dm0, dm2, x0, y1,r0 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: paddxm [sp], #-64 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: mov unpacksign0, #0 // Delay Slot 1 entry: %0 = bitcast <64 x i8> %a to <16 x i32>