diff --git a/llvm/lib/Target/AIE/AIE2.h b/llvm/lib/Target/AIE/AIE2.h
index 144c1d86ceab..2dfc00b04157 100644
--- a/llvm/lib/Target/AIE/AIE2.h
+++ b/llvm/lib/Target/AIE/AIE2.h
@@ -34,7 +34,6 @@ class MachineInstr;
 class MachineOperand;
 class PassRegistry;
 
-FunctionPass *createAIE2ISelDag(TargetMachine &TM);
 FunctionPass *createAIE2PreLegalizerCombiner();
 FunctionPass *createAIE2PostLegalizerCustomCombiner();
 FunctionPass *createAIE2PostLegalizerGenericCombiner();
diff --git a/llvm/lib/Target/AIE/AIE2ISelDAGToDAG.cpp b/llvm/lib/Target/AIE/AIE2ISelDAGToDAG.cpp
deleted file mode 100644
index fd6a6e1ea5bb..000000000000
--- a/llvm/lib/Target/AIE/AIE2ISelDAGToDAG.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-//===--AIE2ISelDAGToDAG.cpp -A dag to dag inst selector for AIEngine V2 ---===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an instruction selector for the AIEngine V2 target.
-//
-//===----------------------------------------------------------------------===//
-#include "AIE2Subtarget.h"
-#include "AIEISelDAGToDAG.h"
-#include "MCTargetDesc/AIE2MCTargetDesc.h"
-using namespace llvm;
-// AIEngine V2-specific code to select AIEngine V2 machine instructions for
-// SelectionDAG operations.
-class AIE2DAGToDAGISel : public AIEDAGToDAGISel {
-public:
-  explicit AIE2DAGToDAGISel(TargetMachine &TM) : AIEDAGToDAGISel(TM) {}
-
-  StringRef getPassName() const override {
-    return "AIE2 DAG->DAG Pattern Instruction Selection";
-  }
-
-  void Select(SDNode *Node) override;
-
-  // Complex Pattern Selectors.  Each one corresponds to a
-  // ComplexPattern<> in AIEInstrInfo.td
-  bool SelectFrameIndex(SDValue &N, SDValue &R);
-
-// Include the pieces autogenerated from the target description.
-#include "AIE2GenDAGISel.inc"
-};
-
-void AIE2DAGToDAGISel::Select(SDNode *Node) {
-  // If we have a custom node, we have already selected.
-  if (Node->isMachineOpcode()) {
-    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
-    Node->setNodeId(-1);
-    return;
-  }
-  // Instruction Selection not handled by the auto-generated tablegen selection
-  // should be handled here.
-  unsigned Opcode = Node->getOpcode();
-  SDLoc DL(Node);
-  // EVT VT = Node->getValueType(0);
-// TODO Add code here
-  switch (Opcode) {
-  case ISD::Constant: {
-    break;
-  }
-  }
-  // Select the default instruction.
-  SelectCode(Node);
-}
-
-// Match a frame index that can be used in an addressing mode.
-bool AIE2DAGToDAGISel::SelectFrameIndex(SDValue &N, SDValue &R) {
-  if (N.getOpcode() != ISD::FrameIndex)
-    return false;
-  int FI = cast<FrameIndexSDNode>(N)->getIndex();
-  LLVM_DEBUG(dbgs() << "SelectFrameIndex: " << FI << "\n");
-  R = CurDAG->getTargetFrameIndex(FI, MVT::i32);
-  return true;
-}
-
-// This pass converts a legalized DAG into a AIE-specific DAG, ready
-// for instruction scheduling.
-FunctionPass *llvm::createAIE2ISelDag(TargetMachine &TM) {
-  return new AIE2DAGToDAGISel(TM);
-}
diff --git a/llvm/lib/Target/AIE/AIE2InstrPatterns.td b/llvm/lib/Target/AIE/AIE2InstrPatterns.td
index d6c7db3766b2..bff0201bc062 100644
--- a/llvm/lib/Target/AIE/AIE2InstrPatterns.td
+++ b/llvm/lib/Target/AIE/AIE2InstrPatterns.td
@@ -1068,3 +1068,8 @@ def : PatInaccessibleMem<(int_aie2_clr16f_conf),
 // DIVS
 def : Pat<(int_aie2_divs eR31:$sd_in, eR:$src0, eR:$src1),
           (DIVS eR31:$sd_in, eR:$src0, eR:$src1)>;
+
+// G_AIE_[SZ]EXT_EXTRACT_VECTOR_ELT, index in an eR register: zext selects VEXTRACT_D*, sext selects VEXTRACT_S*.
+defm : Extract_512<i32, v64i8, (i32 eR:$idx), VEXTRACT_D8, VEXTRACT_S8>;
+defm : Extract_512<i32, v32i16, (i32 eR:$idx), VEXTRACT_D16, VEXTRACT_S16>;
+defm : Extract_512<i32, v16i32, (i32 eR:$idx), VEXTRACT_D32, VEXTRACT_S32>;
diff --git a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp
index 75606807153b..a00e41b6c4f3 100644
--- a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp
+++ b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp
@@ -103,8 +103,6 @@ class AIE2InstructionSelector : public AIEBaseInstructionSelector {
   bool selectG_AIE_STORE_CONV(MachineInstr &StoreI, MachineRegisterInfo &MRI);
   bool selectG_AIE_STORE_PACK(MachineInstr &StoreI, MachineRegisterInfo &MRI);
   bool selectStartLoop(MachineInstr &I, MachineRegisterInfo &MRI);
-  bool selectG_AIE_EXTRACT_VECTOR_ELT(MachineInstr &I,
-                                      MachineRegisterInfo &MRI);
   bool selectG_AIE_INSERT_VECTOR_ELT(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectG_AIE_PAD_VECTOR_UNDEF(MachineInstr &I, MachineOperand &DstReg,
                                     MachineOperand &SrcReg,
@@ -452,9 +450,6 @@ bool AIE2InstructionSelector::select(MachineInstr &I) {
   case AIE2::G_AIE_POSTINC_3D_SEXTLOAD:
   case AIE2::G_AIE_POSTINC_3D_ZEXTLOAD:
     return selectG_AIE_LOAD_STORE(I, MRI);
-  case AIE2::G_AIE_ZEXT_EXTRACT_VECTOR_ELT:
-  case AIE2::G_AIE_SEXT_EXTRACT_VECTOR_ELT:
-    return selectG_AIE_EXTRACT_VECTOR_ELT(I, MRI);
   case AIE2::G_AIE_INSERT_VECTOR_ELT:
     return selectG_AIE_INSERT_VECTOR_ELT(I, MRI);
   case AIE2::G_AIE_PAD_VECTOR_UNDEF:
@@ -3755,28 +3750,6 @@ createOpcodeCondRegPair(unsigned EltSize, Register LtReg, MachineIRBuilder &MIB,
   return std::make_pair(Opcode, SelReg);
 }
 
-static unsigned getExtractVecEltOpcode(unsigned EltSize, unsigned InstOpcode) {
-  unsigned Opcode = 0;
-  bool IsZextExtVecElt = InstOpcode == AIE2::G_AIE_ZEXT_EXTRACT_VECTOR_ELT;
-  switch (EltSize) {
-  case 8:
-    Opcode = IsZextExtVecElt ? AIE2::VEXTRACT_D8 : AIE2::VEXTRACT_S8;
-    break;
-  case 16:
-    Opcode = IsZextExtVecElt ? AIE2::VEXTRACT_D16 : AIE2::VEXTRACT_S16;
-    break;
-  case 32:
-    Opcode = IsZextExtVecElt ? AIE2::VEXTRACT_D32 : AIE2::VEXTRACT_S32;
-    break;
-  // there is no AIE vector with elt size 64, VEXTRACT_D64/VEXTRACT_S64 is
-  // selected only when the extracted value is another vector of size 64-bit.
-  default:
-    llvm_unreachable("Unexpected Extracted Vector Element Size");
-  }
-  assert(Opcode != 0 && "Expected a NonZero Opcode");
-  return Opcode;
-}
-
 static unsigned getInsertVecEltOpcode(unsigned EltSize, unsigned InstOpcode) {
   switch (EltSize) {
   case 8:
@@ -3931,24 +3904,6 @@ static SelSrcAndIdx getExtractOrInsertVectorEltInputs(
   return SelSrcIdx;
 }
 
-bool AIE2InstructionSelector::selectG_AIE_EXTRACT_VECTOR_ELT(
-    MachineInstr &I, MachineRegisterInfo &MRI) {
-  MachineOperand &RegOp0 = I.getOperand(1);
-  Register DstReg = I.getOperand(0).getReg();
-  Register SrcReg0 = RegOp0.getReg();
-  LLT SrcVecTy = MRI.getType(SrcReg0);
-  LLT SrcEltTy = SrcVecTy.getElementType();
-  unsigned EltSize = SrcEltTy.getSizeInBits();
-  SelSrcAndIdx SelSrcIdx =
-      getExtractOrInsertVectorEltInputs(I, TRI, MRI, TII, RBI, MIB);
-  unsigned Opcode = getExtractVecEltOpcode(EltSize, I.getOpcode());
-  MachineInstrBuilder MI = MIB.buildInstr(Opcode, {DstReg}, {})
-                               .addReg(SelSrcIdx.SrcReg)
-                               .addReg(SelSrcIdx.IdxReg);
-  I.eraseFromParent();
-  return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
-}
-
 bool AIE2InstructionSelector::selectG_AIE_INSERT_VECTOR_ELT(
     MachineInstr &I, MachineRegisterInfo &MRI) {
   Register DstVecReg = I.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp
index 3402e5eec199..c43a7b095d0c 100644
--- a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp
+++ b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp
@@ -220,15 +220,6 @@ AIE2TargetMachine::getTargetTransformInfo(const Function &F) const {
   return TargetTransformInfo(AIE2TTIImpl(this, F));
 }
 
-bool AIE2PassConfig::addInstSelector() {
-  if (AIEDumpArtifacts)
-    addPass(createMachineFunctionDumperPass(/*Suffix=*/"before-isel"));
-  addPass(createAIE2ISelDag(getAIETargetMachine()));
-  if (AIEDumpArtifacts)
-    addPass(createMachineFunctionDumperPass(/*Suffix=*/"after-isel"));
-  return false;
-}
-
 unsigned
 AIE2TargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
   switch (Kind) {
diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.h b/llvm/lib/Target/AIE/AIE2TargetMachine.h
index a0928f19fef0..ce7ec36389ce 100644
--- a/llvm/lib/Target/AIE/AIE2TargetMachine.h
+++ b/llvm/lib/Target/AIE/AIE2TargetMachine.h
@@ -57,7 +57,6 @@ class AIE2PassConfig : public AIEBasePassConfig {
 
   bool addPreISel() override;
   void addPreEmitPass() override;
-  bool addInstSelector() override;
   bool addGlobalInstructionSelect() override;
   void addPreRegAlloc() override;
   bool addRegAssignAndRewriteOptimized() override;
diff --git a/llvm/lib/Target/AIE/AIEBaseInstrPatterns.td b/llvm/lib/Target/AIE/AIEBaseInstrPatterns.td
index 261366bc279b..3528c412725f 100644
--- a/llvm/lib/Target/AIE/AIEBaseInstrPatterns.td
+++ b/llvm/lib/Target/AIE/AIEBaseInstrPatterns.td
@@ -77,3 +77,21 @@ foreach vec512Ty = [v64i8, v32i16, v16i32] in {
   def : Pat<(vec512Ty (select (i32 eR:$rs1), VEC512:$rs2, VEC512:$rs3)),
             (vec512Ty (VSEL_32 VEC512:$rs2, VEC512:$rs3, (ADD_add_r_ri eR:$rs1, (i32 -1))))>;
 }
+
+// Expose our generic G_AIE_[SZ]EXT_EXTRACT_VECTOR_ELT opcodes to TableGen patterns through SDNode equivalents.
+def vextract_zext : SDNode<"G_AIE_ZEXT_EXTRACT_VECTOR_ELT", // zero-extending extract
+                          SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisVec<1>, SDTCisInt<2>]>>; // int result; operands: vector, int index
+def : GINodeEquiv<G_AIE_ZEXT_EXTRACT_VECTOR_ELT, vextract_zext>; // maps the generic opcode to vextract_zext for pattern import
+
+def vextract_sext : SDNode<"G_AIE_SEXT_EXTRACT_VECTOR_ELT", // sign-extending extract
+                          SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisVec<1>, SDTCisInt<2>]>>; // int result; operands: vector, int index
+def : GINodeEquiv<G_AIE_SEXT_EXTRACT_VECTOR_ELT, vextract_sext>; // maps the generic opcode to vextract_sext for pattern import
+
+class Extr512Pat<ValueType DstTy, ValueType SrcTy, dag Idx, SDNode Op, Instruction Inst> : // one extract-element selection pattern
+    Pat<(DstTy (Op SrcTy:$src1, Idx)), // match: Op applied to a SrcTy vector and the Idx dag, producing DstTy
+        (Inst SrcTy:$src1, Idx)>;      // emit: Inst with the same vector and index operands
+
+multiclass Extract_512<ValueType DstTy, ValueType SrcTy, dag Idx, Instruction UnsignedOpc, Instruction SignedOpc> { // zext/sext pattern pair
+  def : Extr512Pat<DstTy, SrcTy, Idx, vextract_zext, UnsignedOpc>; // zero-extending extract -> UnsignedOpc
+  def : Extr512Pat<DstTy, SrcTy, Idx, vextract_sext, SignedOpc>;   // sign-extending extract -> SignedOpc
+}
diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt
index f9f06ea5fa25..54296a8b667b 100644
--- a/llvm/lib/Target/AIE/CMakeLists.txt
+++ b/llvm/lib/Target/AIE/CMakeLists.txt
@@ -33,7 +33,6 @@ tablegen(LLVM AIE2GenPostLegalizerGIGenericCombiner.inc -gen-global-isel-combine
 tablegen(LLVM AIE2GenPostLegalizerGICustomCombiner.inc -gen-global-isel-combiner
               -combiners="AIE2PostLegalizerCustomCombiner")
 tablegen(LLVM AIE2GenCallingConv.inc -gen-callingconv)
-tablegen(LLVM AIE2GenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM AIE2GenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM AIE2GenFormats.inc -gen-instr-format)
 tablegen(LLVM AIE2GenInstrInfo.inc -gen-instr-info -base-instrinfo-class AIEBaseInstrInfo)
@@ -129,7 +128,6 @@ add_llvm_target(AIECodeGen
    AIE2FrameLowering.cpp
    AIE2InstrInfo.cpp
    AIE2InstructionSelector.cpp
-   AIE2ISelDAGToDAG.cpp
    AIE2ISelLowering.cpp
    AIE2LegalizerInfo.cpp
    AIE2PostLegalizerCustomCombiner.cpp
diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrPatterns.td b/llvm/lib/Target/AIE/aie2p/AIE2PInstrPatterns.td
index a3aecfc37657..9a9b514d6b7c 100644
--- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrPatterns.td
+++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrPatterns.td
@@ -1096,3 +1096,14 @@ foreach Ty = [v64i8, v32i16, v16i32, v8i64] in {
 def : Pat<(Ty (vshift_node VEC512:$src1, VEC512:$src2, (i32 eR:$shift))),
           (VSHIFT VEC512:$src1, VEC512:$src2, eR:$shift)>;
 }
+
+// G_AIE_[SZ]EXT_EXTRACT_VECTOR_ELT, index in an eR register (*_r_* opcodes): zext -> vaddSign0, sext -> vaddSign1.
+defm : Extract_512<i32, v64i8, (i32 eR:$idx), VEXTRACT_8_vec_extract_r_vaddSign0, VEXTRACT_8_vec_extract_r_vaddSign1>;
+defm : Extract_512<i32, v32i16, (i32 eR:$idx), VEXTRACT_16_vec_extract_r_vaddSign0, VEXTRACT_16_vec_extract_r_vaddSign1>;
+defm : Extract_512<i32, v16i32, (i32 eR:$idx), VEXTRACT_32_vec_extract_r_vaddSign0, VEXTRACT_32_vec_extract_r_vaddSign1>;
+defm : Extract_512<i64, v8i64, (i32 eR:$idx), VEXTRACT_64_vec_extract_r_vaddSign0, VEXTRACT_64_vec_extract_r_vaddSign1>;
+// Same extracts with an index matching c6u, selecting the *_imm_* opcodes instead.
+defm : Extract_512<i32, v64i8, (i32 c6u:$idx), VEXTRACT_8_vec_extract_imm_vaddSign0, VEXTRACT_8_vec_extract_imm_vaddSign1>;
+defm : Extract_512<i32, v32i16, (i32 c6u:$idx), VEXTRACT_16_vec_extract_imm_vaddSign0, VEXTRACT_16_vec_extract_imm_vaddSign1>;
+defm : Extract_512<i32, v16i32, (i32 c6u:$idx), VEXTRACT_32_vec_extract_imm_vaddSign0, VEXTRACT_32_vec_extract_imm_vaddSign1>;
+defm : Extract_512<i64, v8i64, (i32 c6u:$idx), VEXTRACT_64_vec_extract_imm_vaddSign0, VEXTRACT_64_vec_extract_imm_vaddSign1>;
diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
index 530fddc4e075..ddeac4ac5a00 100644
--- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
+++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
@@ -58,8 +58,6 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector {
                             unsigned crUPSModeVal);
   bool selectG_AIE_ADD_VECTOR_ELT_HI(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectVCONVbfp16(MachineInstr &I, MachineRegisterInfo &MRI);
-  bool selectG_AIE_EXTRACT_VECTOR_ELT(MachineInstr &I,
-                                      MachineRegisterInfo &MRI);
   bool selectG_AIE_INSERT_VECTOR_ELT(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectG_AIE_PAD_VECTOR_UNDEF(MachineInstr &I, MachineOperand &DstReg,
                                     MachineOperand &SrcReg,
@@ -427,9 +425,6 @@ bool AIE2PInstructionSelector::select(MachineInstr &I) {
     return selectG_UNMERGE_VALUES(MIB, I, MRI);
   case AIE2P::G_AIE_ADD_VECTOR_ELT_HI:
     return selectG_AIE_ADD_VECTOR_ELT_HI(I, MRI);
-  case AIE2P::G_AIE_ZEXT_EXTRACT_VECTOR_ELT:
-  case AIE2P::G_AIE_SEXT_EXTRACT_VECTOR_ELT:
-    return selectG_AIE_EXTRACT_VECTOR_ELT(I, MRI);
   case AIE2P::G_AIE_INSERT_VECTOR_ELT:
     return selectG_AIE_INSERT_VECTOR_ELT(I, MRI);
   case AIE2P::G_AIE_BROADCAST_VECTOR:
@@ -818,32 +813,6 @@ struct SelSrcAndIdx {
 };
 
 } // end anonymous namespace
-static unsigned getExtractVecEltOpcode(unsigned EltSize, unsigned InstOpcode) {
-  unsigned Opcode = 0;
-  bool IsZextExtVecElt = InstOpcode == AIE2P::G_AIE_ZEXT_EXTRACT_VECTOR_ELT;
-  switch (EltSize) {
-  case 8:
-    Opcode = IsZextExtVecElt ? AIE2P::VEXTRACT_8_vec_extract_r_vaddSign0
-                             : AIE2P::VEXTRACT_8_vec_extract_r_vaddSign1;
-    break;
-  case 16:
-    Opcode = IsZextExtVecElt ? AIE2P::VEXTRACT_16_vec_extract_r_vaddSign0
-                             : AIE2P::VEXTRACT_16_vec_extract_r_vaddSign1;
-    break;
-  case 32:
-    Opcode = IsZextExtVecElt ? AIE2P::VEXTRACT_32_vec_extract_r_vaddSign0
-                             : AIE2P::VEXTRACT_32_vec_extract_r_vaddSign1;
-    break;
-  case 64:
-    Opcode = IsZextExtVecElt ? AIE2P::VEXTRACT_64_vec_extract_r_vaddSign0
-                             : AIE2P::VEXTRACT_64_vec_extract_r_vaddSign1;
-    break;
-  default:
-    llvm_unreachable("Unexpected Extracted Vector Element Size");
-  }
-  assert(Opcode != 0 && "Expected a NonZero Opcode");
-  return Opcode;
-}
 
 static unsigned getInsertVecEltOpcode(unsigned EltSize, unsigned InstOpcode) {
   switch (EltSize) {
@@ -1189,24 +1158,6 @@ static SelSrcAndIdx getExtractOrInsertVectorEltInputs(
   return SelSrcIdx;
 }
 
-bool AIE2PInstructionSelector::selectG_AIE_EXTRACT_VECTOR_ELT(
-    MachineInstr &I, MachineRegisterInfo &MRI) {
-  MachineOperand &RegOp0 = I.getOperand(1);
-  Register DstReg = I.getOperand(0).getReg();
-  Register SrcReg0 = RegOp0.getReg();
-  LLT SrcVecTy = MRI.getType(SrcReg0);
-  LLT SrcEltTy = SrcVecTy.getElementType();
-  unsigned EltSize = SrcEltTy.getSizeInBits();
-  SelSrcAndIdx SelSrcIdx =
-      getExtractOrInsertVectorEltInputs(I, TRI, MRI, TII, RBI, MIB);
-  unsigned Opcode = getExtractVecEltOpcode(EltSize, I.getOpcode());
-  MachineInstrBuilder MI = MIB.buildInstr(Opcode, {DstReg}, {})
-                               .addReg(SelSrcIdx.SrcReg)
-                               .addReg(SelSrcIdx.IdxReg);
-  I.eraseFromParent();
-  return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
-}
-
 bool AIE2PInstructionSelector::selectG_AIE_INSERT_VECTOR_ELT(
     MachineInstr &I, MachineRegisterInfo &MRI) {
   Register DstVecReg = I.getOperand(0).getReg();
diff --git a/llvm/test/CodeGen/AIE/GlobalISel/inst-select-extract-vector-elem.mir b/llvm/test/CodeGen/AIE/GlobalISel/inst-select-extract-vector-elem.mir
index 858920759ed1..9946733dc431 100644
--- a/llvm/test/CodeGen/AIE/GlobalISel/inst-select-extract-vector-elem.mir
+++ b/llvm/test/CodeGen/AIE/GlobalISel/inst-select-extract-vector-elem.mir
@@ -30,9 +30,8 @@ body:             |
     ; AIE2P: liveins: $x0
     ; AIE2P-NEXT: {{  $}}
     ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
-    ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1
-    ; AIE2P-NEXT: [[VEXTRACT_16_vec_extract_r_vaddSign0_:%[0-9]+]]:er = VEXTRACT_16_vec_extract_r_vaddSign0 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign0
-    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_16_vec_extract_r_vaddSign0_]]
+    ; AIE2P-NEXT: [[VEXTRACT_16_vec_extract_imm_vaddSign0_:%[0-9]+]]:er = VEXTRACT_16_vec_extract_imm_vaddSign0 [[COPY]], 1, implicit $vaddsign0
+    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_16_vec_extract_imm_vaddSign0_]]
     ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0
     %1:vregbank(<32 x s16>) = COPY $x0
     %2:gprregbank(s32) = G_CONSTANT i32 1
@@ -63,9 +62,8 @@ body:             |
     ; AIE2P: liveins: $x0
     ; AIE2P-NEXT: {{  $}}
     ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
-    ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1
-    ; AIE2P-NEXT: [[VEXTRACT_16_vec_extract_r_vaddSign1_:%[0-9]+]]:er = VEXTRACT_16_vec_extract_r_vaddSign1 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1
-    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_16_vec_extract_r_vaddSign1_]]
+    ; AIE2P-NEXT: [[VEXTRACT_16_vec_extract_imm_vaddSign1_:%[0-9]+]]:er = VEXTRACT_16_vec_extract_imm_vaddSign1 [[COPY]], 1, implicit $vaddsign1
+    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_16_vec_extract_imm_vaddSign1_]]
     ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0
     %1:vregbank(<32 x s16>) = COPY $x0
     %2:gprregbank(s32) = G_CONSTANT i32 1
@@ -96,9 +94,8 @@ body:             |
     ; AIE2P: liveins: $x0
     ; AIE2P-NEXT: {{  $}}
     ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
-    ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1
-    ; AIE2P-NEXT: [[VEXTRACT_8_vec_extract_r_vaddSign1_:%[0-9]+]]:er = VEXTRACT_8_vec_extract_r_vaddSign1 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1
-    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_8_vec_extract_r_vaddSign1_]]
+    ; AIE2P-NEXT: [[VEXTRACT_8_vec_extract_imm_vaddSign1_:%[0-9]+]]:er = VEXTRACT_8_vec_extract_imm_vaddSign1 [[COPY]], 1, implicit $vaddsign1
+    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_8_vec_extract_imm_vaddSign1_]]
     ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0
     %1:vregbank(<64 x s8>) = COPY $x0
     %2:gprregbank(s32) = G_CONSTANT i32 1
@@ -129,9 +126,8 @@ body:             |
     ; AIE2P: liveins: $x0
     ; AIE2P-NEXT: {{  $}}
     ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
-    ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1
-    ; AIE2P-NEXT: [[VEXTRACT_8_vec_extract_r_vaddSign0_:%[0-9]+]]:er = VEXTRACT_8_vec_extract_r_vaddSign0 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign0
-    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_8_vec_extract_r_vaddSign0_]]
+    ; AIE2P-NEXT: [[VEXTRACT_8_vec_extract_imm_vaddSign0_:%[0-9]+]]:er = VEXTRACT_8_vec_extract_imm_vaddSign0 [[COPY]], 1, implicit $vaddsign0
+    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_8_vec_extract_imm_vaddSign0_]]
     ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0
     %1:vregbank(<64 x s8>) = COPY $x0
     %2:gprregbank(s32) = G_CONSTANT i32 1
@@ -162,9 +158,8 @@ body:             |
     ; AIE2P: liveins: $x0
     ; AIE2P-NEXT: {{  $}}
     ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
-    ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1
-    ; AIE2P-NEXT: [[VEXTRACT_32_vec_extract_r_vaddSign1_:%[0-9]+]]:er = VEXTRACT_32_vec_extract_r_vaddSign1 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1
-    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_32_vec_extract_r_vaddSign1_]]
+    ; AIE2P-NEXT: [[VEXTRACT_32_vec_extract_imm_vaddSign1_:%[0-9]+]]:er = VEXTRACT_32_vec_extract_imm_vaddSign1 [[COPY]], 1, implicit $vaddsign1
+    ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_32_vec_extract_imm_vaddSign1_]]
     ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0
     %1:vregbank(<16 x s32>) = COPY $x0
     %2:gprregbank(s32) = G_CONSTANT i32 1
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-extract-vector-elem.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-extract-vector-elem.mir
index 694837d39ad8..b5326820a03f 100644
--- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-extract-vector-elem.mir
+++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-extract-vector-elem.mir
@@ -21,9 +21,8 @@ body:             |
     ; AIE2P: liveins: $x0
     ; AIE2P-NEXT: {{  $}}
     ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
-    ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1
-    ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_r_vaddSign1_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_r_vaddSign1 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1
-    ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_r_vaddSign1_]]
+    ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_imm_vaddSign1_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_imm_vaddSign1 [[COPY]], 1, implicit $vaddsign1
+    ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_imm_vaddSign1_]]
     %1:vregbank(<8 x s64>) = COPY $x0
     %2:gprregbank(s32) = G_CONSTANT i32 1
     %0:gprregbank(s64) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %1(<8 x s64>), %2(s32)
@@ -63,10 +62,9 @@ body:             |
     ; AIE2P: liveins: $bmll0
     ; AIE2P-NEXT: {{  $}}
     ; AIE2P-NEXT: [[COPY:%[0-9]+]]:acc512 = COPY $bmll0
-    ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1
     ; AIE2P-NEXT: [[COPY1:%[0-9]+]]:mxm = COPY [[COPY]]
-    ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_r_vaddSign1_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_r_vaddSign1 [[COPY1]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1
-    ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_r_vaddSign1_]]
+    ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_imm_vaddSign1_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_imm_vaddSign1 [[COPY1]], 1, implicit $vaddsign1
+    ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_imm_vaddSign1_]]
     %1:accregbank(<8 x s64>) = COPY $bmll0
     %2:gprregbank(s32) = G_CONSTANT i32 1
     %0:gprregbank(s64) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %1(<8 x s64>), %2(s32)
@@ -84,9 +82,8 @@ body:             |
     ; AIE2P: liveins: $x0
     ; AIE2P-NEXT: {{  $}}
     ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
-    ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1
-    ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_r_vaddSign0_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_r_vaddSign0 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign0
-    ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_r_vaddSign0_]]
+    ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_imm_vaddSign0_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_imm_vaddSign0 [[COPY]], 1, implicit $vaddsign0
+    ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_imm_vaddSign0_]]
     %1:vregbank(<8 x s64>) = COPY $x0
     %2:gprregbank(s32) = G_CONSTANT i32 1
     %0:gprregbank(s64) = G_AIE_ZEXT_EXTRACT_VECTOR_ELT %1(<8 x s64>), %2(s32)
diff --git a/llvm/test/CodeGen/AIE/aie2p/extractelement.ll b/llvm/test/CodeGen/AIE/aie2p/extractelement.ll
index e3085508f508..5a4f32e168f2 100644
--- a/llvm/test/CodeGen/AIE/aie2p/extractelement.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/extractelement.ll
@@ -13,10 +13,10 @@ define i64 @extract_v4i64(<4 x i64> inreg %v) nounwind {
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nop // Delay Slot 5
-; AIE2P-NEXT:    mova r0, #3 // Delay Slot 4
+; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    vmov x0, bmll0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    vextract.64 r1:r0, x0, #3, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <4 x i64> %v, i32 3
   ret i64 %1
@@ -42,10 +42,10 @@ define i64 @extract_v8i64(<8 x i64> inreg %v) nounwind {
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nop // Delay Slot 5
-; AIE2P-NEXT:    mova r0, #7 // Delay Slot 4
+; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    vmov x0, bmll0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    vextract.64 r1:r0, x0, #7, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <8 x i64> %v, i32 7
   ret i64 %1
@@ -71,10 +71,10 @@ define i64 @extract_v16i64(<16 x i64> inreg %v) nounwind {
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nop // Delay Slot 5
-; AIE2P-NEXT:    mova r0, #7 // Delay Slot 4
+; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    vmov x0, bmll0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    vextract.64 r1:r0, x0, #7, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <16 x i64> %v, i32 7
   ret i64 %1
@@ -106,10 +106,10 @@ define i32 @extract_v64i32(<64 x i32> inreg %v) nounwind {
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nop // Delay Slot 5
-; AIE2P-NEXT:    mova r0, #7 // Delay Slot 4
+; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    vmov x0, bmll0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    vextract.32 r0, x0, #7, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <64 x i32> %v, i32 7
   ret i32 %1
@@ -152,10 +152,10 @@ define i64 @extract_v32i64(<32 x i64> inreg %v) nounwind {
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nop // Delay Slot 5
-; AIE2P-NEXT:    mova r0, #7 // Delay Slot 4
+; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    vmov x0, bmll0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    vextract.64 r1:r0, x0, #7, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <32 x i64> %v, i32 7
   ret i64 %1
diff --git a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll
index 7e9a3fa85496..63f0c74d7687 100644
--- a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll
@@ -16,423 +16,406 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-LABEL: test_load_store_unaligned:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    mova m0, #-600; nopb ; nopx
-; CHECK-NEXT:    paddxm [sp], #640
-; CHECK-NEXT:    st p7, [sp, #-640] // 4-byte Folded Spill
+; CHECK-NEXT:    mova m0, #-560
+; CHECK-NEXT:    paddxm [sp], #576
+; CHECK-NEXT:    st p7, [sp, #-576] // 4-byte Folded Spill
 ; CHECK-NEXT:    mov p7, sp
-; CHECK-NEXT:    st r8, [sp, #-604] // 4-byte Folded Spill
-; CHECK-NEXT:    st r9, [sp, #-608] // 4-byte Folded Spill
-; CHECK-NEXT:    st r10, [sp, #-612] // 4-byte Folded Spill
-; CHECK-NEXT:    st r11, [sp, #-616] // 4-byte Folded Spill
-; CHECK-NEXT:    st r12, [sp, #-620] // 4-byte Folded Spill
-; CHECK-NEXT:    st r13, [sp, #-624] // 4-byte Folded Spill
-; CHECK-NEXT:    st r14, [sp, #-628] // 4-byte Folded Spill
-; CHECK-NEXT:    st r15, [sp, #-632] // 4-byte Folded Spill
-; CHECK-NEXT:    st p6, [sp, #-636] // 4-byte Folded Spill
 ; CHECK-NEXT:    padda [p7], m0
-; CHECK-NEXT:    st.s16 r16, [p7, #0]
+; CHECK-NEXT:    st.s16 r0, [p7, #0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova r0, #0
-; CHECK-NEXT:    vextract.32 r16, x0, r0, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.32 r0, x0, #0, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st.s16 r17, [p7, #2]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s16 r1, [p7, #2]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova r1, #1
-; CHECK-NEXT:    vextract.32 r17, x0, r1, vaddsign1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st.s16 r18, [p7, #4]
+; CHECK-NEXT:    vextract.32 r1, x0, #1, vaddsign1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova r2, #2
-; CHECK-NEXT:    vextract.32 r18, x0, r2, vaddsign1
+; CHECK-NEXT:    st.s16 r2, [p7, #4]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova r4, #4
-; CHECK-NEXT:    st.s16 r19, [p7, #6]
-; CHECK-NEXT:    mova r3, #3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.32 r2, x0, #2, vaddsign1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s16 r3, [p7, #6]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.32 r3, x0, #3, vaddsign1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s16 r4, [p7, #8]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.32 r4, x0, #4, vaddsign1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s16 r5, [p7, #10]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov q0, wl2
+; CHECK-NEXT:    vextract.32 r5, x0, #5, vaddsign1
+; CHECK-NEXT:    vextract.32 r6, x0, #6, vaddsign1
+; CHECK-NEXT:    vextract.32 r7, x0, #7, vaddsign1
+; CHECK-NEXT:    st.s16 r6, [p7, #12]
 ; CHECK-NEXT:    mov p0, sp
-; CHECK-NEXT:    mov p2, sp
-; CHECK-NEXT:    vextract.32 r19, x0, r3, vaddsign1
-; CHECK-NEXT:    mova r5, #5
-; CHECK-NEXT:    mov p1, sp
-; CHECK-NEXT:    st.s16 r20, [p7, #8]
-; CHECK-NEXT:    vextract.32 r20, x0, r4, vaddsign1
-; CHECK-NEXT:    vextract.32 r21, x0, r5, vaddsign1
-; CHECK-NEXT:    mova m0, #-584
-; CHECK-NEXT:    padda [p1], #-512
+; CHECK-NEXT:    vmov wl0, q0
+; CHECK-NEXT:    mova m0, #-544
 ; CHECK-NEXT:    padda [p0], m0
-; CHECK-NEXT:    mova m0, #-568
-; CHECK-NEXT:    st p1, [sp, #-24] // 4-byte Folded Spill
-; CHECK-NEXT:    mov p1, sp
-; CHECK-NEXT:    st p7, [sp, #-28] // 4-byte Folded Spill
-; CHECK-NEXT:    mov r24, p0
-; CHECK-NEXT:    st.s16 r21, [p7, #10]
+; CHECK-NEXT:    mova m0, #-528
+; CHECK-NEXT:    mov r27, p7
+; CHECK-NEXT:    st.s16 r7, [p7, #14]
+; CHECK-NEXT:    mov r16, p0
 ; CHECK-NEXT:    mov p0, sp
 ; CHECK-NEXT:    padda [p0], m0
-; CHECK-NEXT:    mova m0, #-544
-; CHECK-NEXT:    padda [p2], m0
-; CHECK-NEXT:    mova m0, #-480
-; CHECK-NEXT:    padda [p1], m0
-; CHECK-NEXT:    st p1, [sp, #-16] // 4-byte Folded Spill
-; CHECK-NEXT:    st.s16 r22, [p7, #12]
+; CHECK-NEXT:    vextract.32 r0, x0, #0, vaddsign1
+; CHECK-NEXT:    vextract.32 r1, x0, #1, vaddsign1
+; CHECK-NEXT:    mov p7, r16
+; CHECK-NEXT:    st r1, [p7, #4]
+; CHECK-NEXT:    st r0, [p7, #0]
+; CHECK-NEXT:    st.s8 r0, [p0, #0]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova r6, #6
-; CHECK-NEXT:    vextract.32 r22, x0, r6, vaddsign1
-; CHECK-NEXT:    vmov q0, wl2
-; CHECK-NEXT:    mov p1, sp
-; CHECK-NEXT:    padda [p1], #-448
-; CHECK-NEXT:    st p1, [sp, #-20] // 4-byte Folded Spill
-; CHECK-NEXT:    st.s16 r23, [p7, #14]
-; CHECK-NEXT:    mova r7, #7
-; CHECK-NEXT:    vextract.32 r23, x0, r7, vaddsign1
-; CHECK-NEXT:    vmov wl0, q0
-; CHECK-NEXT:    vextract.32 r16, x0, r0, vaddsign1
-; CHECK-NEXT:    vextract.32 r17, x0, r1, vaddsign1
-; CHECK-NEXT:    mov p7, r24
-; CHECK-NEXT:    st r17, [p7, #4]
-; CHECK-NEXT:    st r16, [p7, #0]
-; CHECK-NEXT:    st p7, [sp, #-32] // 4-byte Folded Spill
-; CHECK-NEXT:    st.s8 r24, [p0, #0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mov p1, sp
-; CHECK-NEXT:    vextract.16 r24, x4, r0, vaddsign1
-; CHECK-NEXT:    vextract.32 r18, x0, r2, vaddsign1
-; CHECK-NEXT:    padda [p1], #-320
-; CHECK-NEXT:    st r18, [p7, #8]
-; CHECK-NEXT:    st p1, [sp, #-36] // 4-byte Folded Spill
-; CHECK-NEXT:    st.s8 r25, [p0, #1]
+; CHECK-NEXT:    vextract.16 r0, x4, #0, vaddsign1
+; CHECK-NEXT:    vextract.32 r2, x0, #2, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st r2, [p7, #8]
+; CHECK-NEXT:    st.s8 r1, [p0, #1]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r25, x4, r1, vaddsign1
-; CHECK-NEXT:    vextract.32 r19, x0, r3, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st r19, [p7, #12]
-; CHECK-NEXT:    lda p7, [sp, #-24] // 4-byte Folded Reload
-; CHECK-NEXT:    st.s8 r26, [p0, #2]
+; CHECK-NEXT:    vextract.16 r1, x4, #1, vaddsign1
+; CHECK-NEXT:    vextract.32 r3, x0, #3, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r26, x4, r2, vaddsign1
+; CHECK-NEXT:    st r3, [p7, #12]
+; CHECK-NEXT:    st.s8 r2, [p0, #2]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r2, x4, #2, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mov p1, sp
-; CHECK-NEXT:    mova m0, #-304
-; CHECK-NEXT:    padda [p1], m0
-; CHECK-NEXT:    st p1, [sp, #-40] // 4-byte Folded Spill
-; CHECK-NEXT:    st.s8 r27, [p0, #3]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s8 r3, [p0, #3]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r27, x4, r3, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r3, x4, #3, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st.s8 r28, [p0, #4]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s8 r4, [p0, #4]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r28, x4, r4, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r4, x4, #4, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st.s8 r29, [p0, #5]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st r8, [sp, #-568] // 4-byte Folded Spill
+; CHECK-NEXT:    st p6, [sp, #-572] // 4-byte Folded Spill
+; CHECK-NEXT:    st.s8 r5, [p0, #5]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r29, x4, r5, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r5, x4, #5, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st.s8 r30, [p0, #6]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s8 r6, [p0, #6]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r30, x4, r6, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r6, x4, #6, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st.s8 r31, [p0, #7]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s8 r7, [p0, #7]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r31, x4, r7, vaddsign1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r7, x4, #7, vaddsign1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mova dj0, #8
-; CHECK-NEXT:    st.s8 r8, [p0, dj0]
+; CHECK-NEXT:    st.s8 r16, [p0, dj0]
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova r16, #8
-; CHECK-NEXT:    vextract.16 r8, x4, r16, vaddsign1
+; CHECK-NEXT:    vextract.16 r16, x4, #8, vaddsign1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mova dj0, #9
-; CHECK-NEXT:    st.s8 r9, [p0, dj0]
+; CHECK-NEXT:    st.s8 r17, [p0, dj0]
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova r17, #9
-; CHECK-NEXT:    vextract.16 r9, x4, r17, vaddsign1
+; CHECK-NEXT:    vextract.16 r17, x4, #9, vaddsign1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mova dj2, #10
-; CHECK-NEXT:    st.s8 r10, [p0, dj2]
+; CHECK-NEXT:    st.s8 r18, [p0, dj2]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova r18, #10
-; CHECK-NEXT:    vextract.16 r10, x4, r18, vaddsign1
-; CHECK-NEXT:    mova dj1, #12
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r18, x4, #10, vaddsign1
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mova dj0, #11
-; CHECK-NEXT:    st.s8 r11, [p0, dj0]
-; CHECK-NEXT:    mova dj3, #14
-; CHECK-NEXT:    mova dj4, #32
-; CHECK-NEXT:    mova r19, #11
-; CHECK-NEXT:    vextract.16 r11, x4, r19, vaddsign1
-; CHECK-NEXT:    mova dj5, #36
-; CHECK-NEXT:    mov p5, sp
-; CHECK-NEXT:    st.s8 r12, [p0, dj1]
-; CHECK-NEXT:    mov p3, sp
-; CHECK-NEXT:    mov p4, sp
-; CHECK-NEXT:    mova r20, #12
-; CHECK-NEXT:    vextract.16 r12, x4, r20, vaddsign1
-; CHECK-NEXT:    padda [p5], #-384
-; CHECK-NEXT:    mova dj0, #13
-; CHECK-NEXT:    st.s8 r13, [p0, dj0]
-; CHECK-NEXT:    mova m0, #-288
-; CHECK-NEXT:    mov p1, sp
-; CHECK-NEXT:    mova r21, #13
-; CHECK-NEXT:    vextract.16 r13, x4, r21, vaddsign1
-; CHECK-NEXT:    padda [p1], m0
-; CHECK-NEXT:    mova m0, #-272
-; CHECK-NEXT:    st.s8 r14, [p0, dj3]
-; CHECK-NEXT:    mova r22, #14
-; CHECK-NEXT:    padda [p3], m0
-; CHECK-NEXT:    mova m0, #-240
-; CHECK-NEXT:    vextract.16 r14, x4, r22, vaddsign1
-; CHECK-NEXT:    padda [p4], m0
-; CHECK-NEXT:    mova m0, #-208
-; CHECK-NEXT:    st p4, [sp, #-44] // 4-byte Folded Spill
-; CHECK-NEXT:    mov p4, sp
-; CHECK-NEXT:    padda [p4], m0
-; CHECK-NEXT:    st p4, [sp, #-48] // 4-byte Folded Spill
-; CHECK-NEXT:    mova dj0, #15
-; CHECK-NEXT:    st.s8 r15, [p0, dj0]
-; CHECK-NEXT:    mova r23, #15
-; CHECK-NEXT:    vextract.16 r15, x4, r23, vaddsign1
-; CHECK-NEXT:    vextract.16 r8, x6, r16, vaddsign1
-; CHECK-NEXT:    vextract.32 r16, x10, r16, vaddsign1
-; CHECK-NEXT:    vextract.16 r9, x6, r17, vaddsign1
-; CHECK-NEXT:    vextract.32 r17, x10, r17, vaddsign1
-; CHECK-NEXT:    st r16, [p5, dj4]
-; CHECK-NEXT:    st r17, [p5, dj5]
-; CHECK-NEXT:    st.s16 r24, [p2, #0]
+; CHECK-NEXT:    st.s8 r19, [p0, dj0]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova dj6, #40
-; CHECK-NEXT:    vextract.16 r24, x6, r0, vaddsign1
-; CHECK-NEXT:    vextract.16 r10, x6, r18, vaddsign1
-; CHECK-NEXT:    vextract.32 r18, x10, r18, vaddsign1
-; CHECK-NEXT:    mova dj5, #8
-; CHECK-NEXT:    lda.s8 r16, [p0, dj5]
-; CHECK-NEXT:    st r18, [p5, dj6]
-; CHECK-NEXT:    st.s16 r25, [p2, #2]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova dj7, #44
-; CHECK-NEXT:    vextract.16 r25, x6, r1, vaddsign1
-; CHECK-NEXT:    vextract.16 r11, x6, r19, vaddsign1
-; CHECK-NEXT:    vextract.32 r19, x10, r19, vaddsign1
-; CHECK-NEXT:    mova dj5, #9
-; CHECK-NEXT:    lda.s8 r17, [p0, dj5]
-; CHECK-NEXT:    st r19, [p5, dj7]
-; CHECK-NEXT:    st.s16 r26, [p2, #4]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r26, x6, r2, vaddsign1
-; CHECK-NEXT:    mova dj1, #48
-; CHECK-NEXT:    vextract.16 r12, x6, r20, vaddsign1
-; CHECK-NEXT:    vextract.32 r20, x10, r20, vaddsign1
-; CHECK-NEXT:    mova dj5, #10
-; CHECK-NEXT:    lda.s8 r18, [p0, dj5]
-; CHECK-NEXT:    st r20, [p5, dj1]
-; CHECK-NEXT:    st.s16 r27, [p2, #6]
+; CHECK-NEXT:    vextract.16 r19, x4, #11, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r27, x6, r3, vaddsign1
-; CHECK-NEXT:    mova dj3, #52
-; CHECK-NEXT:    vextract.16 r13, x6, r21, vaddsign1
-; CHECK-NEXT:    vextract.32 r21, x10, r21, vaddsign1
-; CHECK-NEXT:    mova dj5, #11
-; CHECK-NEXT:    lda.s8 r19, [p0, dj5]
-; CHECK-NEXT:    st r21, [p5, dj3]
-; CHECK-NEXT:    st.s16 r28, [p2, #8]
+; CHECK-NEXT:    mova dj1, #12
+; CHECK-NEXT:    st.s8 r20, [p0, dj1]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r28, x6, r4, vaddsign1
+; CHECK-NEXT:    vextract.16 r20, x4, #12, vaddsign1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova dj0, #13
+; CHECK-NEXT:    st.s8 r21, [p0, dj0]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova dj5, #12
-; CHECK-NEXT:    lda.s8 r20, [p0, dj5]
-; CHECK-NEXT:    st.s16 r29, [p2, #10]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova dj2, #60
-; CHECK-NEXT:    vextract.16 r29, x6, r5, vaddsign1
-; CHECK-NEXT:    vextract.16 r15, x6, r23, vaddsign1
-; CHECK-NEXT:    vextract.32 r23, x10, r23, vaddsign1
-; CHECK-NEXT:    mova dj5, #13
-; CHECK-NEXT:    lda.s8 r21, [p0, dj5]
-; CHECK-NEXT:    st r23, [p5, dj2]
-; CHECK-NEXT:    st.s16 r30, [p2, #12]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r21, x4, #13, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova dj3, #14
+; CHECK-NEXT:    st.s8 r22, [p0, dj3]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r30, x6, r6, vaddsign1
-; CHECK-NEXT:    vextract.32 r24, x8, r0, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st r24, [p7, #0]
-; CHECK-NEXT:    st.s16 r31, [p2, #14]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r22, x4, #14, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova dj0, #15
+; CHECK-NEXT:    st.s8 r23, [p0, dj0]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.16 r31, x6, r7, vaddsign1
-; CHECK-NEXT:    vextract.32 r25, x8, r1, vaddsign1
-; CHECK-NEXT:    mova dj0, #16
-; CHECK-NEXT:    st r25, [p7, #4]
-; CHECK-NEXT:    st.s16 r8, [p2, dj0]
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r23, x4, #15, vaddsign1
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov p2, sp
+; CHECK-NEXT:    padda [p2], #-512
+; CHECK-NEXT:    st.s16 r0, [p2, #0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova dj0, #18
-; CHECK-NEXT:    st.s16 r9, [p2, dj0]
+; CHECK-NEXT:    vextract.16 r0, x6, #0, vaddsign1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s16 r1, [p2, #2]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.32 r26, x8, r2, vaddsign1
-; CHECK-NEXT:    vextract.32 r27, x8, r3, vaddsign1
-; CHECK-NEXT:    mova dj0, #20
-; CHECK-NEXT:    st r26, [p7, #8]
-; CHECK-NEXT:    st r27, [p7, #12]
-; CHECK-NEXT:    st.s16 r10, [p2, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r1, x6, #1, vaddsign1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s16 r2, [p2, #4]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova dj0, #22
-; CHECK-NEXT:    st.s16 r11, [p2, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r2, x6, #2, vaddsign1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vextract.32 r28, x8, r4, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x8, r5, vaddsign1
-; CHECK-NEXT:    mova dj0, #24
-; CHECK-NEXT:    st r28, [p7, #16]
-; CHECK-NEXT:    st r29, [p7, #20]
-; CHECK-NEXT:    st.s16 r12, [p2, dj0]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s16 r3, [p2, #6]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vextract.16 r3, x6, #3, vaddsign1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova dj4, #32
+; CHECK-NEXT:    st.s16 r4, [p2, #8]
+; CHECK-NEXT:    mova dj5, #36
+; CHECK-NEXT:    mova dj6, #40
+; CHECK-NEXT:    mova dj7, #44
+; CHECK-NEXT:    vextract.16 r4, x6, #4, vaddsign1
 ; CHECK-NEXT:    vmov x2, bmll0
+; CHECK-NEXT:    mov p1, sp
+; CHECK-NEXT:    st.s16 r5, [p2, #10]
+; CHECK-NEXT:    mov p5, sp
 ; CHECK-NEXT:    vmov bmll0, x2
-; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova m0, #-480
+; CHECK-NEXT:    vextract.16 r5, x6, #5, vaddsign1
+; CHECK-NEXT:    padda [p1], m0
+; CHECK-NEXT:    mova m0, #-416
+; CHECK-NEXT:    st.s16 r6, [p2, #12]
+; CHECK-NEXT:    mov r30, p1
+; CHECK-NEXT:    mov p1, sp
+; CHECK-NEXT:    mov r29, p7
+; CHECK-NEXT:    vextract.16 r6, x6, #6, vaddsign1
+; CHECK-NEXT:    vextract.32 r0, x8, #0, vaddsign1
+; CHECK-NEXT:    mov p7, r30
+; CHECK-NEXT:    st r0, [p7, #0]
+; CHECK-NEXT:    st.s16 r7, [p2, #14]
+; CHECK-NEXT:    padda [p1], #-448
+; CHECK-NEXT:    mov r31, p1
+; CHECK-NEXT:    mov p1, sp
+; CHECK-NEXT:    vextract.16 r7, x6, #7, vaddsign1
+; CHECK-NEXT:    vextract.32 r1, x8, #1, vaddsign1
+; CHECK-NEXT:    mova dj0, #16
+; CHECK-NEXT:    st r1, [p7, #4]
+; CHECK-NEXT:    st.s16 r16, [p2, dj0]
+; CHECK-NEXT:    padda [p1], m0
+; CHECK-NEXT:    mova m0, #-352
+; CHECK-NEXT:    mov r8, p1
+; CHECK-NEXT:    vextract.16 r16, x6, #8, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll0
+; CHECK-NEXT:    mova dj0, #18
+; CHECK-NEXT:    st.s16 r17, [p2, dj0]
+; CHECK-NEXT:    mov p1, sp
+; CHECK-NEXT:    padda [p5], m0
+; CHECK-NEXT:    vextract.16 r17, x6, #9, vaddsign1
+; CHECK-NEXT:    vextract.32 r2, x8, #2, vaddsign1
+; CHECK-NEXT:    vextract.32 r3, x8, #3, vaddsign1
+; CHECK-NEXT:    mova dj0, #20
+; CHECK-NEXT:    st r2, [p7, #8]
+; CHECK-NEXT:    st r3, [p7, #12]
+; CHECK-NEXT:    st.s16 r18, [p2, dj0]
+; CHECK-NEXT:    mova m0, #-288
+; CHECK-NEXT:    padda [p1], m0
+; CHECK-NEXT:    mov r28, p1
+; CHECK-NEXT:    vextract.16 r18, x6, #10, vaddsign1
 ; CHECK-NEXT:    vmov bmll0, x2
+; CHECK-NEXT:    mova dj0, #22
+; CHECK-NEXT:    st.s16 r19, [p2, dj0]
+; CHECK-NEXT:    mov r30, p7
+; CHECK-NEXT:    mova dj2, #60
+; CHECK-NEXT:    vextract.16 r19, x6, #11, vaddsign1
+; CHECK-NEXT:    vextract.32 r4, x8, #4, vaddsign1
+; CHECK-NEXT:    vextract.32 r5, x8, #5, vaddsign1
+; CHECK-NEXT:    mova dj0, #24
+; CHECK-NEXT:    st r4, [p7, #16]
+; CHECK-NEXT:    st r5, [p7, #20]
+; CHECK-NEXT:    st.s16 r20, [p2, dj0]
+; CHECK-NEXT:    mova dj1, #48
+; CHECK-NEXT:    mova dj3, #52
+; CHECK-NEXT:    vextract.16 r20, x6, #12, vaddsign1
+; CHECK-NEXT:    vextract.16 r21, x6, #13, vaddsign1
+; CHECK-NEXT:    vextract.16 r22, x6, #14, vaddsign1
 ; CHECK-NEXT:    mova dj0, #26
-; CHECK-NEXT:    st.s16 r13, [p2, dj0]
-; CHECK-NEXT:    vextract.16 r14, x6, r22, vaddsign1
-; CHECK-NEXT:    vextract.32 r22, x10, r22, vaddsign1
-; CHECK-NEXT:    mova dj5, #14
-; CHECK-NEXT:    vextract.32 r30, x8, r6, vaddsign1
-; CHECK-NEXT:    vextract.32 r31, x8, r7, vaddsign1
-; CHECK-NEXT:    mova dj0, #28
-; CHECK-NEXT:    st r30, [p7, #24]
-; CHECK-NEXT:    st r31, [p7, #28]
-; CHECK-NEXT:    st.s16 r14, [p2, dj0]
-; CHECK-NEXT:    vextract.64 r25:r24, x0, r0, vaddsign1
+; CHECK-NEXT:    st.s16 r21, [p2, dj0]
+; CHECK-NEXT:    vextract.16 r23, x6, #15, vaddsign1
+; CHECK-NEXT:    vextract.64 r1:r0, x0, #0, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll0
+; CHECK-NEXT:    vextract.32 r6, x8, #6, vaddsign1
+; CHECK-NEXT:    vextract.32 r7, x8, #7, vaddsign1
+; CHECK-NEXT:    mova dj0, #28
+; CHECK-NEXT:    st r6, [p7, #24]
+; CHECK-NEXT:    st r7, [p7, #28]
+; CHECK-NEXT:    st.s16 r22, [p2, dj0]
 ; CHECK-NEXT:    vmov bmll0, x2
-; CHECK-NEXT:    vextract.64 r27:r26, x0, r1, vaddsign1
+; CHECK-NEXT:    vextract.64 r3:r2, x0, #1, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll0
-; CHECK-NEXT:    mova dj0, #30
-; CHECK-NEXT:    lda p7, [sp, #-16] // 4-byte Folded Reload
-; CHECK-NEXT:    st.s16 r15, [p2, dj0]
 ; CHECK-NEXT:    vmov bmll0, x2
-; CHECK-NEXT:    vextract.64 r29:r28, x0, r2, vaddsign1
+; CHECK-NEXT:    vextract.64 r5:r4, x0, #2, vaddsign1
+; CHECK-NEXT:    mova dj0, #30
+; CHECK-NEXT:    st.s16 r23, [p2, dj0]
 ; CHECK-NEXT:    vmov x0, bmll0
-; CHECK-NEXT:    vextract.64 r31:r30, x0, r3, vaddsign1
+; CHECK-NEXT:    mov p7, r31
+; CHECK-NEXT:    vextract.64 r7:r6, x0, #3, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll1
+; CHECK-NEXT:    mov r31, p7
 ; CHECK-NEXT:    mova dj0, #36
-; CHECK-NEXT:    st r24, [p7, #0]
-; CHECK-NEXT:    st r25, [p7, #4]
-; CHECK-NEXT:    st r26, [p7, #8]
-; CHECK-NEXT:    st r27, [p7, #12]
-; CHECK-NEXT:    st r28, [p7, #16]
-; CHECK-NEXT:    st r29, [p7, #20]
-; CHECK-NEXT:    st r30, [p7, #24]
-; CHECK-NEXT:    st r31, [p7, #28]
-; CHECK-NEXT:    vextract.64 r29:r28, x0, r0, vaddsign1
+; CHECK-NEXT:    st r0, [p7, #0]
+; CHECK-NEXT:    st r1, [p7, #4]
+; CHECK-NEXT:    st r2, [p7, #8]
+; CHECK-NEXT:    st r3, [p7, #12]
+; CHECK-NEXT:    st r4, [p7, #16]
+; CHECK-NEXT:    st r5, [p7, #20]
+; CHECK-NEXT:    vextract.64 r5:r4, x0, #0, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll1
-; CHECK-NEXT:    vextract.32 r0, x10, r0, vaddsign1
-; CHECK-NEXT:    lda p7, [sp, #-20] // 4-byte Folded Reload
-; CHECK-NEXT:    vextract.64 r31:r30, x0, r1, vaddsign1
+; CHECK-NEXT:    st r6, [p7, #24]
+; CHECK-NEXT:    st r7, [p7, #28]
+; CHECK-NEXT:    mov p7, r8
+; CHECK-NEXT:    vextract.64 r7:r6, x0, #1, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll1
-; CHECK-NEXT:    vextract.32 r1, x10, r1, vaddsign1
-; CHECK-NEXT:    vextract.64 r9:r8, x0, r2, vaddsign1
+; CHECK-NEXT:    mov r8, p7
+; CHECK-NEXT:    vextract.64 r17:r16, x0, #2, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll1
-; CHECK-NEXT:    vextract.32 r2, x10, r2, vaddsign1
-; CHECK-NEXT:    st r0, [p5, #0]
-; CHECK-NEXT:    vextract.64 r11:r10, x0, r3, vaddsign1
+; CHECK-NEXT:    st r4, [p7, #0]
+; CHECK-NEXT:    st r5, [p7, #4]
+; CHECK-NEXT:    vextract.32 r4, x10, #4, vaddsign1
+; CHECK-NEXT:    vextract.32 r5, x10, #5, vaddsign1
+; CHECK-NEXT:    vextract.64 r19:r18, x0, #3, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll1
-; CHECK-NEXT:    vextract.32 r3, x10, r3, vaddsign1
-; CHECK-NEXT:    st r1, [p5, #4]
-; CHECK-NEXT:    vextract.64 r13:r12, x0, r4, vaddsign1
-; CHECK-NEXT:    vmov x0, bmll1
-; CHECK-NEXT:    vextract.32 r4, x10, r4, vaddsign1
-; CHECK-NEXT:    st r2, [p5, #8]
-; CHECK-NEXT:    lda r2, [sp, #-28] // 4-byte Folded Reload
-; CHECK-NEXT:    vextract.64 r15:r14, x0, r5, vaddsign1
-; CHECK-NEXT:    vmov x0, bmll1
-; CHECK-NEXT:    vextract.32 r5, x10, r5, vaddsign1
-; CHECK-NEXT:    st r3, [p5, #12]
-; CHECK-NEXT:    vextract.64 r27:r26, x0, r6, vaddsign1
+; CHECK-NEXT:    st r6, [p7, #8]
+; CHECK-NEXT:    st r7, [p7, #12]
+; CHECK-NEXT:    vextract.32 r6, x10, #6, vaddsign1
+; CHECK-NEXT:    vextract.32 r7, x10, #7, vaddsign1
+; CHECK-NEXT:    vextract.64 r21:r20, x0, #4, vaddsign1
 ; CHECK-NEXT:    vmov x0, bmll1
-; CHECK-NEXT:    vextract.32 r6, x10, r6, vaddsign1
+; CHECK-NEXT:    st r16, [p7, #16]
+; CHECK-NEXT:    st r17, [p7, #20]
+; CHECK-NEXT:    vextract.32 r16, x10, #8, vaddsign1
+; CHECK-NEXT:    vextract.32 r17, x10, #9, vaddsign1
 ; CHECK-NEXT:    st r4, [p5, #16]
-; CHECK-NEXT:    st r13, [p7, dj0]
-; CHECK-NEXT:    mova dj0, #56
-; CHECK-NEXT:    vextract.64 r25:r24, x0, r7, vaddsign1
-; CHECK-NEXT:    st r28, [p7, #0]
-; CHECK-NEXT:    st r29, [p7, #4]
-; CHECK-NEXT:    st r30, [p7, #8]
-; CHECK-NEXT:    st r31, [p7, #12]
-; CHECK-NEXT:    st r8, [p7, #16]
-; CHECK-NEXT:    st r9, [p7, #20]
-; CHECK-NEXT:    st r10, [p7, #24]
-; CHECK-NEXT:    st r11, [p7, #28]
-; CHECK-NEXT:    st r12, [p7, dj4]
-; CHECK-NEXT:    st r14, [p7, dj6]
-; CHECK-NEXT:    st r15, [p7, dj7]
-; CHECK-NEXT:    vextract.32 r7, x10, r7, vaddsign1
 ; CHECK-NEXT:    st r5, [p5, #20]
-; CHECK-NEXT:    lda r15, [sp, #-632] // 4-byte Folded Reload
-; CHECK-NEXT:    lda r14, [sp, #-628] // 4-byte Folded Reload
-; CHECK-NEXT:    lda r13, [sp, #-624] // 4-byte Folded Reload
-; CHECK-NEXT:    lda r12, [sp, #-620] // 4-byte Folded Reload
-; CHECK-NEXT:    lda r11, [sp, #-616] // 4-byte Folded Reload
-; CHECK-NEXT:    lda r10, [sp, #-612] // 4-byte Folded Reload
-; CHECK-NEXT:    lda r9, [sp, #-608] // 4-byte Folded Reload
-; CHECK-NEXT:    lda r8, [sp, #-604] // 4-byte Folded Reload
-; CHECK-NEXT:    st r22, [p5, dj0]
-; CHECK-NEXT:    lda.s8 r22, [p0, dj5]
-; CHECK-NEXT:    mova dj5, #15
-; CHECK-NEXT:    st r26, [p7, dj1]
-; CHECK-NEXT:    st r27, [p7, dj3]
+; CHECK-NEXT:    vextract.64 r23:r22, x0, #5, vaddsign1
+; CHECK-NEXT:    vmov x0, bmll1
+; CHECK-NEXT:    st r18, [p7, #24]
+; CHECK-NEXT:    st r19, [p7, #28]
+; CHECK-NEXT:    vextract.32 r18, x10, #10, vaddsign1
+; CHECK-NEXT:    vextract.32 r19, x10, #11, vaddsign1
 ; CHECK-NEXT:    st r6, [p5, #24]
-; CHECK-NEXT:    st r24, [p7, dj0]
-; CHECK-NEXT:    mov r24, p7
-; CHECK-NEXT:    st r25, [p7, dj2]
 ; CHECK-NEXT:    st r7, [p5, #28]
-; CHECK-NEXT:    lda.s8 r23, [p0, dj5]
-; CHECK-NEXT:    mov p7, r2
+; CHECK-NEXT:    st r21, [p7, dj0]
+; CHECK-NEXT:    mova dj0, #56
+; CHECK-NEXT:    st r17, [p5, dj5]
+; CHECK-NEXT:    mova dj5, #8
+; CHECK-NEXT:    vextract.64 r3:r2, x0, #6, vaddsign1
+; CHECK-NEXT:    vmov x0, bmll1
+; CHECK-NEXT:    st r20, [p7, dj4]
+; CHECK-NEXT:    vextract.32 r20, x10, #12, vaddsign1
+; CHECK-NEXT:    vextract.32 r21, x10, #13, vaddsign1
+; CHECK-NEXT:    st r16, [p5, dj4]
+; CHECK-NEXT:    lda.s8 r16, [p0, dj5]
+; CHECK-NEXT:    mova dj5, #9
+; CHECK-NEXT:    vextract.64 r1:r0, x0, #7, vaddsign1
+; CHECK-NEXT:    st r22, [p7, dj6]
+; CHECK-NEXT:    st r23, [p7, dj7]
+; CHECK-NEXT:    vextract.32 r22, x10, #14, vaddsign1
+; CHECK-NEXT:    vextract.32 r23, x10, #15, vaddsign1
+; CHECK-NEXT:    st r18, [p5, dj6]
+; CHECK-NEXT:    st r19, [p5, dj7]
+; CHECK-NEXT:    lda.s8 r17, [p0, dj5]
+; CHECK-NEXT:    mova dj5, #10
+; CHECK-NEXT:    st r2, [p7, dj1]
+; CHECK-NEXT:    st r3, [p7, dj3]
+; CHECK-NEXT:    vextract.32 r2, x10, #2, vaddsign1
+; CHECK-NEXT:    vextract.32 r3, x10, #3, vaddsign1
+; CHECK-NEXT:    st r20, [p5, dj1]
+; CHECK-NEXT:    st r21, [p5, dj3]
+; CHECK-NEXT:    st r0, [p7, dj0]
+; CHECK-NEXT:    st r1, [p7, dj2]
+; CHECK-NEXT:    mov p7, r27
+; CHECK-NEXT:    lda.s8 r18, [p0, dj5]
+; CHECK-NEXT:    mova dj5, #11
+; CHECK-NEXT:    vextract.32 r0, x10, #0, vaddsign1
+; CHECK-NEXT:    vextract.32 r1, x10, #1, vaddsign1
+; CHECK-NEXT:    st r22, [p5, dj0]
+; CHECK-NEXT:    st r23, [p5, dj2]
+; CHECK-NEXT:    lda.s8 r19, [p0, dj5]
+; CHECK-NEXT:    mova dj5, #12
+; CHECK-NEXT:    st r2, [p5, #8]
+; CHECK-NEXT:    st r3, [p5, #12]
+; CHECK-NEXT:    lda.s8 r20, [p0, dj5]
+; CHECK-NEXT:    mova dj5, #13
+; CHECK-NEXT:    st r0, [p5, #0]
+; CHECK-NEXT:    st r1, [p5, #4]
 ; CHECK-NEXT:    lda.s16 r0, [p7], #2
 ; CHECK-NEXT:    lda.s16 r1, [p7, #0]
-; CHECK-NEXT:    mov p7, r2
-; CHECK-NEXT:    lda p7, [sp, #-36] // 4-byte Folded Reload
+; CHECK-NEXT:    mov p7, r27
+; CHECK-NEXT:    lda.s8 r21, [p0, dj5]
+; CHECK-NEXT:    mova dj5, #14
 ; CHECK-NEXT:    lda.s16 r2, [p7, #4]
 ; CHECK-NEXT:    lda.s16 r3, [p7, #6]
 ; CHECK-NEXT:    lda.s16 r4, [p7, #8]
 ; CHECK-NEXT:    lda.s16 r5, [p7, #10]
 ; CHECK-NEXT:    lda.s16 r6, [p7, #12]
 ; CHECK-NEXT:    lda.s16 r7, [p7, #14]
+; CHECK-NEXT:    mov p7, r28
+; CHECK-NEXT:    lda.s8 r22, [p0, dj5]
+; CHECK-NEXT:    mova dj5, #15
+; CHECK-NEXT:    lda.s8 r23, [p0, dj5]
 ; CHECK-NEXT:    st.s16 r0, [p7], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -454,7 +437,6 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    lda r2, [sp, #-32] // 4-byte Folded Reload
 ; CHECK-NEXT:    st.s16 r3, [p7], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -477,47 +459,48 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    lda.s8 r5, [p0, #5]
 ; CHECK-NEXT:    st.s16 r6, [p7], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova m0, #-272
 ; CHECK-NEXT:    st.s16 r7, [p7, #0]
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    mov p7, r2
+; CHECK-NEXT:    mov p1, sp
+; CHECK-NEXT:    padda [p1], m0
+; CHECK-NEXT:    mov r26, p1
+; CHECK-NEXT:    mov p1, sp
+; CHECK-NEXT:    padda [p1], #-256
+; CHECK-NEXT:    mov p7, r29
 ; CHECK-NEXT:    lda r0, [p7], #4
 ; CHECK-NEXT:    lda r1, [p7, #0]
-; CHECK-NEXT:    lda p7, [sp, #-40] // 4-byte Folded Reload
-; CHECK-NEXT:    mov p7, r2
+; CHECK-NEXT:    mov p7, r29
 ; CHECK-NEXT:    lda r2, [p7, #8]
-; CHECK-NEXT:    lda.s8 r5, [p0, #5]
 ; CHECK-NEXT:    lda r3, [p7, #12]
 ; CHECK-NEXT:    lda.s8 r6, [p0, #6]
 ; CHECK-NEXT:    lda.s8 r7, [p0, #7]
+; CHECK-NEXT:    mov p7, r26
 ; CHECK-NEXT:    st r0, [p7], #4
 ; CHECK-NEXT:    st r1, [p7], #4
 ; CHECK-NEXT:    st r2, [p7], #4
-; CHECK-NEXT:    lda.s8 r2, [p0, #2]
 ; CHECK-NEXT:    st r3, [p7, #0]
 ; CHECK-NEXT:    mov p7, p0
-; CHECK-NEXT:    lda.s8 r3, [p0, #3]
 ; CHECK-NEXT:    lda.s8 r0, [p7], #1
-; CHECK-NEXT:    lda.s8 r1, [p7, #0]
 ; CHECK-NEXT:    st.s8 r0, [p1], #1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    lda.s8 r2, [p0, #2]
+; CHECK-NEXT:    lda.s8 r3, [p0, #3]
 ; CHECK-NEXT:    mov p0, p2
+; CHECK-NEXT:    lda.s8 r1, [p7, #0]
 ; CHECK-NEXT:    lda.s16 r0, [p0], #2
-; CHECK-NEXT:    lda p7, [sp, #-640] // 4-byte Folded Reload
+; CHECK-NEXT:    lda p7, [sp, #-576] // 4-byte Folded Reload
 ; CHECK-NEXT:    st.s8 r1, [p1], #1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -596,16 +579,14 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova dj5, #20
-; CHECK-NEXT:    lda.s16 r18, [p2, dj5]
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    st.s8 r19, [p1], #1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mova dj5, #22
-; CHECK-NEXT:    lda.s16 r19, [p2, dj5]
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    st.s8 r20, [p1], #1
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -631,15 +612,26 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov p3, sp
+; CHECK-NEXT:    mova m0, #-240
+; CHECK-NEXT:    padda [p3], m0
+; CHECK-NEXT:    st.s16 r0, [p3], #2
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st.s16 r0, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st.s16 r1, [p3], #2
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mova dj5, #20
+; CHECK-NEXT:    lda.s16 r18, [p2, dj5]
+; CHECK-NEXT:    mova dj5, #22
+; CHECK-NEXT:    lda.s16 r19, [p2, dj5]
 ; CHECK-NEXT:    mova dj5, #24
 ; CHECK-NEXT:    lda.s16 r20, [p2, dj5]
 ; CHECK-NEXT:    mova dj5, #26
@@ -647,15 +639,11 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    mova dj5, #28
 ; CHECK-NEXT:    lda.s16 r22, [p2, dj5]
 ; CHECK-NEXT:    mova dj5, #30
+; CHECK-NEXT:    mov p1, r30
 ; CHECK-NEXT:    lda.s16 r23, [p2, dj5]
-; CHECK-NEXT:    lda p1, [sp, #-24] // 4-byte Folded Reload
-; CHECK-NEXT:    st.s16 r1, [p3], #2
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov p0, p1
+; CHECK-NEXT:    lda r0, [p0], #4
+; CHECK-NEXT:    lda r1, [p0, #0]
 ; CHECK-NEXT:    st.s16 r2, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -663,6 +651,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    lda r2, [p1, #8]
 ; CHECK-NEXT:    st.s16 r3, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -670,6 +659,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    lda r3, [p1, #12]
 ; CHECK-NEXT:    st.s16 r4, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -677,6 +667,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    lda r4, [p1, #16]
 ; CHECK-NEXT:    st.s16 r5, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -684,9 +675,6 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    lda r2, [p1, #8]
-; CHECK-NEXT:    lda r3, [p1, #12]
-; CHECK-NEXT:    lda r4, [p1, #16]
 ; CHECK-NEXT:    lda r5, [p1, #20]
 ; CHECK-NEXT:    st.s16 r6, [p3], #2
 ; CHECK-NEXT:    nop
@@ -694,27 +682,26 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    mov p0, p1
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:    lda r6, [p1, #24]
-; CHECK-NEXT:    lda r0, [p0], #4
 ; CHECK-NEXT:    st.s16 r7, [p3], #2
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov p4, sp
+; CHECK-NEXT:    mova m0, #-208
+; CHECK-NEXT:    padda [p4], m0
+; CHECK-NEXT:    mov r25, p4
+; CHECK-NEXT:    mov p0, r25
 ; CHECK-NEXT:    lda r7, [p1, #28]
-; CHECK-NEXT:    lda p1, [sp, #-16] // 4-byte Folded Reload
-; CHECK-NEXT:    lda r1, [p0, #0]
-; CHECK-NEXT:    lda p0, [sp, #-44] // 4-byte Folded Reload
+; CHECK-NEXT:    st r0, [p0], #4
 ; CHECK-NEXT:    st.s16 r16, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov p1, r31
+; CHECK-NEXT:    st r1, [p0], #4
+; CHECK-NEXT:    lda r1, [p1, #4]
 ; CHECK-NEXT:    st.s16 r17, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -722,6 +709,8 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st r2, [p0], #4
+; CHECK-NEXT:    lda r2, [p1, #8]
 ; CHECK-NEXT:    st.s16 r18, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -729,6 +718,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st r3, [p0], #4
 ; CHECK-NEXT:    st.s16 r19, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -736,6 +726,8 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st r4, [p0], #4
+; CHECK-NEXT:    lda r4, [p1, #16]
 ; CHECK-NEXT:    st.s16 r20, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -743,6 +735,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st r5, [p0], #4
 ; CHECK-NEXT:    st.s16 r21, [p3], #2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -750,43 +743,33 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    st r6, [p0], #4
+; CHECK-NEXT:    lda r6, [p1, #24]
 ; CHECK-NEXT:    st.s16 r22, [p3], #2
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    st r0, [p0], #4
-; CHECK-NEXT:    st.s16 r23, [p3, #0]
 ; CHECK-NEXT:    mov p6, sp
 ; CHECK-NEXT:    mova m0, #-176
 ; CHECK-NEXT:    mov p4, sp
 ; CHECK-NEXT:    padda [p4], m0
-; CHECK-NEXT:    mova m0, #-112
-; CHECK-NEXT:    padda [p6], m0
-; CHECK-NEXT:    st r1, [p0], #4
-; CHECK-NEXT:    lda r1, [p1, #4]
-; CHECK-NEXT:    st r2, [p0], #4
-; CHECK-NEXT:    lda r2, [p1, #8]
-; CHECK-NEXT:    st r3, [p0], #4
-; CHECK-NEXT:    st r4, [p0], #4
-; CHECK-NEXT:    lda r4, [p1, #16]
-; CHECK-NEXT:    st r5, [p0], #4
-; CHECK-NEXT:    st r6, [p0], #4
-; CHECK-NEXT:    lda r6, [p1, #24]
+; CHECK-NEXT:    mova m0, #-144
+; CHECK-NEXT:    mov r24, p4
 ; CHECK-NEXT:    st r7, [p0, #0]
+; CHECK-NEXT:    lda r8, [sp, #-568] // 4-byte Folded Reload
+; CHECK-NEXT:    st.s16 r23, [p3, #0]
+; CHECK-NEXT:    mov p4, sp
+; CHECK-NEXT:    padda [p4], m0
+; CHECK-NEXT:    mova m0, #-80
 ; CHECK-NEXT:    mov p0, p1
-; CHECK-NEXT:    mov p1, r24
+; CHECK-NEXT:    mov p1, r8
+; CHECK-NEXT:    padda [p6], m0
 ; CHECK-NEXT:    lda r16, [p1, dj4]
-; CHECK-NEXT:    lda r0, [p0], #12
-; CHECK-NEXT:    lda p0, [sp, #-48] // 4-byte Folded Reload
 ; CHECK-NEXT:    lda r18, [p1, dj6]
+; CHECK-NEXT:    lda r0, [p0], #12
 ; CHECK-NEXT:    lda r20, [p1, dj1]
 ; CHECK-NEXT:    lda r22, [p1, dj0]
 ; CHECK-NEXT:    lda r3, [p0], #8
 ; CHECK-NEXT:    lda r5, [p0], #8
 ; CHECK-NEXT:    lda r7, [p0, #0]
+; CHECK-NEXT:    mov p0, r24
 ; CHECK-NEXT:    st r0, [p0], #4
 ; CHECK-NEXT:    st r1, [p0], #4
 ; CHECK-NEXT:    lda r1, [p1, #4]
@@ -855,12 +838,12 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32>
 ; CHECK-NEXT:    st r18, [p6], #4
 ; CHECK-NEXT:    st r19, [p6], #4
 ; CHECK-NEXT:    st r20, [p6], #4
-; CHECK-NEXT:    lda p6, [sp, #-636] // 4-byte Folded Reload
+; CHECK-NEXT:    lda p6, [sp, #-572] // 4-byte Folded Reload
 ; CHECK-NEXT:    ret lr
 ; CHECK-NEXT:    st r21, [p6], #4 // Delay Slot 5
 ; CHECK-NEXT:    st r22, [p6], #4 // Delay Slot 4
 ; CHECK-NEXT:    st r23, [p6, #0] // Delay Slot 3
-; CHECK-NEXT:    paddxm [sp], #-640 // Delay Slot 2
+; CHECK-NEXT:    paddxm [sp], #-576 // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
   %a.addr = alloca <8 x i16>, align 8
diff --git a/llvm/test/CodeGen/AIE/aie2p/streams.ll b/llvm/test/CodeGen/AIE/aie2p/streams.ll
index b60b3e354b80..100f16ce3693 100644
--- a/llvm/test/CodeGen/AIE/aie2p/streams.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/streams.ll
@@ -138,41 +138,41 @@ define dso_local void @_Z19test_put_ms_v64bf16Dv64_u6__bf16ii(<64 x bfloat> noun
 ; CHECK-LABEL: _Z19test_put_ms_v64bf16Dv64_u6__bf16ii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    mova r2, #0; nopb ; nopx ; mov r28, r1
-; CHECK-NEXT:    mova r0, #1; vextract.32 r5, x4, r2, vaddsign1
-; CHECK-NEXT:    mova r3, #2; vextract.32 r6, x4, r0, vaddsign1
-; CHECK-NEXT:    mova r4, #3; mov ms, r5; vextract.32 r7, x4, r3, vaddsign1
-; CHECK-NEXT:    mova r5, #4; mov ms, r6; vextract.32 r6, x4, r4, vaddsign1
-; CHECK-NEXT:    mova r16, #5; mov ms, r7; vextract.32 r7, x4, r5, vaddsign1
-; CHECK-NEXT:    mova r17, #6; mov ms, r6; vextract.32 r6, x4, r16, vaddsign1
-; CHECK-NEXT:    mova r18, #7; mov ms, r7; vextract.32 r7, x4, r17, vaddsign1
-; CHECK-NEXT:    mova r19, #8; mov ms, r6; vextract.32 r6, x4, r18, vaddsign1
-; CHECK-NEXT:    mova r20, #9; mov ms, r7; vextract.32 r7, x4, r19, vaddsign1
-; CHECK-NEXT:    mova r21, #10; mov ms, r6; vextract.32 r6, x4, r20, vaddsign1
-; CHECK-NEXT:    mova r22, #11; mov ms, r7; vextract.32 r7, x4, r21, vaddsign1
-; CHECK-NEXT:    mova r23, #12; mov ms, r6; vextract.32 r6, x4, r22, vaddsign1
-; CHECK-NEXT:    mova r24, #13; mov ms, r7; vextract.32 r7, x4, r23, vaddsign1
-; CHECK-NEXT:    mova r25, #14; mov ms, r6; vextract.32 r6, x4, r24, vaddsign1
-; CHECK-NEXT:    mova r26, #15; mov ms, r7; vextract.32 r7, x4, r25, vaddsign1
-; CHECK-NEXT:    mov ms, r6; vextract.32 r6, x4, r26, vaddsign1
-; CHECK-NEXT:    mov ms, r7; vextract.32 r2, x5, r2, vaddsign1
-; CHECK-NEXT:    mov ms, r6; vextract.32 r0, x5, r0, vaddsign1
-; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, r3, vaddsign1
-; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, r4, vaddsign1
-; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, r5, vaddsign1
-; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, r16, vaddsign1
-; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, r17, vaddsign1
-; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, r18, vaddsign1
-; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, r19, vaddsign1
-; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, r20, vaddsign1
-; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, r21, vaddsign1
-; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, r22, vaddsign1
-; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, r23, vaddsign1
-; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, r24, vaddsign1
-; CHECK-NEXT:    mov ms, r2; ret lr; vextract.32 r2, x5, r25, vaddsign1
-; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, r26, vaddsign1 // Delay Slot 5
-; CHECK-NEXT:    mov ms, r2 // Delay Slot 4
-; CHECK-NEXT:    mov ms, r0, r28 // Delay Slot 3
+; CHECK-NEXT:    nopa ; nopx ; mov r28, r1
+; CHECK-NEXT:    vextract.32 r0, x4, #0, vaddsign1
+; CHECK-NEXT:    vextract.32 r2, x4, #1, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x4, #2, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x4, #3, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x4, #4, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x4, #5, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x4, #6, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x4, #7, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x4, #8, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x4, #9, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x4, #10, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x4, #11, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x4, #12, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x4, #13, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x4, #14, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x4, #15, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, #0, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, #1, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, #2, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, #3, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, #4, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, #5, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, #6, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, #7, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, #8, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, #9, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, #10, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, #11, vaddsign1
+; CHECK-NEXT:    mov ms, r0; vextract.32 r0, x5, #12, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, #13, vaddsign1
+; CHECK-NEXT:    mov ms, r0; ret lr; vextract.32 r0, x5, #14, vaddsign1
+; CHECK-NEXT:    mov ms, r2; vextract.32 r2, x5, #15, vaddsign1 // Delay Slot 5
+; CHECK-NEXT:    mov ms, r0 // Delay Slot 4
+; CHECK-NEXT:    mov ms, r2, r28 // Delay Slot 3
 ; CHECK-NEXT:    nop // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
diff --git a/llvm/test/CodeGen/AIE/aie2p/vmac.ll b/llvm/test/CodeGen/AIE/aie2p/vmac.ll
index c315920d4b12..e512da792d18 100644
--- a/llvm/test/CodeGen/AIE/aie2p/vmac.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/vmac.ll
@@ -105,115 +105,117 @@ define dso_local inreg noundef <64 x i32> @_Z27test_addmac_4x16_16x16_confDv64_h
 ; CHECK-LABEL: _Z27test_addmac_4x16_16x16_confDv64_hiDv128_DB8_iDv64_u7__acc32S2_iiiii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    mova r7, #50; mov crunpacksize, #0
-; CHECK-NEXT:    mova r23, #5; vshuffle x0, x0, x0, r7
-; CHECK-NEXT:    mova r28, #1; vextract.32 r29, x4, r23, vaddsign1
-; CHECK-NEXT:    mova r20, #2; vextract.32 r16, x0, r28, vaddsign1
-; CHECK-NEXT:    mova r21, #3; vextract.32 r17, x0, r20, vaddsign1
-; CHECK-NEXT:    mova r22, #4; vextract.32 r18, x0, r21, vaddsign1
-; CHECK-NEXT:    vextract.32 r19, x0, r22, vaddsign1
-; CHECK-NEXT:    mova r24, #0; vextract.32 r25, x0, r23, vaddsign1
-; CHECK-NEXT:    vextract.32 r7, x0, r24, vaddsign1
-; CHECK-NEXT:    vextract.32 r23, x5, r23, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x2, x0, r7
-; CHECK-NEXT:    vpush.hi.32 x2, x2, r16
+; CHECK-NEXT:    nopa ; nopb ; nops ; nopx ; mov crunpacksize, #0; nopv
+; CHECK-NEXT:    paddxm [sp], #64
+; CHECK-NEXT:    vextract.32 r25, x4, #3, vaddsign1
+; CHECK-NEXT:    vextract.32 r26, x4, #4, vaddsign1
+; CHECK-NEXT:    vextract.32 r27, x4, #5, vaddsign1
+; CHECK-NEXT:    vextract.32 r28, x4, #6, vaddsign1
+; CHECK-NEXT:    vextract.32 r29, x4, #7, vaddsign1
+; CHECK-NEXT:    vextract.32 r30, x4, #8, vaddsign1
+; CHECK-NEXT:    mova r7, #50; vextract.32 r31, x4, #9, vaddsign1
+; CHECK-NEXT:    vshuffle x2, x0, x0, r7
+; CHECK-NEXT:    vextract.32 r8, x4, #10, vaddsign1
+; CHECK-NEXT:    st r8, [sp, #-48]; vextract.32 r9, x4, #11, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    st r9, [sp, #-52]; vextract.32 r10, x4, #12, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    st r10, [sp, #-56]; vextract.32 r11, x4, #13, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    st r11, [sp, #-60]; vextract.32 r12, x4, #14, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    st r12, [sp, #-64]; vextract.32 r18, x2, #1, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    vextract.32 r19, x2, #2, vaddsign1
+; CHECK-NEXT:    vextract.32 r20, x2, #3, vaddsign1
+; CHECK-NEXT:    vextract.32 r21, x2, #4, vaddsign1
+; CHECK-NEXT:    vextract.32 r22, x2, #5, vaddsign1
+; CHECK-NEXT:    vextract.32 r16, x2, #6, vaddsign1
+; CHECK-NEXT:    vextract.32 r7, x2, #7, vaddsign1
+; CHECK-NEXT:    vextract.32 r23, x2, #9, vaddsign1
+; CHECK-NEXT:    vextract.32 r17, x2, #0, vaddsign1
+; CHECK-NEXT:    vextract.32 r24, x2, #10, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r17
+; CHECK-NEXT:    vextract.32 r17, x2, #15, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r18
+; CHECK-NEXT:    vextract.32 r18, x2, #14, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r19
+; CHECK-NEXT:    vextract.32 r19, x2, #13, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r20
+; CHECK-NEXT:    vextract.32 r20, x2, #12, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r21
+; CHECK-NEXT:    vextract.32 r21, x2, #11, vaddsign1
+; CHECK-NEXT:    vextract.32 r22, x2, #8, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r22
+; CHECK-NEXT:    vpush.hi.32 x2, x0, r22
+; CHECK-NEXT:    vextract.32 r22, x4, #0, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r23
+; CHECK-NEXT:    vextract.32 r23, x4, #1, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r24
+; CHECK-NEXT:    vpush.hi.32 x6, x0, r22
+; CHECK-NEXT:    vextract.32 r24, x4, #2, vaddsign1
+; CHECK-NEXT:    vextract.32 r22, x4, #15, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x6, r23
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r21
+; CHECK-NEXT:    vextract.32 r23, x5, #1, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r24
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r20
+; CHECK-NEXT:    vextract.32 r24, x5, #2, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r25
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r19
+; CHECK-NEXT:    vextract.32 r25, x5, #3, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r26
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r18
+; CHECK-NEXT:    vextract.32 r26, x5, #4, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r27
 ; CHECK-NEXT:    vpush.hi.32 x2, x2, r17
-; CHECK-NEXT:    mova r17, #11; vpush.hi.32 x2, x2, r18
-; CHECK-NEXT:    vextract.32 r27, x0, r17, vaddsign1
-; CHECK-NEXT:    mova r7, #8; vpush.hi.32 x2, x2, r19
-; CHECK-NEXT:    mova r18, #10; vextract.32 r19, x0, r7, vaddsign1
-; CHECK-NEXT:    vextract.32 r26, x0, r18, vaddsign1
-; CHECK-NEXT:    mova r16, #9; vpush.hi.32 x2, x2, r25
-; CHECK-NEXT:    vextract.32 r25, x0, r16, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x6, x0, r19
-; CHECK-NEXT:    mova r19, #12; vpush.hi.32 x6, x6, r25
-; CHECK-NEXT:    vextract.32 r25, x0, r19, vaddsign1
+; CHECK-NEXT:    vextract.32 r27, x5, #5, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r28
+; CHECK-NEXT:    vextract.32 r28, x5, #6, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r29
+; CHECK-NEXT:    vextract.32 r29, x5, #7, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r30
+; CHECK-NEXT:    vextract.32 r30, x5, #8, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r31
+; CHECK-NEXT:    vextract.32 r31, x5, #9, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r8
+; CHECK-NEXT:    vextract.32 r8, x5, #10, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r9
+; CHECK-NEXT:    vextract.32 r9, x5, #11, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r10
+; CHECK-NEXT:    vextract.32 r10, x5, #12, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r11
+; CHECK-NEXT:    vextract.32 r11, x5, #13, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r12
+; CHECK-NEXT:    vextract.32 r12, x5, #14, vaddsign1
+; CHECK-NEXT:    vextract.32 r22, x5, #0, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r22
+; CHECK-NEXT:    vpush.hi.32 x6, x0, r22
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r16
+; CHECK-NEXT:    vextract.32 r22, x5, #15, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x6, x6, r23
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r7
+; CHECK-NEXT:    nez r23, r1; vpush.hi.32 x6, x6, r24
+; CHECK-NEXT:    mov unpacksign0, r23
+; CHECK-NEXT:    vmov wl0, wh2
+; CHECK-NEXT:    vunpack y2, x4, unpacksign0; vpush.hi.32 x6, x6, r25
 ; CHECK-NEXT:    vpush.hi.32 x6, x6, r26
-; CHECK-NEXT:    vextract.32 r26, x4, r28, vaddsign1
-; CHECK-NEXT:    vextract.32 r28, x5, r28, vaddsign1
 ; CHECK-NEXT:    vpush.hi.32 x6, x6, r27
-; CHECK-NEXT:    vextract.32 r27, x4, r20, vaddsign1
-; CHECK-NEXT:    vextract.32 r20, x5, r20, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x6, x6, r25
-; CHECK-NEXT:    vextract.32 r25, x4, r24, vaddsign1
-; CHECK-NEXT:    vextract.32 r24, x5, r24, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x0, r25
-; CHECK-NEXT:    vextract.32 r25, x4, r21, vaddsign1
-; CHECK-NEXT:    vextract.32 r21, x5, r21, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r26
-; CHECK-NEXT:    vextract.32 r26, x4, r22, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r27
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r25
-; CHECK-NEXT:    mova r27, #6; vpush.hi.32 x8, x8, r26
-; CHECK-NEXT:    vextract.32 r26, x4, r27, vaddsign1
-; CHECK-NEXT:    mova r25, #13; vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vextract.32 r29, x0, r25, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r26
-; CHECK-NEXT:    mova r26, #7; vpush.hi.32 x6, x6, r29
-; CHECK-NEXT:    vextract.32 r29, x4, r26, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x4, r7, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vextract.32 r29, x4, r16, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x4, r18, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vextract.32 r29, x4, r17, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x4, r19, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    mova r30, #14; vextract.32 r29, x4, r25, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x4, r30, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    mova r29, #15; vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vextract.32 r31, x4, r29, vaddsign1
-; CHECK-NEXT:    vextract.32 r22, x5, r22, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x4, x8, r31
-; CHECK-NEXT:    vpush.hi.32 x8, x0, r24
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r28
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r20
-; CHECK-NEXT:    vextract.32 r20, x5, r27, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r21
-; CHECK-NEXT:    vextract.32 r21, x5, r26, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r22
-; CHECK-NEXT:    vextract.32 r22, x5, r7, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r23
-; CHECK-NEXT:    vextract.32 r23, x5, r16, vaddsign1
-; CHECK-NEXT:    vextract.32 r16, x0, r30, vaddsign1
-; CHECK-NEXT:    lshl r0, r0, r16; vpush.hi.32 x8, x8, r20
-; CHECK-NEXT:    vextract.32 r20, x5, r18, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r21
-; CHECK-NEXT:    vpush.hi.32 x6, x6, r16
-; CHECK-NEXT:    vextract.32 r21, x5, r17, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r22
-; CHECK-NEXT:    vextract.32 r22, x5, r19, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r23
-; CHECK-NEXT:    vextract.32 r23, x5, r25, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r20
-; CHECK-NEXT:    vextract.32 r20, x5, r30, vaddsign1
-; CHECK-NEXT:    lshl r4, r4, r17; vpush.hi.32 x8, x8, r21
-; CHECK-NEXT:    nez r21, r1; vpush.hi.32 x8, x8, r22
-; CHECK-NEXT:    lshl r3, r3, r18; mov unpacksign0, r21
-; CHECK-NEXT:    lshl r1, r1, r7; vextract.32 r22, x5, r29, vaddsign1
-; CHECK-NEXT:    or r0, r1, r0; vextract.32 r2, x0, r27, vaddsign1
-; CHECK-NEXT:    or r1, r3, r2; vextract.32 r3, x0, r29, vaddsign1
-; CHECK-NEXT:    lshl r5, r5, r19; vpush.hi.32 x8, x8, r23
-; CHECK-NEXT:    vunpack y2, x4, unpacksign0; or r1, r1, r4; vextract.32 r5, x0, r26, vaddsign1
-; CHECK-NEXT:    or r1, r1, r5; vpush.hi.32 x0, x6, r3
-; CHECK-NEXT:    vunpack y4, x8, unpacksign0; vpush.hi.32 x8, x8, r20
-; CHECK-NEXT:    vpush.hi.32 x2, x2, r2
-; CHECK-NEXT:    vmov wl0, wh0
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r22
-; CHECK-NEXT:    lshl r4, r6, r25; mov unpacksign0, #0
-; CHECK-NEXT:    mova r3, #200; or r1, r1, r0; mov unpacksign0, r21
-; CHECK-NEXT:    or r1, r1, r3; vpush.hi.32 x2, x2, r5
-; CHECK-NEXT:    or r0, r0, r4; vmov wl0, wh2; vmac dm0, dm1, x0, y4,r1
-; CHECK-NEXT:    or r0, r0, r3
+; CHECK-NEXT:    mova r24, #10; vpush.hi.32 x6, x6, r28
+; CHECK-NEXT:    mova r24, #12; lshl r3, r3, r24; vpush.hi.32 x6, x6, r29
+; CHECK-NEXT:    mova r25, #11; lshl r5, r5, r24; vpush.hi.32 x6, x6, r30
+; CHECK-NEXT:    mova r25, #9; lshl r4, r4, r25; mov unpacksign0, #0
+; CHECK-NEXT:    mova r24, #8; lshl r0, r0, r25; vpush.hi.32 x6, x6, r31
+; CHECK-NEXT:    lshl r1, r1, r24; vpush.hi.32 x6, x6, r8
+; CHECK-NEXT:    vunpack y3, x6, unpacksign0; or r0, r1, r0; vpush.hi.32 x6, x6, r9
+; CHECK-NEXT:    or r1, r3, r2; vpush.hi.32 x6, x6, r10
+; CHECK-NEXT:    mova r3, #13; or r1, r1, r4; vpush.hi.32 x6, x6, r11
+; CHECK-NEXT:    mova r2, #200; lshl r3, r6, r3; vpush.hi.32 x6, x6, r12
+; CHECK-NEXT:    lda r8, [sp, #-48]; or r1, r1, r5; vpush.hi.32 x6, x6, r22 // 4-byte Folded Reload
+; CHECK-NEXT:    lda r9, [sp, #-52]; or r1, r1, r0; mov unpacksign0, r23 // 4-byte Folded Reload
+; CHECK-NEXT:    lda r10, [sp, #-56]; or r1, r1, r2 // 4-byte Folded Reload
+; CHECK-NEXT:    lda r11, [sp, #-60]; or r0, r0, r3; vmov wl0, wh0; vmac dm0, dm1, x0, y3,r1 // 4-byte Folded Reload
+; CHECK-NEXT:    lda r12, [sp, #-64]; or r0, r0, r2 // 4-byte Folded Reload
 ; CHECK-NEXT:    ret lr; vaddmac dm0, dm0, dm2, x0, y2,r0
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    paddxm [sp], #-64 // Delay Slot 2
 ; CHECK-NEXT:    mov unpacksign0, #0 // Delay Slot 1
 entry:
   %0 = bitcast <64 x i8> %a to <16 x i32>
@@ -258,115 +260,117 @@ define dso_local inreg noundef <64 x i32> @_Z27test_addmsc_4x16_16x16_confDv64_h
 ; CHECK-LABEL: _Z27test_addmsc_4x16_16x16_confDv64_hiDv128_DB8_iDv64_u7__acc32S2_iiiii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    mova r7, #50; mov crunpacksize, #0
-; CHECK-NEXT:    mova r23, #5; vshuffle x0, x0, x0, r7
-; CHECK-NEXT:    mova r28, #1; vextract.32 r29, x4, r23, vaddsign1
-; CHECK-NEXT:    mova r20, #2; vextract.32 r16, x0, r28, vaddsign1
-; CHECK-NEXT:    mova r21, #3; vextract.32 r17, x0, r20, vaddsign1
-; CHECK-NEXT:    mova r22, #4; vextract.32 r18, x0, r21, vaddsign1
-; CHECK-NEXT:    vextract.32 r19, x0, r22, vaddsign1
-; CHECK-NEXT:    mova r24, #0; vextract.32 r25, x0, r23, vaddsign1
-; CHECK-NEXT:    vextract.32 r7, x0, r24, vaddsign1
-; CHECK-NEXT:    vextract.32 r23, x5, r23, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x2, x0, r7
-; CHECK-NEXT:    vpush.hi.32 x2, x2, r16
+; CHECK-NEXT:    nopa ; nopb ; nops ; nopx ; mov crunpacksize, #0; nopv
+; CHECK-NEXT:    paddxm [sp], #64
+; CHECK-NEXT:    vextract.32 r25, x4, #3, vaddsign1
+; CHECK-NEXT:    vextract.32 r26, x4, #4, vaddsign1
+; CHECK-NEXT:    vextract.32 r27, x4, #5, vaddsign1
+; CHECK-NEXT:    vextract.32 r28, x4, #6, vaddsign1
+; CHECK-NEXT:    vextract.32 r29, x4, #7, vaddsign1
+; CHECK-NEXT:    vextract.32 r30, x4, #8, vaddsign1
+; CHECK-NEXT:    mova r7, #50; vextract.32 r31, x4, #9, vaddsign1
+; CHECK-NEXT:    vshuffle x2, x0, x0, r7
+; CHECK-NEXT:    vextract.32 r8, x4, #10, vaddsign1
+; CHECK-NEXT:    st r8, [sp, #-48]; vextract.32 r9, x4, #11, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    st r9, [sp, #-52]; vextract.32 r10, x4, #12, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    st r10, [sp, #-56]; vextract.32 r11, x4, #13, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    st r11, [sp, #-60]; vextract.32 r12, x4, #14, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    st r12, [sp, #-64]; vextract.32 r18, x2, #1, vaddsign1 // 4-byte Folded Spill
+; CHECK-NEXT:    vextract.32 r19, x2, #2, vaddsign1
+; CHECK-NEXT:    vextract.32 r20, x2, #3, vaddsign1
+; CHECK-NEXT:    vextract.32 r21, x2, #4, vaddsign1
+; CHECK-NEXT:    vextract.32 r22, x2, #5, vaddsign1
+; CHECK-NEXT:    vextract.32 r16, x2, #6, vaddsign1
+; CHECK-NEXT:    vextract.32 r7, x2, #7, vaddsign1
+; CHECK-NEXT:    vextract.32 r23, x2, #9, vaddsign1
+; CHECK-NEXT:    vextract.32 r17, x2, #0, vaddsign1
+; CHECK-NEXT:    vextract.32 r24, x2, #10, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r17
+; CHECK-NEXT:    vextract.32 r17, x2, #15, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r18
+; CHECK-NEXT:    vextract.32 r18, x2, #14, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r19
+; CHECK-NEXT:    vextract.32 r19, x2, #13, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r20
+; CHECK-NEXT:    vextract.32 r20, x2, #12, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r21
+; CHECK-NEXT:    vextract.32 r21, x2, #11, vaddsign1
+; CHECK-NEXT:    vextract.32 r22, x2, #8, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r22
+; CHECK-NEXT:    vpush.hi.32 x2, x0, r22
+; CHECK-NEXT:    vextract.32 r22, x4, #0, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r23
+; CHECK-NEXT:    vextract.32 r23, x4, #1, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r24
+; CHECK-NEXT:    vpush.hi.32 x6, x0, r22
+; CHECK-NEXT:    vextract.32 r24, x4, #2, vaddsign1
+; CHECK-NEXT:    vextract.32 r22, x4, #15, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x6, r23
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r21
+; CHECK-NEXT:    vextract.32 r23, x5, #1, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r24
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r20
+; CHECK-NEXT:    vextract.32 r24, x5, #2, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r25
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r19
+; CHECK-NEXT:    vextract.32 r25, x5, #3, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r26
+; CHECK-NEXT:    vpush.hi.32 x2, x2, r18
+; CHECK-NEXT:    vextract.32 r26, x5, #4, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r27
 ; CHECK-NEXT:    vpush.hi.32 x2, x2, r17
-; CHECK-NEXT:    mova r17, #11; vpush.hi.32 x2, x2, r18
-; CHECK-NEXT:    vextract.32 r27, x0, r17, vaddsign1
-; CHECK-NEXT:    mova r7, #8; vpush.hi.32 x2, x2, r19
-; CHECK-NEXT:    mova r18, #10; vextract.32 r19, x0, r7, vaddsign1
-; CHECK-NEXT:    vextract.32 r26, x0, r18, vaddsign1
-; CHECK-NEXT:    mova r16, #9; vpush.hi.32 x2, x2, r25
-; CHECK-NEXT:    vextract.32 r25, x0, r16, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x6, x0, r19
-; CHECK-NEXT:    mova r19, #12; vpush.hi.32 x6, x6, r25
-; CHECK-NEXT:    vextract.32 r25, x0, r19, vaddsign1
+; CHECK-NEXT:    vextract.32 r27, x5, #5, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r28
+; CHECK-NEXT:    vextract.32 r28, x5, #6, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r29
+; CHECK-NEXT:    vextract.32 r29, x5, #7, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r30
+; CHECK-NEXT:    vextract.32 r30, x5, #8, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r31
+; CHECK-NEXT:    vextract.32 r31, x5, #9, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r8
+; CHECK-NEXT:    vextract.32 r8, x5, #10, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r9
+; CHECK-NEXT:    vextract.32 r9, x5, #11, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r10
+; CHECK-NEXT:    vextract.32 r10, x5, #12, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r11
+; CHECK-NEXT:    vextract.32 r11, x5, #13, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r12
+; CHECK-NEXT:    vextract.32 r12, x5, #14, vaddsign1
+; CHECK-NEXT:    vextract.32 r22, x5, #0, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x4, x4, r22
+; CHECK-NEXT:    vpush.hi.32 x6, x0, r22
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r16
+; CHECK-NEXT:    vextract.32 r22, x5, #15, vaddsign1
+; CHECK-NEXT:    vpush.hi.32 x6, x6, r23
+; CHECK-NEXT:    vpush.hi.32 x0, x0, r7
+; CHECK-NEXT:    nez r23, r1; vpush.hi.32 x6, x6, r24
+; CHECK-NEXT:    mov unpacksign0, r23
+; CHECK-NEXT:    vmov wl0, wh2
+; CHECK-NEXT:    vunpack y2, x4, unpacksign0; vpush.hi.32 x6, x6, r25
 ; CHECK-NEXT:    vpush.hi.32 x6, x6, r26
-; CHECK-NEXT:    vextract.32 r26, x4, r28, vaddsign1
-; CHECK-NEXT:    vextract.32 r28, x5, r28, vaddsign1
 ; CHECK-NEXT:    vpush.hi.32 x6, x6, r27
-; CHECK-NEXT:    vextract.32 r27, x4, r20, vaddsign1
-; CHECK-NEXT:    vextract.32 r20, x5, r20, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x6, x6, r25
-; CHECK-NEXT:    vextract.32 r25, x4, r24, vaddsign1
-; CHECK-NEXT:    vextract.32 r24, x5, r24, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x0, r25
-; CHECK-NEXT:    vextract.32 r25, x4, r21, vaddsign1
-; CHECK-NEXT:    vextract.32 r21, x5, r21, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r26
-; CHECK-NEXT:    vextract.32 r26, x4, r22, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r27
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r25
-; CHECK-NEXT:    mova r27, #6; vpush.hi.32 x8, x8, r26
-; CHECK-NEXT:    vextract.32 r26, x4, r27, vaddsign1
-; CHECK-NEXT:    mova r25, #13; vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vextract.32 r29, x0, r25, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r26
-; CHECK-NEXT:    mova r26, #7; vpush.hi.32 x6, x6, r29
-; CHECK-NEXT:    vextract.32 r29, x4, r26, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x4, r7, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vextract.32 r29, x4, r16, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x4, r18, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vextract.32 r29, x4, r17, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x4, r19, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    mova r30, #14; vextract.32 r29, x4, r25, vaddsign1
-; CHECK-NEXT:    vextract.32 r29, x4, r30, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    mova r29, #15; vpush.hi.32 x8, x8, r29
-; CHECK-NEXT:    vextract.32 r31, x4, r29, vaddsign1
-; CHECK-NEXT:    vextract.32 r22, x5, r22, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x4, x8, r31
-; CHECK-NEXT:    vpush.hi.32 x8, x0, r24
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r28
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r20
-; CHECK-NEXT:    vextract.32 r20, x5, r27, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r21
-; CHECK-NEXT:    vextract.32 r21, x5, r26, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r22
-; CHECK-NEXT:    vextract.32 r22, x5, r7, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r23
-; CHECK-NEXT:    vextract.32 r23, x5, r16, vaddsign1
-; CHECK-NEXT:    vextract.32 r16, x0, r30, vaddsign1
-; CHECK-NEXT:    lshl r0, r0, r16; vpush.hi.32 x8, x8, r20
-; CHECK-NEXT:    vextract.32 r20, x5, r18, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r21
-; CHECK-NEXT:    vpush.hi.32 x6, x6, r16
-; CHECK-NEXT:    vextract.32 r21, x5, r17, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r22
-; CHECK-NEXT:    vextract.32 r22, x5, r19, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r23
-; CHECK-NEXT:    vextract.32 r23, x5, r25, vaddsign1
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r20
-; CHECK-NEXT:    vextract.32 r20, x5, r30, vaddsign1
-; CHECK-NEXT:    lshl r4, r4, r17; vpush.hi.32 x8, x8, r21
-; CHECK-NEXT:    nez r21, r1; vpush.hi.32 x8, x8, r22
-; CHECK-NEXT:    lshl r3, r3, r18; mov unpacksign0, r21
-; CHECK-NEXT:    lshl r1, r1, r7; vextract.32 r22, x5, r29, vaddsign1
-; CHECK-NEXT:    or r0, r1, r0; vextract.32 r2, x0, r27, vaddsign1
-; CHECK-NEXT:    or r1, r3, r2; vextract.32 r3, x0, r29, vaddsign1
-; CHECK-NEXT:    lshl r5, r5, r19; vpush.hi.32 x8, x8, r23
-; CHECK-NEXT:    vunpack y2, x4, unpacksign0; or r1, r1, r4; vextract.32 r5, x0, r26, vaddsign1
-; CHECK-NEXT:    or r1, r1, r5; vpush.hi.32 x0, x6, r3
-; CHECK-NEXT:    vunpack y4, x8, unpacksign0; vpush.hi.32 x8, x8, r20
-; CHECK-NEXT:    vpush.hi.32 x2, x2, r2
-; CHECK-NEXT:    vmov wl0, wh0
-; CHECK-NEXT:    vpush.hi.32 x8, x8, r22
-; CHECK-NEXT:    lshl r4, r6, r25; mov unpacksign0, #0
-; CHECK-NEXT:    mova r3, #200; or r1, r1, r0; mov unpacksign0, r21
-; CHECK-NEXT:    or r1, r1, r3; vpush.hi.32 x2, x2, r5
-; CHECK-NEXT:    or r0, r0, r4; vmov wl0, wh2; vmsc dm0, dm1, x0, y4,r1
-; CHECK-NEXT:    or r0, r0, r3
+; CHECK-NEXT:    mova r24, #10; vpush.hi.32 x6, x6, r28
+; CHECK-NEXT:    mova r24, #12; lshl r3, r3, r24; vpush.hi.32 x6, x6, r29
+; CHECK-NEXT:    mova r25, #11; lshl r5, r5, r24; vpush.hi.32 x6, x6, r30
+; CHECK-NEXT:    mova r25, #9; lshl r4, r4, r25; mov unpacksign0, #0
+; CHECK-NEXT:    mova r24, #8; lshl r0, r0, r25; vpush.hi.32 x6, x6, r31
+; CHECK-NEXT:    lshl r1, r1, r24; vpush.hi.32 x6, x6, r8
+; CHECK-NEXT:    vunpack y3, x6, unpacksign0; or r0, r1, r0; vpush.hi.32 x6, x6, r9
+; CHECK-NEXT:    or r1, r3, r2; vpush.hi.32 x6, x6, r10
+; CHECK-NEXT:    mova r3, #13; or r1, r1, r4; vpush.hi.32 x6, x6, r11
+; CHECK-NEXT:    mova r2, #200; lshl r3, r6, r3; vpush.hi.32 x6, x6, r12
+; CHECK-NEXT:    lda r8, [sp, #-48]; or r1, r1, r5; vpush.hi.32 x6, x6, r22 // 4-byte Folded Reload
+; CHECK-NEXT:    lda r9, [sp, #-52]; or r1, r1, r0; mov unpacksign0, r23 // 4-byte Folded Reload
+; CHECK-NEXT:    lda r10, [sp, #-56]; or r1, r1, r2 // 4-byte Folded Reload
+; CHECK-NEXT:    lda r11, [sp, #-60]; or r0, r0, r3; vmov wl0, wh0; vmsc dm0, dm1, x0, y3,r1 // 4-byte Folded Reload
+; CHECK-NEXT:    lda r12, [sp, #-64]; or r0, r0, r2 // 4-byte Folded Reload
 ; CHECK-NEXT:    ret lr; vaddmsc dm0, dm0, dm2, x0, y2,r0
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    nop // Delay Slot 2
+; CHECK-NEXT:    paddxm [sp], #-64 // Delay Slot 2
 ; CHECK-NEXT:    mov unpacksign0, #0 // Delay Slot 1
 entry:
   %0 = bitcast <64 x i8> %a to <16 x i32>
diff --git a/llvm/test/CodeGen/AIE/extractelement.ll b/llvm/test/CodeGen/AIE/extractelement.ll
index d8396d2ba51e..6083cc4f8bf5 100644
--- a/llvm/test/CodeGen/AIE/extractelement.ll
+++ b/llvm/test/CodeGen/AIE/extractelement.ll
@@ -72,11 +72,11 @@ define signext i8 @extract_v16i8_signext(<16 x i8> %v) nounwind {
 ; AIE2P-LABEL: extract_v16i8_signext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.8 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.8 r0, x0, #0, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <16 x i8> %v, i32 0
   ret i8 %1
@@ -96,11 +96,11 @@ define zeroext i8 @extract_v16i8_zeroext(<16 x i8> %v) nounwind {
 ; AIE2P-LABEL: extract_v16i8_zeroext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.8 r0, x0, r0, vaddsign0 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.8 r0, x0, #0, vaddsign0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <16 x i8> %v, i32 0
   ret i8 %1
@@ -144,11 +144,11 @@ define signext i16 @extract_v8i16_signext(<8 x i16> %v) nounwind {
 ; AIE2P-LABEL: extract_v8i16_signext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.16 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.16 r0, x0, #0, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <8 x i16> %v, i32 0
   ret i16 %1
@@ -168,11 +168,11 @@ define zeroext i16 @extract_v8i16_zeroext(<8 x i16> %v) nounwind {
 ; AIE2P-LABEL: extract_v8i16_zeroext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.16 r0, x0, r0, vaddsign0 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.16 r0, x0, #0, vaddsign0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <8 x i16> %v, i32 0
   ret i16 %1
@@ -234,25 +234,21 @@ define i32 @extract_v4i32(<4 x i32> %v) nounwind {
 ; AIE2P-LABEL: extract_v4i32:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    mova r0, #0; nopx
-; AIE2P-NEXT:    mova r2, #1
-; AIE2P-NEXT:    mova r3, #2
-; AIE2P-NEXT:    mova r4, #3
-; AIE2P-NEXT:    vextract.32 r2, x0, r2, vaddsign1
-; AIE2P-NEXT:    vextract.32 r3, x0, r3, vaddsign1
-; AIE2P-NEXT:    vextract.32 r1, x0, r0, vaddsign1
-; AIE2P-NEXT:    vextract.32 r4, x0, r4, vaddsign1
+; AIE2P-NEXT:    nopx ; vextract.32 r1, x0, #1, vaddsign1
+; AIE2P-NEXT:    vextract.32 r2, x0, #2, vaddsign1
+; AIE2P-NEXT:    vextract.32 r0, x0, #0, vaddsign1
+; AIE2P-NEXT:    vextract.32 r3, x0, #3, vaddsign1
+; AIE2P-NEXT:    vpush.hi.32 x0, x0, r0
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r1
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r2
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r3
-; AIE2P-NEXT:    vpush.hi.32 x0, x0, r4
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r0
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r0
 ; AIE2P-NEXT:    ret lr
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r0 // Delay Slot 5
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r0 // Delay Slot 4
 ; AIE2P-NEXT:    vmov wl0, wh0 // Delay Slot 3
-; AIE2P-NEXT:    vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    vextract.32 r0, x0, #0, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <4 x i32> %v, i32 0
   ret i32 %1
@@ -289,14 +285,10 @@ define i32 @extract_v4i32_dyn(<4 x i32> %v, i32 %idx) nounwind {
 ; AIE2P-LABEL: extract_v4i32_dyn:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    mova r0, #0; nopx
-; AIE2P-NEXT:    mova r2, #1
-; AIE2P-NEXT:    mova r3, #2
-; AIE2P-NEXT:    mova r4, #3
-; AIE2P-NEXT:    vextract.32 r2, x0, r2, vaddsign1
-; AIE2P-NEXT:    vextract.32 r3, x0, r3, vaddsign1
-; AIE2P-NEXT:    vextract.32 r0, x0, r0, vaddsign1
-; AIE2P-NEXT:    vextract.32 r4, x0, r4, vaddsign1
+; AIE2P-NEXT:    nopx ; vextract.32 r2, x0, #1, vaddsign1
+; AIE2P-NEXT:    vextract.32 r3, x0, #2, vaddsign1
+; AIE2P-NEXT:    vextract.32 r0, x0, #0, vaddsign1
+; AIE2P-NEXT:    vextract.32 r4, x0, #3, vaddsign1
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r0
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r2
 ; AIE2P-NEXT:    vpush.hi.32 x0, x0, r3
@@ -328,11 +320,11 @@ define signext i8 @extract_v32i8_signext(<32 x i8> %v) nounwind {
 ; AIE2P-LABEL: extract_v32i8_signext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #1 // Delay Slot 3
-; AIE2P-NEXT:    vextract.8 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.8 r0, x0, #1, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <32 x i8> %v, i32 1
   ret i8 %1
@@ -352,11 +344,11 @@ define zeroext i8 @extract_v32i8_zeroext(<32 x i8> %v) nounwind {
 ; AIE2P-LABEL: extract_v32i8_zeroext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #1 // Delay Slot 3
-; AIE2P-NEXT:    vextract.8 r0, x0, r0, vaddsign0 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.8 r0, x0, #1, vaddsign0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <32 x i8> %v, i32 1
   ret i8 %1
@@ -400,11 +392,11 @@ define signext i16 @extract_v16i16_signext(<16 x i16> %v) nounwind {
 ; AIE2P-LABEL: extract_v16i16_signext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #11 // Delay Slot 3
-; AIE2P-NEXT:    vextract.16 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.16 r0, x0, #11, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <16 x i16> %v, i32 11
   ret i16 %1
@@ -424,11 +416,11 @@ define zeroext i16 @extract_v16i16_zeroext(<16 x i16> %v) nounwind {
 ; AIE2P-LABEL: extract_v16i16_zeroext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #11 // Delay Slot 3
-; AIE2P-NEXT:    vextract.16 r0, x0, r0, vaddsign0 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.16 r0, x0, #11, vaddsign0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <16 x i16> %v, i32 11
   ret i16 %1
@@ -472,11 +464,11 @@ define i32 @extract_v8i32(<8 x i32> %v) nounwind {
 ; AIE2P-LABEL: extract_v8i32:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #7 // Delay Slot 3
-; AIE2P-NEXT:    vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.32 r0, x0, #7, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <8 x i32> %v, i32 7
   ret i32 %1
@@ -521,11 +513,11 @@ define signext i8 @extract_v64i8_signext(<64 x i8> %v) nounwind {
 ; AIE2P-LABEL: extract_v64i8_signext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #1 // Delay Slot 3
-; AIE2P-NEXT:    vextract.8 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.8 r0, x0, #1, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <64 x i8> %v, i32 1
   ret i8 %1
@@ -545,11 +537,11 @@ define zeroext i8 @extract_v64i8_zeroext(<64 x i8> %v) nounwind {
 ; AIE2P-LABEL: extract_v64i8_zeroext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #1 // Delay Slot 3
-; AIE2P-NEXT:    vextract.8 r0, x0, r0, vaddsign0 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.8 r0, x0, #1, vaddsign0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <64 x i8> %v, i32 1
   ret i8 %1
@@ -593,11 +585,11 @@ define signext i16 @extract_v32i16_signext(<32 x i16> %v) nounwind {
 ; AIE2P-LABEL: extract_v32i16_signext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #11 // Delay Slot 3
-; AIE2P-NEXT:    vextract.16 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.16 r0, x0, #11, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <32 x i16> %v, i32 11
   ret i16 %1
@@ -617,11 +609,11 @@ define zeroext i16 @extract_v32i16_zeroext(<32 x i16> %v) nounwind {
 ; AIE2P-LABEL: extract_v32i16_zeroext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #11 // Delay Slot 3
-; AIE2P-NEXT:    vextract.16 r0, x0, r0, vaddsign0 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.16 r0, x0, #11, vaddsign0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <32 x i16> %v, i32 11
   ret i16 %1
@@ -665,11 +657,11 @@ define i32 @extract_v16i32(<16 x i32> %v) nounwind {
 ; AIE2P-LABEL: extract_v16i32:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #7 // Delay Slot 3
-; AIE2P-NEXT:    vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.32 r0, x0, #7, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <16 x i32> %v, i32 7
   ret i32 %1
@@ -714,11 +706,11 @@ define signext i8 @extract_v128i8_signext(<128 x i8> %v) nounwind {
 ; AIE2P-LABEL: extract_v128i8_signext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #1 // Delay Slot 3
-; AIE2P-NEXT:    vextract.8 r0, x4, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.8 r0, x4, #1, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <128 x i8> %v, i32 1
   ret i8 %1
@@ -738,11 +730,11 @@ define zeroext i8 @extract_v128i8_zeroext(<128 x i8> %v) nounwind {
 ; AIE2P-LABEL: extract_v128i8_zeroext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #1 // Delay Slot 3
-; AIE2P-NEXT:    vextract.8 r0, x4, r0, vaddsign0 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.8 r0, x4, #1, vaddsign0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <128 x i8> %v, i32 1
   ret i8 %1
@@ -796,11 +788,11 @@ define signext i16 @extract_v64i16_signext(<64 x i16> %v) nounwind {
 ; AIE2P-LABEL: extract_v64i16_signext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #11 // Delay Slot 3
-; AIE2P-NEXT:    vextract.16 r0, x4, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.16 r0, x4, #11, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <64 x i16> %v, i32 11
   ret i16 %1
@@ -820,11 +812,11 @@ define zeroext i16 @extract_v64i16_zeroext(<64 x i16> %v) nounwind {
 ; AIE2P-LABEL: extract_v64i16_zeroext:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #11 // Delay Slot 3
-; AIE2P-NEXT:    vextract.16 r0, x4, r0, vaddsign0 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.16 r0, x4, #11, vaddsign0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <64 x i16> %v, i32 11
   ret i16 %1
@@ -878,11 +870,11 @@ define i32 @extract_v32i32(<32 x i32> %v) nounwind {
 ; AIE2P-LABEL: extract_v32i32:
 ; AIE2P:         .p2align 4
 ; AIE2P-NEXT:  // %bb.0:
-; AIE2P-NEXT:    nopa ; nopb ; nops ; ret lr; nopm ; nopv
-; AIE2P-NEXT:    nopx // Delay Slot 5
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    mova r0, #7 // Delay Slot 3
-; AIE2P-NEXT:    vextract.32 r0, x4, r0, vaddsign1 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    vextract.32 r0, x4, #7, vaddsign1 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
   %1 = extractelement <32 x i32> %v, i32 7
   ret i32 %1