From 1b8f97b40cdceebf2749cab580065ac95d03aa5e Mon Sep 17 00:00:00 2001
From: Abnikant Singh <abnikant.singh@amd.com>
Date: Wed, 5 Feb 2025 11:42:00 +0530
Subject: [PATCH] 1) [AIE2P] Enable post-pre incr and offset load combine 2)
 [AIE2P] Support postinc 2D/3D, and offset load/store

---
 llvm/lib/Target/AIE/AIE2InstrInfo.cpp         |   11 +
 llvm/lib/Target/AIE/AIE2InstrInfo.h           |    3 +
 .../Target/AIE/AIE2InstructionSelector.cpp    |   32 -
 llvm/lib/Target/AIE/AIEBaseInstrInfo.h        |   13 +
 .../Target/AIE/AIEBaseInstructionSelector.cpp |   30 +
 .../Target/AIE/AIEBaseInstructionSelector.h   |    2 +
 llvm/lib/Target/AIE/AIECombine.td             |    4 +-
 llvm/lib/Target/AIE/AIECombinerHelper.cpp     |   16 +-
 llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp  |   11 +
 llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h    |    3 +
 .../AIE/aie2p/AIE2PInstructionSelector.cpp    |  294 ++-
 .../inst-select-vector-indexed-load-store.mir |  959 +++++++++-
 .../inst-select-vector-pre-post-increment.mir | 1277 ++++++++++++-
 .../AIE/aie2p/combine-loads-stores.mir        | 1661 +++++++++++++++++
 llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll     |  251 +--
 .../CodeGen/AIE/aie2p/ldst-fifo-stores.ll     |  158 +-
 16 files changed, 4433 insertions(+), 292 deletions(-)
 create mode 100644 llvm/test/CodeGen/AIE/aie2p/combine-loads-stores.mir

diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp
index c6db4e0e0dc4..e520bd7216e2 100644
--- a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp
+++ b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp
@@ -229,6 +229,13 @@ unsigned AIE2InstrInfo::getOffsetMemOpcode(unsigned BaseMemOpcode) const {
   llvm_unreachable("not a generic load/store");
 }
 
+bool AIE2InstrInfo::isGenericOffsetMemOpcode(unsigned Opcode) const {
+  const bool IsOffsetLoad = Opcode == AIE2::G_AIE_OFFSET_LOAD ||
+                            Opcode == AIE2::G_AIE_OFFSET_SEXTLOAD ||
+                            Opcode == AIE2::G_AIE_OFFSET_ZEXTLOAD;
+  return IsOffsetLoad || Opcode == AIE2::G_AIE_OFFSET_STORE;
+}
+
 std::optional<unsigned> AIE2InstrInfo::getCombinedPostIncOpcode(
     MachineInstr &BaseMemI, MachineInstr &PostIncI, TypeSize Size) const {
   switch (PostIncI.getOpcode()) {
@@ -1624,3 +1631,7 @@ unsigned AIE2InstrInfo::getScalarRegSize() const { return 32; }
 unsigned AIE2InstrInfo::getBasicVecRegSize() const { return 256; }
 
 unsigned AIE2InstrInfo::getBasicVectorBitSize() const { return 512; }
+
+unsigned AIE2InstrInfo::getMaxVectorBitSize() const { return 1024; }
+
+unsigned AIE2InstrInfo::getMaxSupportedLdStIncSize() const { return 512; }
diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.h b/llvm/lib/Target/AIE/AIE2InstrInfo.h
index 377f82f31305..feebe2140b68 100644
--- a/llvm/lib/Target/AIE/AIE2InstrInfo.h
+++ b/llvm/lib/Target/AIE/AIE2InstrInfo.h
@@ -71,6 +71,8 @@ class AIE2InstrInfo : public AIE2GenInstrInfo {
   unsigned getBasicVecRegSize() const override;
 
   unsigned getBasicVectorBitSize() const override;
+  unsigned getMaxVectorBitSize() const override;
+  unsigned getMaxSupportedLdStIncSize() const override;
 
   virtual unsigned
   getNumReservedDelaySlots(const MachineInstr &MI) const override;
@@ -83,6 +85,7 @@ class AIE2InstrInfo : public AIE2GenInstrInfo {
   bool isBooleanNoOp(unsigned Opc) const override;
   bool isBooleanNot(unsigned Opc) const override;
   bool isConstStep(const MachineInstr &MI, int64_t &Step) const override;
+  bool isGenericOffsetMemOpcode(unsigned Opcode) const override;
 
   bool verifyGenericInstruction(const MachineInstr &MI,
                                 StringRef &ErrInfo) const override;
diff --git a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp
index 2869df693bc0..0c4656182298 100644
--- a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp
+++ b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp
@@ -52,7 +52,6 @@ class AIE2InstructionSelector : public AIEBaseInstructionSelector {
                                MachineRegisterInfo &MRI) override;
   Register createSparseRegSequence(Register Vec, Register Mask,
                                    MachineRegisterInfo &MRI);
-  void insertPtrAddForOffset(MachineRegisterInfo &MRI, MachineInstr &MemI);
 
   bool select(MachineInstr &I) override;
   bool selectCascadeStreamInsn(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -996,37 +995,6 @@ bool AIE2InstructionSelector::selectG_AIE_LOAD_UNPACK(
   return constrainSelectedInstRegOperands(*NewInstr.getInstr(), TII, TRI, RBI);
 }
 
-void AIE2InstructionSelector::insertPtrAddForOffset(MachineRegisterInfo &MRI,
-                                                    MachineInstr &MemI) {
-  // The offset is not an immediate or the immediate does not fit the immediate
-  // range. Instruction select PTR_ADD for the splitting of instruction. E.g.:
-  // $x0 = G_AIE_OFFSET_LOAD %ptr, %offset has to be selected to
-  // %new_ptr = PTR_ADD %ptr, %offset
-  // $wh0 = VLDA_dmw_lda_w_ag_idx_imm %new_ptr, #32
-  // $wl0 = VLDA_dmw_lda_w_ag_idx_imm %new_ptr, #0
-
-  // This function only gets called for G_AIE_OFFSET_LOAD AND G_AIE_OFFSET_STORE
-  // Both instruction have the pointer and the offset in the same operands
-  assert((MemI.getOpcode() == AIE2::G_AIE_OFFSET_LOAD ||
-          MemI.getOpcode() == AIE2::G_AIE_OFFSET_STORE) &&
-         "Unexpected instruction in instrPtrAddForOffset");
-  const unsigned PointerRegIndex = 1;
-  const unsigned OffsetRegIndex = 2;
-
-  Register NewPtrReg =
-      MRI.cloneVirtualRegister(MemI.getOperand(PointerRegIndex).getReg());
-  MachineInstrBuilder NewPtr =
-      MIB.buildInstr(TargetOpcode::G_PTR_ADD)
-          .addDef(NewPtrReg)
-          .addReg(MemI.getOperand(PointerRegIndex).getReg())
-          .addReg(MemI.getOperand(OffsetRegIndex).getReg());
-
-  if (!selectImpl(*NewPtr.getInstr(), *CoverageInfo))
-    llvm_unreachable("Unexpected failure selecting G_PTR_ADD");
-
-  MemI.getOperand(PointerRegIndex).setReg(NewPtrReg);
-}
-
 std::optional<LoadStoreOpcodes>
 AIE2InstructionSelector::getCombinedOpcodeSRSUPS(const MachineInstr &MemOp,
                                                  const MachineInstr &CombOp,
diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h
index dfd91fe55ef6..4345f8733d4b 100644
--- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h
+++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h
@@ -232,6 +232,8 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
     return false;
   }
 
+  virtual bool isGenericOffsetMemOpcode(unsigned Opcode) const { return false; }
+
   // Used for Load/Store combiners
   virtual unsigned getOffsetMemOpcode(unsigned BaseMemOpcode) const {
     llvm_unreachable("Target didn't implement getOffsetMemOpcode");
@@ -567,6 +569,17 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
     llvm_unreachable("Target didn't implement getVecRegSize!");
   }
 
+  /// Return the maximum supported vector size for this target, in bits.
+  virtual unsigned getMaxVectorBitSize() const {
+    llvm_unreachable("Target didn't implement getMaxVectorBitSize!");
+  }
+
+  /// Return the largest vector size, in bits, for which the target supports
+  /// a combined load/store with pointer increment.
+  virtual unsigned getMaxSupportedLdStIncSize() const {
+    llvm_unreachable("Target didn't implement getMaxSupportedLdStIncSize!");
+  }
+
   /// Abstract operations to help the decoding of complex operations.
   struct AbstractOp {
     enum class OperationType : unsigned {
diff --git a/llvm/lib/Target/AIE/AIEBaseInstructionSelector.cpp b/llvm/lib/Target/AIE/AIEBaseInstructionSelector.cpp
index 157bdf7dea06..70c4db4c0312 100644
--- a/llvm/lib/Target/AIE/AIEBaseInstructionSelector.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseInstructionSelector.cpp
@@ -761,3 +761,33 @@ void AIEBaseInstructionSelector::makeDeadMI(MachineInstr &MI,
     Def->setReg(NewReg);
   }
 }
+
+void AIEBaseInstructionSelector::insertPtrAddForOffset(MachineRegisterInfo &MRI,
+                                                       MachineInstr &MemI) {
+  // The offset is not an immediate or the immediate does not fit the immediate
+  // range. Instruction select PTR_ADD for the splitting of instruction. E.g.:
+  // $x0 = G_AIE_OFFSET_LOAD %ptr, %offset has to be selected to
+  // %new_ptr = PTR_ADD %ptr, %offset
+  // $wh0 = VLDA_dmw_lda_w_ag_idx_imm %new_ptr, #32
+  // $wl0 = VLDA_dmw_lda_w_ag_idx_imm %new_ptr, #0
+
+  // Only valid for the generic offset loads/stores (incl. sext/zext loads);
+  // the pointer is read from operand 1 and the offset from operand 2.
+  assert(TII.isGenericOffsetMemOpcode(MemI.getOpcode()) &&
+         "Unexpected instruction in insertPtrAddForOffset");
+  const unsigned PointerRegIndex = 1;
+  const unsigned OffsetRegIndex = 2;
+
+  Register NewPtrReg =
+      MRI.cloneVirtualRegister(MemI.getOperand(PointerRegIndex).getReg());
+  MachineInstrBuilder NewPtr =
+      MIB.buildInstr(TargetOpcode::G_PTR_ADD)
+          .addDef(NewPtrReg)
+          .addReg(MemI.getOperand(PointerRegIndex).getReg())
+          .addReg(MemI.getOperand(OffsetRegIndex).getReg());
+
+  if (!selectImpl(*NewPtr.getInstr(), *CoverageInfo))
+    llvm_unreachable("Unexpected failure selecting G_PTR_ADD");
+
+  MemI.getOperand(PointerRegIndex).setReg(NewPtrReg);
+}
diff --git a/llvm/lib/Target/AIE/AIEBaseInstructionSelector.h b/llvm/lib/Target/AIE/AIEBaseInstructionSelector.h
index 40376238b10d..ef472d0d5ab0 100644
--- a/llvm/lib/Target/AIE/AIEBaseInstructionSelector.h
+++ b/llvm/lib/Target/AIE/AIEBaseInstructionSelector.h
@@ -159,6 +159,8 @@ class AIEBaseInstructionSelector : public InstructionSelector {
   bool selectVSUB_MIN_MAX(MachineInstr &I, MachineRegisterInfo &MRI,
                           MachineIRBuilder &MIB);
 
+  void insertPtrAddForOffset(MachineRegisterInfo &MRI, MachineInstr &MemI);
+
 protected:
   void makeDeadMI(MachineInstr &MI, MachineRegisterInfo &MRI);
   virtual std::optional<AddressingModeInfo>
diff --git a/llvm/lib/Target/AIE/AIECombine.td b/llvm/lib/Target/AIE/AIECombine.td
index 3ef954b5887e..18661b382be1 100644
--- a/llvm/lib/Target/AIE/AIECombine.td
+++ b/llvm/lib/Target/AIE/AIECombine.td
@@ -197,5 +197,7 @@ def AIE2PostLegalizerCustomCombiner
 
 def AIE2PPostLegalizerCustomCombiner
     : GICombiner<"AIE2PPostLegalizerCustomCombinerImpl", [ combine_load_store_increment,
-                                                          combine_add_vector_elt_undef ]> {
+                                                           combine_offset_load_store_ptradd,
+                                                           combine_offset_load_store_share_ptradd,
+                                                           combine_add_vector_elt_undef ]> {
 }
diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.cpp b/llvm/lib/Target/AIE/AIECombinerHelper.cpp
index acefe19ee6e9..d019faf568a6 100644
--- a/llvm/lib/Target/AIE/AIECombinerHelper.cpp
+++ b/llvm/lib/Target/AIE/AIECombinerHelper.cpp
@@ -58,8 +58,14 @@ MachineInstr *findPreIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI,
                               const AIEBaseInstrInfo &TII) {
   // This is currently done with patterns in instruction selection.
   // No need to do it here.
-  if (MRI.getType(MemI.getOperand(0).getReg()).getSizeInBits() >= 1024)
+  // Vectors wider than the target's combined load/store increment limit
+  // (getMaxSupportedLdStIncSize, in bits) are left alone.
+  const unsigned VecSize =
+      MRI.getType(MemI.getOperand(0).getReg()).getSizeInBits();
+  if (VecSize > TII.getMaxSupportedLdStIncSize()) {
     return nullptr;
+  }
+
   if (!EnableOffsetCombine)
     return nullptr;
   Register Addr = MemI.getOperand(1).getReg();
@@ -320,9 +326,13 @@ MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI,
                                const AIEBaseInstrInfo &TII) {
   if (!EnablePostIncCombine)
     return nullptr;
-  if (MRI.getType(MemI.getOperand(0).getReg()).getSizeInBits() >= 1024)
+  // Vectors wider than the target's combined load/store increment limit
+  // (getMaxSupportedLdStIncSize, in bits) are left alone.
+  const unsigned VecSize =
+      MRI.getType(MemI.getOperand(0).getReg()).getSizeInBits();
+  if (VecSize > TII.getMaxSupportedLdStIncSize()) {
     return nullptr;
-
+  }
   Register Addr = MemI.getOperand(1).getReg();
   for (auto &PtrInc : MRI.use_nodbg_instructions(Addr)) {
     if (MemI.getParent() != PtrInc.getParent())
diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp
index ab5b2d8aa2d0..9266223a9d37 100644
--- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp
+++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp
@@ -259,6 +259,13 @@ unsigned AIE2PInstrInfo::getOffsetMemOpcode(unsigned BaseMemOpcode) const {
   llvm_unreachable("not a generic load/store");
 }
 
+bool AIE2PInstrInfo::isGenericOffsetMemOpcode(unsigned Opcode) const {
+  const bool IsOffsetLoad = Opcode == AIE2P::G_AIE_OFFSET_LOAD ||
+                            Opcode == AIE2P::G_AIE_OFFSET_SEXTLOAD ||
+                            Opcode == AIE2P::G_AIE_OFFSET_ZEXTLOAD;
+  return IsOffsetLoad || Opcode == AIE2P::G_AIE_OFFSET_STORE;
+}
+
 std::optional<unsigned> AIE2PInstrInfo::getCombinedPostIncOpcode(
     MachineInstr &BaseMemI, MachineInstr &PostIncI, TypeSize Size) const {
   switch (PostIncI.getOpcode()) {
@@ -1737,3 +1744,7 @@ unsigned AIE2PInstrInfo::getScalarRegSize() const { return 32; }
 unsigned AIE2PInstrInfo::getBasicVecRegSize() const { return 256; }
 
 unsigned AIE2PInstrInfo::getBasicVectorBitSize() const { return 512; }
+
+unsigned AIE2PInstrInfo::getMaxVectorBitSize() const { return 2048; }
+
+unsigned AIE2PInstrInfo::getMaxSupportedLdStIncSize() const { return 2048; }
diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h
index a377dc372886..2e86375c57b8 100644
--- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h
+++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h
@@ -71,6 +71,8 @@ class AIE2PInstrInfo : public AIE2PGenInstrInfo {
   unsigned getBasicVecRegSize() const override;
 
   unsigned getBasicVectorBitSize() const override;
+  unsigned getMaxVectorBitSize() const override;
+  unsigned getMaxSupportedLdStIncSize() const override;
 
   virtual unsigned
   getNumReservedDelaySlots(const MachineInstr &MI) const override;
@@ -83,6 +85,7 @@ class AIE2PInstrInfo : public AIE2PGenInstrInfo {
   bool isBooleanNoOp(unsigned Opc) const override;
   bool isBooleanNot(unsigned Opc) const override;
   bool isConstStep(const MachineInstr &MI, int64_t &Step) const override;
+  bool isGenericOffsetMemOpcode(unsigned Opcode) const override;
 
   bool verifyGenericInstruction(const MachineInstr &MI,
                                 StringRef &ErrInfo) const override;
diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
index a7f73bfa9978..459fef2076e9 100644
--- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
+++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp
@@ -1747,7 +1747,6 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
     // Scalar loads are handled in tablegen patterns mostly and loads to ptr in
     // selectG_LOAD
     MachineInstr *PtrDef = MRI.getVRegDef(I.getOperand(1).getReg());
-    // TODO: handle remaining load sizes
     if (LoadStoreSize == 128) {
       assert(RBID == AIE2P::VRegBankID &&
              "128-bit vectors should be in the Vector Register Bank");
@@ -1785,7 +1784,7 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
         }
         llvm_unreachable(
             "512-bit vector type must be in AccRegBank or VRegBank "
-            "or FifoRegBankID");
+            "or FifoRegBank");
       }
       if (RBID == AIE2P::AccRegBankID) {
         return {/*ISelOpcode=*/AIE2P::VLDA_dmx_lda_bm_idx_imm,
@@ -1803,7 +1802,7 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
                 /*OffsetOpcode=*/{}};
       }
       llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank "
-                       "or FifoRegBankID");
+                       "or FifoRegBank");
     }
     if (LoadStoreSize == 1024) {
       unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI);
@@ -1823,7 +1822,7 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
                 /*OffsetOpcode=*/AIE2P::VLDA_dmx_lda_x_idx_imm};
       }
       llvm_unreachable("1024-bit vector type must be in AccRegBank or VRegBank "
-                       "or FifoRegBankID");
+                       "or FifoRegBank");
     } else if (LoadStoreSize == 2048) {
       if (RBID == AIE2P::AccRegBankID) {
         return {/*ISelOpcode=*/AIE2P::VLDA_dmx_lda_bm_idx_imm,
@@ -1835,7 +1834,6 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
     break;
   }
   case AIE2P::G_AIE_OFFSET_LOAD: {
-    // TODO: handle remaining load sizes
     if (LoadStoreSize == 128) {
       assert(RBID == AIE2P::VRegBankID &&
              "128-bit vectors should be in the Vector Register Bank");
@@ -1867,7 +1865,36 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
         return {ISelOpcode, FitsImmediateRange,
                 /*OffsetOpcode=*/AIE2P::VLDA_dmx_lda_x_idx_imm};
       }
-      llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank");
+      if (RBID == AIE2P::FifoRegBankID) {
+        ISelOpcode = FitsImmediateRange ? AIE2P::VLDA_dmx_lda_fifohl_idx_imm
+                                        : AIE2P::VLDA_dmx_lda_fifohl_idx;
+        return {ISelOpcode, FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VLDA_dmx_lda_fifohl_idx_imm};
+      }
+      llvm_unreachable("Vector type must be in AccRegBank or VRegBank "
+                       "or FifoRegBank");
+    } else if (LoadStoreSize == 1024 || LoadStoreSize == 2048) {
+      FitsImmediateRange =
+          (LoadStoreSize == 1024)
+              ? checkImmediateRangeSplitting<4, 64, 64>(Offset)
+              : checkImmediateRangeSplitting<4, 64, 192>(Offset);
+      if (RBID == AIE2P::AccRegBankID) {
+        return {/*ISelOpcode=*/AIE2P::VLDA_dmx_lda_bm_idx_imm,
+                FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VLDA_dmx_lda_bm_idx_imm};
+      }
+      if (RBID == AIE2P::VRegBankID) {
+        return {/*ISelOpcode=*/AIE2P::VLDA_dmx_lda_x_idx_imm,
+                FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VLDA_dmx_lda_x_idx_imm};
+      }
+      if (RBID == AIE2P::FifoRegBankID) {
+        return {/*ISelOpcode=*/AIE2P::VLDA_dmx_lda_fifohl_idx_imm,
+                FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VLDA_dmx_lda_fifohl_idx_imm};
+      }
+      llvm_unreachable("Vector type must be in AccRegBank or VRegBank "
+                       "or FifoRegBank");
     }
     if (LoadStoreSize == 20 || LoadStoreSize == 32) {
       FitsImmediateRange = checkImmediateRange<4, 4>(Offset);
@@ -1908,7 +1935,6 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
     break;
   }
   case AIE2P::G_AIE_POSTINC_LOAD: {
-    // TODO: handle remaining load sizes
     if (LoadStoreSize == 128) {
       assert(RBID == AIE2P::VRegBankID &&
              "128-bit vectors should be in the Vector Register Bank");
@@ -1926,7 +1952,8 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
                                       : AIE2P::VLDA_dmw_lda_w_pstm_nrm;
       return {ISelOpcode, FitsImmediateRange,
               /*OffsetOpcode=*/AIE2P::VLDA_dmw_lda_w_idx_imm};
-    } else if (LoadStoreSize == 512) {
+    } else if (LoadStoreSize == 512 || LoadStoreSize == 1024 ||
+               LoadStoreSize == 2048) {
       FitsImmediateRange = checkImmediateRange<4, 64>(Offset);
       if (RBID == AIE2P::AccRegBankID) {
         ISelOpcode = FitsImmediateRange ? AIE2P::VLDA_dmx_lda_bm_pstm_nrm_imm
@@ -1940,7 +1967,15 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
         return {ISelOpcode, FitsImmediateRange,
                 /*OffsetOpcode=*/AIE2P::VLDA_dmx_lda_x_idx_imm};
       }
-      llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank");
+      if (RBID == AIE2P::FifoRegBankID) {
+        ISelOpcode = FitsImmediateRange
+                         ? AIE2P::VLDA_dmx_lda_fifohl_pstm_nrm_imm
+                         : AIE2P::VLDA_dmx_lda_fifohl_pstm_nrm;
+        return {ISelOpcode, FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VLDA_dmx_lda_fifohl_idx_imm};
+      }
+      llvm_unreachable("Vector type must be in AccRegBank or VRegBank "
+                       "or FifoRegBank");
     }
     if (LoadStoreSize == 20 || LoadStoreSize == 32) {
       FitsImmediateRange = checkImmediateRange<4, 4>(Offset);
@@ -1992,14 +2027,19 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
              "256-bit vectors should be in the Vector Register Bank");
       return {/*ISelOpcode=*/AIE2P::VLDA_2D_dmw_lda_w, NoImmediate,
               /*OffsetOpcode=*/{}};
-    } else if (LoadStoreSize == 512) {
+    } else if (LoadStoreSize == 512 || LoadStoreSize == 1024 ||
+               LoadStoreSize == 2048) {
       if (RBID == AIE2P::AccRegBankID)
         return {/*ISelOpcode=*/AIE2P::VLDA_2D_dmx_lda_bm, NoImmediate,
-                /*OffsetOpcode=*/{}};
+                /*OffsetOpcode=*/{AIE2P::VLDA_dmx_lda_bm_idx_imm}};
       if (RBID == AIE2P::VRegBankID)
         return {/*ISelOpcode=*/AIE2P::VLDA_2D_dmx_lda_x, NoImmediate,
-                /*OffsetOpcode=*/{}};
-      llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank");
+                /*OffsetOpcode=*/{AIE2P::VLDA_dmx_lda_x_idx_imm}};
+      if (RBID == AIE2P::FifoRegBankID)
+        return {/*ISelOpcode=*/AIE2P::VLDA_2D_dmx_lda_fifohl, NoImmediate,
+                /*OffsetOpcode=*/{AIE2P::VLDA_dmx_lda_fifohl_idx_imm}};
+      llvm_unreachable("Vector type must be in AccRegBank or VRegBank "
+                       "or FifoRegBank");
     }
     if (LoadStoreSize == 20 || LoadStoreSize == 32) {
       return {/*ISelOpcode=*/AIE2P::LDA_2D_dms_lda, NoImmediate,
@@ -2039,14 +2079,19 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
              "256-bit vectors should be in the Vector Register Bank");
       return {/*ISelOpcode=*/AIE2P::VLDA_3D_dmw_lda_w, NoImmediate,
               /*OffsetOpcode=*/{}};
-    } else if (LoadStoreSize == 512) {
+    } else if (LoadStoreSize == 512 || LoadStoreSize == 1024 ||
+               LoadStoreSize == 2048) {
       if (RBID == AIE2P::AccRegBankID)
         return {/*ISelOpcode=*/AIE2P::VLDA_3D_dmx_lda_bm, NoImmediate,
-                /*OffsetOpcode=*/{}};
+                /*OffsetOpcode=*/{AIE2P::VLDA_dmx_lda_bm_idx_imm}};
       if (RBID == AIE2P::VRegBankID)
         return {/*ISelOpcode=*/AIE2P::VLDA_3D_dmx_lda_x, NoImmediate,
-                /*OffsetOpcode=*/{}};
-      llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank");
+                /*OffsetOpcode=*/{AIE2P::VLDA_dmx_lda_x_idx_imm}};
+      if (RBID == AIE2P::FifoRegBankID)
+        return {/*ISelOpcode=*/AIE2P::VLDA_3D_dmx_lda_fifohl, NoImmediate,
+                /*OffsetOpcode=*/{AIE2P::VLDA_dmx_lda_fifohl_idx_imm}};
+      llvm_unreachable("Vector type must be in AccRegBank or VRegBank "
+                       "or FifoRegBank");
     }
     if (LoadStoreSize == 20 || LoadStoreSize == 32) {
       return {/*ISelOpcode=*/AIE2P::LDA_3D_dms_lda, NoImmediate,
@@ -2077,7 +2122,6 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
   case AIE2P::G_STORE: {
     // Scalar stores are handled in tablegen patterns
     MachineInstr *PtrDef = MRI.getVRegDef(I.getOperand(1).getReg());
-    // TODO: handle remaining store sizes
     if (LoadStoreSize == 128) {
       assert(RBID == AIE2P::VRegBankID &&
              "128-bit vectors should be in the Vector Register Bank");
@@ -2114,7 +2158,7 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
                   /*OffsetOpcode=*/{}};
         }
         llvm_unreachable("512-bit vector type must be in AccRegBank or "
-                         "VRegBank or FifoRegBankID");
+                         "VRegBank or FifoRegBank");
       } else {
         if (RBID == AIE2P::AccRegBankID) {
           return {/*ISelOpcode=*/AIE2P::VST_dmx_sts_bm_idx_imm,
@@ -2132,7 +2176,7 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
                   /*OffsetOpcode=*/AIE2P::VST_dmx_sts_fifohl_idx_imm};
         }
         llvm_unreachable("512-bit vector type must be in AccRegBank or "
-                         "VRegBank or FifoRegBankID");
+                         "VRegBank or FifoRegBank");
       }
     } else if (LoadStoreSize == 1024) {
       if (RBID == AIE2P::FifoRegBankID) {
@@ -2151,7 +2195,7 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
                 /*OffsetOpcode=*/AIE2P::VST_dmx_sts_x_idx_imm};
       }
       llvm_unreachable("1024-bit vector type must be in AccRegBank or "
-                       "VRegBank or FifoRegBankID");
+                       "VRegBank or FifoRegBank");
     } else if (LoadStoreSize == 2048) {
       assert(RBID == AIE2P::AccRegBankID &&
              "2048-bit vectors should be in the Accumulator Register Bank");
@@ -2164,7 +2208,6 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
     break;
   }
   case AIE2P::G_AIE_OFFSET_STORE: {
-    // TODO: handle remaining store sizes
     if (LoadStoreSize == 128) {
       assert(RBID == AIE2P::VRegBankID &&
              "128-bit vectors should be in the Vector Register Bank");
@@ -2196,7 +2239,35 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
         return {ISelOpcode, FitsImmediateRange,
                 /*OffsetOpcode=*/AIE2P::VST_dmx_sts_x_idx_imm};
       }
-      llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank");
+      if (RBID == AIE2P::FifoRegBankID) {
+        ISelOpcode = FitsImmediateRange ? AIE2P::VST_dmx_sts_fifohl_idx_imm
+                                        : AIE2P::VST_dmx_sts_fifohl_idx;
+        return {ISelOpcode, FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_fifohl_idx_imm};
+      }
+      llvm_unreachable("512-bit vector type must be in AccRegBank or "
+                       "VRegBank or FifoRegBank");
+    } else if (LoadStoreSize == 1024 || LoadStoreSize == 2048) {
+      FitsImmediateRange =
+          (LoadStoreSize == 1024)
+              ? checkImmediateRangeSplitting<4, 64, 64>(Offset)
+              : checkImmediateRangeSplitting<4, 64, 192>(Offset);
+      if (RBID == AIE2P::AccRegBankID) {
+        return {/*ISelOpcode=*/AIE2P::VST_dmx_sts_bm_idx_imm,
+                FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_bm_idx_imm};
+      }
+      if (RBID == AIE2P::VRegBankID) {
+        return {/*ISelOpcode=*/AIE2P::VST_dmx_sts_x_idx_imm, FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_x_idx_imm};
+      }
+      if (RBID == AIE2P::FifoRegBankID) {
+        return {/*ISelOpcode=*/AIE2P::VST_dmx_sts_fifohl_idx_imm,
+                FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_fifohl_idx_imm};
+      }
+      llvm_unreachable("Vector type must be in AccRegBank or "
+                       "VRegBank or FifoRegBank");
     }
     if (LoadStoreSize == 20 || LoadStoreSize == 32) {
       FitsImmediateRange = checkImmediateRange<4, 4>(Offset);
@@ -2219,7 +2290,6 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
   }
   case AIE2P::G_AIE_POSTINC_STORE: {
     RBID = deriveRegBankID(I.getOperand(1).getReg(), MRI, RBI);
-    // TODO: handle remaining store sizes
     if (LoadStoreSize == 128) {
       assert(RBID == AIE2P::VRegBankID &&
              "128-bit vectors should be in the Vector Register Bank");
@@ -2237,21 +2307,29 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
                                       : AIE2P::VST_dmw_sts_w_pstm_nrm;
       return {ISelOpcode, FitsImmediateRange,
               /*OffsetOpcode=*/AIE2P::VST_dmw_sts_w_pstm_nrm_imm};
-    } else if (LoadStoreSize == 512) {
+    } else if (LoadStoreSize == 512 || LoadStoreSize == 1024 ||
+               LoadStoreSize == 2048) {
       FitsImmediateRange = checkImmediateRange<4, 64>(Offset);
       if (RBID == AIE2P::AccRegBankID) {
         ISelOpcode = FitsImmediateRange ? AIE2P::VST_dmx_sts_bm_pstm_nrm_imm
                                         : AIE2P::VST_dmx_sts_bm_pstm_nrm;
         return {ISelOpcode, FitsImmediateRange,
-                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_bm_pstm_nrm_imm};
+                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_bm_idx_imm};
       }
       if (RBID == AIE2P::VRegBankID) {
         ISelOpcode = FitsImmediateRange ? AIE2P::VST_dmx_sts_x_pstm_nrm_imm
                                         : AIE2P::VST_dmx_sts_x_pstm_nrm;
         return {ISelOpcode, FitsImmediateRange,
-                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_x_pstm_nrm_imm};
+                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_x_idx_imm};
       }
-      llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank");
+      if (RBID == AIE2P::FifoRegBankID) {
+        ISelOpcode = FitsImmediateRange ? AIE2P::VST_dmx_sts_fifohl_pstm_nrm_imm
+                                        : AIE2P::VST_dmx_sts_fifohl_pstm_nrm;
+        return {ISelOpcode, FitsImmediateRange,
+                /*OffsetOpcode=*/AIE2P::VST_dmx_sts_fifohl_idx_imm};
+      }
+      llvm_unreachable("Vector type must be in AccRegBank or VRegBank "
+                       "or FifoRegBank");
     }
     if (LoadStoreSize == 20 || LoadStoreSize == 32) {
       FitsImmediateRange = checkImmediateRange<4, 4>(Offset);
@@ -2275,7 +2353,6 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
   }
   case AIE2P::G_AIE_POSTINC_2D_STORE: {
     RBID = deriveRegBankID(I.getOperand(2).getReg(), MRI, RBI);
-    // TODO: handle remaining store sizes
     if (LoadStoreSize == 128) {
       assert(RBID == AIE2P::VRegBankID &&
              "128-bit vectors should be in the Vector Register Bank");
@@ -2287,16 +2364,22 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
              "256-bit vectors should be in the Vector Register Bank");
       return {AIE2P::VST_2D_dmw_sts_w, NoImmediate,
               /*OffsetOpcode=*/{}};
-    } else if (LoadStoreSize == 512) {
+    } else if (LoadStoreSize == 512 || LoadStoreSize == 1024 ||
+               LoadStoreSize == 2048) {
       if (RBID == AIE2P::AccRegBankID) {
         return {AIE2P::VST_2D_dmx_sts_bm, NoImmediate,
-                /*OffsetOpcode=*/{}};
+                /*OffsetOpcode=*/{AIE2P::VST_dmx_sts_bm_idx_imm}};
       }
       if (RBID == AIE2P::VRegBankID) {
         return {AIE2P::VST_2D_dmx_sts_x, NoImmediate,
-                /*OffsetOpcode=*/{}};
+                /*OffsetOpcode=*/{AIE2P::VST_dmx_sts_x_idx_imm}};
+      }
+      if (RBID == AIE2P::FifoRegBankID) { // FIFO data needs the fifohl form.
+        return {AIE2P::VST_2D_dmx_sts_fifohl, NoImmediate,
+                /*OffsetOpcode=*/{AIE2P::VST_dmx_sts_fifohl_idx_imm}};
       }
-      llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank");
+      llvm_unreachable("Vector type must be in AccRegBank or VRegBank "
+                       "or FifoRegBank");
     }
     if (LoadStoreSize == 20 || LoadStoreSize == 32) {
       return {/*ISelOpcode=*/AIE2P::ST_2D_dms_sts, NoImmediate,
@@ -2314,7 +2397,6 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
   }
   case AIE2P::G_AIE_POSTINC_3D_STORE: {
     RBID = deriveRegBankID(I.getOperand(3).getReg(), MRI, RBI);
-    // TODO: handle remaining store sizes
     if (LoadStoreSize == 128) {
       assert(RBID == AIE2P::VRegBankID &&
              "128-bit vectors should be in the Vector Register Bank");
@@ -2326,16 +2408,22 @@ LoadStoreOpcodes AIE2PInstructionSelector::getLoadStoreOpcode(
              "256-bit vectors should be in the Vector Register Bank");
       return {AIE2P::VST_3D_dmw_sts_w, NoImmediate,
               /*OffsetOpcode=*/{}};
-    } else if (LoadStoreSize == 512) {
+    } else if (LoadStoreSize == 512 || LoadStoreSize == 1024 ||
+               LoadStoreSize == 2048) {
       if (RBID == AIE2P::AccRegBankID) {
         return {AIE2P::VST_3D_dmx_sts_bm, NoImmediate,
-                /*OffsetOpcode=*/{}};
+                /*OffsetOpcode=*/{AIE2P::VST_dmx_sts_bm_idx_imm}};
       }
       if (RBID == AIE2P::VRegBankID) {
         return {AIE2P::VST_3D_dmx_sts_x, NoImmediate,
-                /*OffsetOpcode=*/{}};
+                /*OffsetOpcode=*/{AIE2P::VST_dmx_sts_x_idx_imm}};
       }
-      llvm_unreachable("512-bit vector type must be in AccRegBank or VRegBank");
+      if (RBID == AIE2P::FifoRegBankID) {
+        return {AIE2P::VST_3D_dmx_sts_x, NoImmediate,
+                /*OffsetOpcode=*/{AIE2P::VST_dmx_sts_fifohl_idx_imm}};
+      }
+      llvm_unreachable("Vector type must be in AccRegBank or VRegBank "
+                       "or FifoRegBank");
     }
     if (LoadStoreSize == 20 || LoadStoreSize == 32) {
       return {/*ISelOpcode=*/AIE2P::ST_3D_dms_sts, NoImmediate,
@@ -2364,7 +2452,8 @@ bool AIE2PInstructionSelector::selectWideG_AIE_LOAD_STORE(
   LLT SrcDstTy = MRI.getType(AMI.SrcDstOp.getReg());
   const unsigned SrcDstTySize = SrcDstTy.getSizeInBits();
   const unsigned SplitFactor = (SrcDstTySize == 1024) ? 2 : 4;
-  const unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI);
+  const unsigned OpIdx = AMI.MemI.mayStore() ? I.getNumExplicitDefs() : 0;
+  const unsigned RBID = deriveRegBankID(I.getOperand(OpIdx).getReg(), MRI, RBI);
   const TargetRegisterClass *RC512 = nullptr;
   const TargetRegisterClass *RC1024 = nullptr;
   const TargetRegisterClass *RC2048 = &AIE2P::ACC2048RegClass;
@@ -2410,25 +2499,28 @@ bool AIE2PInstructionSelector::selectWideG_AIE_LOAD_STORE(
 
   SmallVector<MachineInstrBuilder, 4> SplitInstrs;
   switch (AMI.MemI.getOpcode()) {
-  case AIE2P::G_STORE: {
+  case AIE2P::G_STORE:
+  case AIE2P::G_AIE_POSTINC_STORE:
+  case AIE2P::G_AIE_POSTINC_2D_STORE:
+  case AIE2P::G_AIE_POSTINC_3D_STORE: {
     for (unsigned SubRegIdx = 0; SubRegIdx < SplitFactor; ++SubRegIdx) {
       const unsigned Offset = SubRegIdx * 64;
       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {SubRegs[SubRegIdx]}, {})
                       .addReg(AMI.SrcDstOp.getReg(), 0,
                               SubRegIdxes[SubRegIdx % SubRegIdxes.size()]);
-
-      auto StoreInstr = (SubRegIdx == 0)
-                            ? MIB.buildInstr(LSO.ISelOpcode, {}, {})
-                            : MIB.buildInstr(*LSO.OffsetOpcode, {}, {})
-                                  .addReg(Copy.getReg(0))
-                                  .addReg(AMI.PtrOp.getReg())
-                                  .addImm(Offset);
-
+      MachineInstrBuilder StoreInstr;
       if (SubRegIdx == 0) {
+        StoreInstr = MIB.buildInstr(LSO.ISelOpcode, {}, {});
         for (auto Def : AMI.MemI.defs())
           StoreInstr.addDef(Def.getReg());
         StoreInstr.addReg(Copy.getReg(0));
         addAddressingMode(StoreInstr, AMI, LSO.FitsImmediateRange, false, MRI);
+      } else {
+
+        StoreInstr = MIB.buildInstr(*LSO.OffsetOpcode, {}, {})
+                         .addReg(Copy.getReg(0))
+                         .addReg(AMI.PtrOp.getReg())
+                         .addImm(Offset);
       }
       SplitInstrs.push_back(StoreInstr);
     }
@@ -2436,22 +2528,114 @@ bool AIE2PInstructionSelector::selectWideG_AIE_LOAD_STORE(
     handleSplitMemOperands(SplitInstrs);
     break;
   }
-  case AIE2P::G_LOAD: {
+  case AIE2P::G_AIE_OFFSET_STORE: {
+    if (!LSO.FitsImmediateRange) {
+      // Emit a PTR_ADD to evaluate the offset
+      insertPtrAddForOffset(MRI, AMI.MemI);
+    }
+    for (unsigned SubRegIdx = 0; SubRegIdx < SplitFactor; ++SubRegIdx) {
+      const unsigned Offset = SubRegIdx * 64;
+      auto Copy = MIB.buildInstr(TargetOpcode::COPY, {SubRegs[SubRegIdx]}, {})
+                      .addReg(AMI.SrcDstOp.getReg(), 0,
+                              SubRegIdxes[SubRegIdx % SubRegIdxes.size()]);
+      MachineInstrBuilder StoreInstr;
+      if (SubRegIdx == 0) {
+        StoreInstr = MIB.buildInstr(LSO.ISelOpcode, {}, {});
+        StoreInstr.addReg(Copy.getReg(0));
+
+        StoreInstr.addUse(AMI.PtrOp.getReg());
+
+        if (LSO.FitsImmediateRange) {
+          StoreInstr.addImm(AMI.ImmediateOffset->getSExtValue()); // Offset
+        } else {
+          // In this case we have already inserted a PTR_ADD to add the offset
+          // to the base pointer
+          StoreInstr.addImm(0); // Offset
+        }
+      } else { // SubRegIdx != 0
+        if (LSO.FitsImmediateRange) {
+          StoreInstr = MIB.buildInstr(*LSO.OffsetOpcode, {}, {})
+                           .addReg(Copy.getReg(0))
+                           .addReg(AMI.PtrOp.getReg())
+                           .addImm(AMI.ImmediateOffset->getSExtValue() +
+                                   Offset); // Offset
+        } else {
+          StoreInstr = MIB.buildInstr(*LSO.OffsetOpcode, {}, {})
+                           .addReg(Copy.getReg(0))
+                           .addReg(AMI.PtrOp.getReg())
+                           .addImm(Offset); // Offset
+        }
+      }
+      SplitInstrs.push_back(StoreInstr);
+    }
+    handleSplitMemOperands(SplitInstrs);
+    break;
+  }
+  case AIE2P::G_LOAD:
+  case AIE2P::G_AIE_POSTINC_LOAD:
+  case AIE2P::G_AIE_POSTINC_2D_LOAD:
+  case AIE2P::G_AIE_POSTINC_3D_LOAD: {
     for (unsigned SubRegIdx = 0; SubRegIdx < SplitFactor; ++SubRegIdx) {
-      auto Load = (SubRegIdx == 0)
-                      ? MIB.buildInstr(LSO.ISelOpcode, {SubRegs[SubRegIdx]}, {})
-                      : MIB.buildInstr(*LSO.OffsetOpcode, {}, {})
-                            .addDef(SubRegs[SubRegIdx])
-                            .addUse(AMI.PtrOp.getReg())
-                            .addImm(SubRegIdx * 64);
+      MachineInstrBuilder Load;
       if (SubRegIdx == 0) {
+        Load = MIB.buildInstr(LSO.ISelOpcode, {SubRegs[0]}, {});
         for (auto *Def = AMI.MemI.defs().begin() + 1;
              Def != AMI.MemI.defs().end(); Def++) {
           Load.addDef(Def->getReg());
         }
         addAddressingMode(Load, AMI, LSO.FitsImmediateRange, false, MRI);
+      } else {
+        Load = MIB.buildInstr(*LSO.OffsetOpcode, {}, {})
+                   .addDef(SubRegs[SubRegIdx])
+                   .addUse(AMI.PtrOp.getReg())
+                   .addImm(SubRegIdx * 64);
       }
+      SplitInstrs.push_back(Load);
+    }
+    auto RegSeq =
+        MIB.buildInstr(AIE2P::REG_SEQUENCE, {AMI.SrcDstOp.getReg()}, {});
+    for (unsigned SubRegIdx = 0; SubRegIdx < SplitFactor; ++SubRegIdx) {
+      RegSeq.addReg(SubRegs[SubRegIdx]).addImm(SubRegIdxes[SubRegIdx]);
+    }
+    Register SrcDstReg = AMI.SrcDstOp.getReg();
+    if (!RBI.constrainGenericRegister(
+            SrcDstReg, *(SrcDstTySize == 2048 ? RC2048 : RC1024), MRI))
+      return false;
 
+    handleSplitMemOperands(SplitInstrs);
+    break;
+  }
+  case AIE2P::G_AIE_OFFSET_LOAD: {
+    if (!LSO.FitsImmediateRange) {
+      // Emit a PTR_ADD to evaluate the offset
+      insertPtrAddForOffset(MRI, AMI.MemI);
+    }
+    for (unsigned SubRegIdx = 0; SubRegIdx < SplitFactor; ++SubRegIdx) {
+      MachineInstrBuilder Load;
+      if (SubRegIdx == 0) {
+        Load = MIB.buildInstr(LSO.ISelOpcode, {SubRegs[0]}, {})
+                   .addUse(AMI.PtrOp.getReg());
+        if (LSO.FitsImmediateRange) {
+          Load.addImm(AMI.ImmediateOffset->getSExtValue()); // Offset
+        } else {
+          // In this case we have already inserted a PTR_ADD to add the offset
+          // to the base pointer
+          Load.addImm(0); // Offset
+        }
+      } else { // SubRegIdx != 0
+        if (LSO.FitsImmediateRange) {
+          Load = MIB.buildInstr(*LSO.OffsetOpcode, {}, {})
+                     .addDef(SubRegs[SubRegIdx])
+                     .addUse(AMI.PtrOp.getReg())
+                     .addImm(AMI.ImmediateOffset->getSExtValue() +
+                             SubRegIdx * 64); // Offset
+        } else {
+          Load = MIB.buildInstr(*LSO.OffsetOpcode, {}, {})
+                     .addDef(SubRegs[SubRegIdx])
+                     .addUse(AMI.PtrOp.getReg())
+                     .addImm(SubRegIdx * 64); // Offset
+        }
+      }
       SplitInstrs.push_back(Load);
     }
     auto RegSeq =
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-indexed-load-store.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-indexed-load-store.mir
index 9f595931909b..acff254d7545 100644
--- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-indexed-load-store.mir
+++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-indexed-load-store.mir
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
-# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates
+# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
 # RUN: llc -mtriple aie2p -run-pass=instruction-select %s -verify-machineinstrs -o - | FileCheck %s
 
 ---
@@ -69,14 +69,14 @@ body:             |
 ...
 
 ---
-name:            VEC256_LOAD_overMinOffset
+name:            VEC256_LOAD_outOfMinRangeOffset
 alignment:       16
 legalized:       true
 regBankSelected: true
 body:             |
   bb.1.entry:
     liveins: $p0
-    ; CHECK-LABEL: name: VEC256_LOAD_overMinOffset
+    ; CHECK-LABEL: name: VEC256_LOAD_outOfMinRangeOffset
     ; CHECK: liveins: $p0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -151,14 +151,14 @@ body:             |
 ...
 
 ---
-name:            VEC512_LOAD_overMinOffset
+name:            VEC512_LOAD_outOfMinRangeOffset
 alignment:       16
 legalized:       true
 regBankSelected: true
 body:             |
   bb.1.entry:
     liveins: $p0
-    ; CHECK-LABEL: name: VEC512_LOAD_overMinOffset
+    ; CHECK-LABEL: name: VEC512_LOAD_outOfMinRangeOffset
     ; CHECK: liveins: $p0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -233,14 +233,14 @@ body:             |
 ...
 
 ---
-name:            ACC512_LOAD_overMinOffset
+name:            ACC512_LOAD_outOfMinRangeOffset
 alignment:       16
 legalized:       true
 regBankSelected: true
 body:             |
   bb.1.entry:
     liveins: $p0
-    ; CHECK-LABEL: name: ACC512_LOAD_overMinOffset
+    ; CHECK-LABEL: name: ACC512_LOAD_outOfMinRangeOffset
     ; CHECK: liveins: $p0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -315,14 +315,14 @@ body:             |
 ...
 
 ---
-name:            VEC256_STORE_overMinOffset
+name:            VEC256_STORE_outOfMinRangeOffset
 alignment:       16
 legalized:       true
 regBankSelected: true
 body:             |
   bb.1.entry:
     liveins: $p0, $wl0
-    ; CHECK-LABEL: name: VEC256_STORE_overMinOffset
+    ; CHECK-LABEL: name: VEC256_STORE_outOfMinRangeOffset
     ; CHECK: liveins: $p0, $wl0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec256 = COPY $wl0
@@ -397,14 +397,14 @@ body:             |
 ...
 
 ---
-name:            VEC512_STORE_overMinOffset
+name:            VEC512_STORE_outOfMinRangeOffset
 alignment:       16
 legalized:       true
 regBankSelected: true
 body:             |
   bb.1.entry:
     liveins: $p0, $x0
-    ; CHECK-LABEL: name: VEC512_STORE_overMinOffset
+    ; CHECK-LABEL: name: VEC512_STORE_outOfMinRangeOffset
     ; CHECK: liveins: $p0, $x0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
@@ -479,14 +479,14 @@ body:             |
 ...
 
 ---
-name:            ACC512_STORE_overMinOffset
+name:            ACC512_STORE_outOfMinRangeOffset
 alignment:       16
 legalized:       true
 regBankSelected: true
 body:             |
   bb.1.entry:
     liveins: $bmll0, $p0
-    ; CHECK-LABEL: name: ACC512_STORE_overMinOffset
+    ; CHECK-LABEL: name: ACC512_STORE_outOfMinRangeOffset
     ; CHECK: liveins: $bmll0, $p0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc512 = COPY $bmll0
@@ -498,3 +498,936 @@ body:             |
     %2:modregbank(s20) = G_CONSTANT i20 -576
     G_AIE_OFFSET_STORE %0:accregbank(<8 x s64>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<8 x s64>))
 ...
+
+---
+name:            FIFO512_LOAD_maxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: FIFO512_LOAD_maxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 448 :: (load (<16 x s32>))
+    ; CHECK-NEXT: $lfl0 = COPY [[VLDA_dmx_lda_fifohl_idx_imm]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 448
+    %0:fiforegbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>))
+    $lfl0 = COPY %0:fiforegbank(<16 x s32>)
+...
+
+---
+name:            FIFO512_LOAD_minOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: FIFO512_LOAD_minOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], -512 :: (load (<16 x s32>))
+    ; CHECK-NEXT: $lfl0 = COPY [[VLDA_dmx_lda_fifohl_idx_imm]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -512
+    %0:fiforegbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>))
+    $lfl0 = COPY %0:fiforegbank(<16 x s32>)
+...
+
+---
+name:            FIFO512_LOAD_overMaxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: FIFO512_LOAD_overMaxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx [[COPY]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>))
+    ; CHECK-NEXT: $lfl0 = COPY [[VLDA_dmx_lda_fifohl_idx]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 512
+    %0:fiforegbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>))
+    $lfl0 = COPY %0:fiforegbank(<16 x s32>)
+...
+
+---
+name:            FIFO512_LOAD_outOfMinRangeOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: FIFO512_LOAD_outOfMinRangeOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:edj = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx [[COPY]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>))
+    ; CHECK-NEXT: $lfl0 = COPY [[VLDA_dmx_lda_fifohl_idx]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576
+    %0:fiforegbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>))
+    $lfl0 = COPY %0:fiforegbank(<16 x s32>)
+...
+
+---
+name:            FIFO512_STORE_maxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $lfl0
+    ; CHECK-LABEL: name: FIFO512_STORE_maxOffset
+    ; CHECK: liveins: $p0, $lfl0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fifo512 = COPY $lfl0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY]], [[COPY1]], 448 :: (store (<16 x s32>))
+    %0:fiforegbank(<16 x s32>) = COPY $lfl0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 448
+    G_AIE_OFFSET_STORE %0:fiforegbank(<16 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<16 x s32>))
+...
+
+---
+name:            FIFO512_STORE_minOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $lfl0
+    ; CHECK-LABEL: name: FIFO512_STORE_minOffset
+    ; CHECK: liveins: $p0, $lfl0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fifo512 = COPY $lfl0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY]], [[COPY1]], -512 :: (store (<16 x s32>))
+    %0:fiforegbank(<16 x s32>) = COPY $lfl0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -512
+    G_AIE_OFFSET_STORE %0:fiforegbank(<16 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<16 x s32>))
+...
+
+---
+name:            FIFO512_STORE_overMaxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $lfl0
+    ; CHECK-LABEL: name: FIFO512_STORE_overMaxOffset
+    ; CHECK: liveins: $p0, $lfl0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fifo512 = COPY $lfl0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx [[COPY]], [[COPY1]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>))
+    %0:fiforegbank(<16 x s32>) = COPY $lfl0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 512
+    G_AIE_OFFSET_STORE %0:fiforegbank(<16 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<16 x s32>))
+...
+
+---
+name:            FIFO512_STORE_outOfMinRangeOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $lfl0
+    ; CHECK-LABEL: name: FIFO512_STORE_outOfMinRangeOffset
+    ; CHECK: liveins: $p0, $lfl0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fifo512 = COPY $lfl0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:edj = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx [[COPY]], [[COPY1]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>))
+    %0:fiforegbank(<16 x s32>) = COPY $lfl0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576
+    G_AIE_OFFSET_STORE %0:fiforegbank(<16 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<16 x s32>))
+...
+
+---
+name:            VEC1024_LOAD_maxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: VEC1024_LOAD_maxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[COPY]], 384 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm1:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[COPY]], 448 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_idx_imm]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm1]], %subreg.sub_512_hi
+    ; CHECK-NEXT: $y0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 384
+    %0:vregbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $y0 = COPY %0:vregbank(<32 x s32>)
+...
+
+---
+name:            VEC1024_LOAD_minOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: VEC1024_LOAD_minOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[COPY]], -448 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm1:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[COPY]], -384 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_idx_imm]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm1]], %subreg.sub_512_hi
+    ; CHECK-NEXT: $y0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -448
+    %0:vregbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $y0 = COPY %0:vregbank(<32 x s32>)
+...
+
+---
+name:            VEC1024_LOAD_overMaxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: VEC1024_LOAD_overMaxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY]], 448
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[PADD_imm_pseudo]], 0 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm1:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[PADD_imm_pseudo]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_idx_imm]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm1]], %subreg.sub_512_hi
+    ; CHECK-NEXT: $y0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 448
+    %0:vregbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $y0 = COPY %0:vregbank(<32 x s32>)
+...
+
+---
+name:            VEC1024_LOAD_outOfMinRangeOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: VEC1024_LOAD_outOfMinRangeOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[MOV_PD_imm11_pseudo]]
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[PADD_mod_pseudo]], 0 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm1:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[PADD_mod_pseudo]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_idx_imm]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm1]], %subreg.sub_512_hi
+    ; CHECK-NEXT: $y0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576
+    %0:vregbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $y0 = COPY %0:vregbank(<32 x s32>)
+...
+
+---
+name:            VEC1024_STORE_maxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $y0
+    ; CHECK-LABEL: name: VEC1024_STORE_maxOffset
+    ; CHECK: liveins: $p0, $y0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec1024 = COPY $y0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY]].sub_512_lo
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY1]], 384 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY1]], 448 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:vregbank(<32 x s32>) = COPY $y0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 384
+    G_AIE_OFFSET_STORE %0:vregbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            VEC1024_STORE_minOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $y0
+    ; CHECK-LABEL: name: VEC1024_STORE_minOffset
+    ; CHECK: liveins: $p0, $y0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec1024 = COPY $y0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY]].sub_512_lo
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[COPY1]], -512 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY1]], -448 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:vregbank(<32 x s32>) = COPY $y0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -512
+    G_AIE_OFFSET_STORE %0:vregbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            VEC1024_STORE_overMaxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $y0
+    ; CHECK-LABEL: name: VEC1024_STORE_overMaxOffset
+    ; CHECK: liveins: $p0, $y0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec1024 = COPY $y0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY1]], 448
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY]].sub_512_lo
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[PADD_imm_pseudo]], 0 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[PADD_imm_pseudo]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:vregbank(<32 x s32>) = COPY $y0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 448
+    G_AIE_OFFSET_STORE %0:vregbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            VEC1024_STORE_outOfMinRangeOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $y0
+    ; CHECK-LABEL: name: VEC1024_STORE_outOfMinRangeOffset
+    ; CHECK: liveins: $p0, $y0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec1024 = COPY $y0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY1]], [[MOV_PD_imm11_pseudo]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY]].sub_512_lo
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY2]], [[PADD_mod_pseudo]], 0 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[PADD_mod_pseudo]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:vregbank(<32 x s32>) = COPY $y0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576
+    G_AIE_OFFSET_STORE %0:vregbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            ACC1024_LOAD_maxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: ACC1024_LOAD_maxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 384 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 448 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: $cml0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 384
+    %0:accregbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $cml0 = COPY %0:accregbank(<32 x s32>)
+...
+
+---
+name:            ACC1024_LOAD_minOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: ACC1024_LOAD_minOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], -448 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], -384 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: $cml0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -448
+    %0:accregbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $cml0 = COPY %0:accregbank(<32 x s32>)
+...
+
+---
+name:            ACC1024_LOAD_overMaxOffset
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: ACC1024_LOAD_overMaxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY]], 448
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_imm_pseudo]], 0 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_imm_pseudo]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: $cml0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 448
+    %0:accregbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $cml0 = COPY %0:accregbank(<32 x s32>)
+...
+
+---
+name:            ACC1024_LOAD_outOfMinRangeOffset # Offset load of <32 x s32> (accregbank) with constant offset -576, selected as two <16 x s32> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: ACC1024_LOAD_outOfMinRangeOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[MOV_PD_imm11_pseudo]]
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_mod_pseudo]], 0 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_mod_pseudo]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: $cml0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576 ; expected selection materializes -576 with MOV_PD_imm11_pseudo, adds it via PADD_mod_pseudo, then loads the halves at immediates 0 and 64
+    %0:accregbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $cml0 = COPY %0:accregbank(<32 x s32>)
+...
+
+---
+name:            ACC1024_STORE_maxOffset # Offset store of <32 x s32> (accregbank) with constant offset 384, selected as two <16 x s32> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $cml0
+    ; CHECK-LABEL: name: ACC1024_STORE_maxOffset
+    ; CHECK: liveins: $p0, $cml0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc1024 = COPY $cml0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY1]], 384 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY1]], 448 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:accregbank(<32 x s32>) = COPY $cml0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 384 ; expected selection keeps the offset as store immediates: 384 for the low half, 448 for the high half
+    G_AIE_OFFSET_STORE %0:accregbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            ACC1024_STORE_minOffset # Offset store of <32 x s32> (accregbank) with constant offset -512, selected as two <16 x s32> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $cml0
+    ; CHECK-LABEL: name: ACC1024_STORE_minOffset
+    ; CHECK: liveins: $p0, $cml0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc1024 = COPY $cml0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY1]], -512 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY1]], -448 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:accregbank(<32 x s32>) = COPY $cml0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -512 ; expected selection keeps the offset as store immediates: -512 for the low half, -448 for the high half
+    G_AIE_OFFSET_STORE %0:accregbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            ACC1024_STORE_overMaxOffset # Offset store of <32 x s32> (accregbank) with constant offset 448, selected as two <16 x s32> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $cml0
+    ; CHECK-LABEL: name: ACC1024_STORE_overMaxOffset
+    ; CHECK: liveins: $p0, $cml0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc1024 = COPY $cml0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY1]], 448
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[PADD_imm_pseudo]], 0 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[PADD_imm_pseudo]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:accregbank(<32 x s32>) = COPY $cml0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 448 ; expected selection rebases the pointer with PADD_imm_pseudo 448 and stores the halves at immediates 0 and 64 (presumably 448+64 no longer fits the immediate - confirm)
+    G_AIE_OFFSET_STORE %0:accregbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            ACC1024_STORE_outOfMinRangeOffset # Offset store of <32 x s32> (accregbank) with constant offset -576, selected as two <16 x s32> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $cml0
+    ; CHECK-LABEL: name: ACC1024_STORE_outOfMinRangeOffset
+    ; CHECK: liveins: $p0, $cml0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc1024 = COPY $cml0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY1]], [[MOV_PD_imm11_pseudo]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[PADD_mod_pseudo]], 0 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[PADD_mod_pseudo]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:accregbank(<32 x s32>) = COPY $cml0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576 ; expected selection materializes -576 with MOV_PD_imm11_pseudo, adds it via PADD_mod_pseudo, then stores the halves at immediates 0 and 64
+    G_AIE_OFFSET_STORE %0:accregbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            FIFO1024_LOAD_maxOffset # Offset load of <32 x s32> (fiforegbank) with constant offset 384, selected as two <16 x s32> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: FIFO1024_LOAD_maxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 384 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 448 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: $lf0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 384 ; expected selection keeps the offset as load immediates: 384 for the low half, 448 for the high half
+    %0:fiforegbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $lf0 = COPY %0:fiforegbank(<32 x s32>)
+...
+
+---
+name:            FIFO1024_LOAD_minOffset # Offset load of <32 x s32> (fiforegbank) with constant offset -448, selected as two <16 x s32> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: FIFO1024_LOAD_minOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], -448 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], -384 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: $lf0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -448 ; expected selection keeps the offset as load immediates: -448 (low half), -384 (high half). NOTE(review): FIFO1024_STORE_minOffset uses -512 as its minimum; confirm -448 is the intended lower bound here
+    %0:fiforegbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $lf0 = COPY %0:fiforegbank(<32 x s32>)
+...
+
+---
+name:            FIFO1024_LOAD_overMaxOffset # Offset load of <32 x s32> (fiforegbank) with constant offset 448, selected as two <16 x s32> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: FIFO1024_LOAD_overMaxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY]], 448
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[PADD_imm_pseudo]], 0 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[PADD_imm_pseudo]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: $lf0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 448 ; expected selection rebases the pointer with PADD_imm_pseudo 448 and loads the halves at immediates 0 and 64 (presumably 448+64 no longer fits the immediate - confirm)
+    %0:fiforegbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $lf0 = COPY %0:fiforegbank(<32 x s32>)
+...
+
+---
+name:            FIFO1024_LOAD_outOfMinRangeOffset # Offset load of <32 x s32> (fiforegbank) with constant offset -576, selected as two <16 x s32> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: FIFO1024_LOAD_outOfMinRangeOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[MOV_PD_imm11_pseudo]]
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[PADD_mod_pseudo]], 0 :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[PADD_mod_pseudo]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: $lf0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576 ; expected selection materializes -576 with MOV_PD_imm11_pseudo, adds it via PADD_mod_pseudo, then loads the halves at immediates 0 and 64
+    %0:fiforegbank(<32 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s32>))
+    $lf0 = COPY %0:fiforegbank(<32 x s32>)
+...
+
+---
+name:            FIFO1024_STORE_maxOffset # Offset store of <32 x s32> (fiforegbank) with constant offset 384, selected as two <16 x s32> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $lf0
+    ; CHECK-LABEL: name: FIFO1024_STORE_maxOffset
+    ; CHECK: liveins: $p0, $lf0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fifo1024 = COPY $lf0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY]].sub_lo_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY1]], 384 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[COPY1]], 448 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:fiforegbank(<32 x s32>) = COPY $lf0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 384 ; expected selection keeps the offset as store immediates: 384 for the low half, 448 for the high half
+    G_AIE_OFFSET_STORE %0:fiforegbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            FIFO1024_STORE_minOffset # Offset store of <32 x s32> (fiforegbank) with constant offset -512, selected as two <16 x s32> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $lf0
+    ; CHECK-LABEL: name: FIFO1024_STORE_minOffset
+    ; CHECK: liveins: $p0, $lf0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fifo1024 = COPY $lf0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY]].sub_lo_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[COPY1]], -512 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[COPY1]], -448 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:fiforegbank(<32 x s32>) = COPY $lf0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -512 ; expected selection keeps the offset as store immediates: -512 for the low half, -448 for the high half
+    G_AIE_OFFSET_STORE %0:fiforegbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            FIFO1024_STORE_overMaxOffset # Offset store of <32 x s32> (fiforegbank) with constant offset 448, selected as two <16 x s32> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $lf0
+    ; CHECK-LABEL: name: FIFO1024_STORE_overMaxOffset
+    ; CHECK: liveins: $p0, $lf0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fifo1024 = COPY $lf0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY1]], 448
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY]].sub_lo_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[PADD_imm_pseudo]], 0 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[PADD_imm_pseudo]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:fiforegbank(<32 x s32>) = COPY $lf0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 448 ; expected selection rebases the pointer with PADD_imm_pseudo 448 and stores the halves at immediates 0 and 64 (presumably 448+64 no longer fits the immediate - confirm)
+    G_AIE_OFFSET_STORE %0:fiforegbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+---
+name:            FIFO1024_STORE_outOfMinRangeOffset # Offset store of <32 x s32> (fiforegbank) with constant offset -576, selected as two <16 x s32> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $lf0
+    ; CHECK-LABEL: name: FIFO1024_STORE_outOfMinRangeOffset
+    ; CHECK: liveins: $p0, $lf0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fifo1024 = COPY $lf0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY1]], [[MOV_PD_imm11_pseudo]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY]].sub_lo_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY2]], [[PADD_mod_pseudo]], 0 :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY [[COPY]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY3]], [[PADD_mod_pseudo]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    %0:fiforegbank(<32 x s32>) = COPY $lf0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576 ; expected selection materializes -576 with MOV_PD_imm11_pseudo, adds it via PADD_mod_pseudo, then stores the halves at immediates 0 and 64
+    G_AIE_OFFSET_STORE %0:fiforegbank(<32 x s32>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s32>))
+...
+
+
+---
+name:            ACC2048_LOAD_maxOffset # Offset load of <32 x s64> (accregbank) with constant offset 256, selected as four <8 x s64> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: ACC2048_LOAD_maxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 256 :: (load (<8 x s64>), align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 320 :: (load (<8 x s64>) from unknown-address + 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 384 :: (load (<8 x s64>) from unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 448 :: (load (<8 x s64>) from unknown-address + 192)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: $dm0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 256 ; expected selection keeps the offset as load immediates: 256, 320, 384 and 448 for the four parts
+    %0:accregbank(<32 x s64>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s64>))
+    $dm0 = COPY %0:accregbank(<32 x s64>)
+...
+
+---
+name:            ACC2048_LOAD_minOffset # Offset load of <32 x s64> (accregbank) with constant offset -320, selected as four <8 x s64> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: ACC2048_LOAD_minOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], -320 :: (load (<8 x s64>), align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], -256 :: (load (<8 x s64>) from unknown-address + 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], -192 :: (load (<8 x s64>) from unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], -128 :: (load (<8 x s64>) from unknown-address + 192)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: $dm0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -320 ; expected selection keeps the offset as load immediates: -320, -256, -192 and -128. NOTE(review): ACC2048_STORE_minOffset uses -512 as its minimum; confirm -320 is the intended lower bound here
+    %0:accregbank(<32 x s64>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s64>))
+    $dm0 = COPY %0:accregbank(<32 x s64>)
+...
+
+---
+name:            ACC2048_LOAD_overMaxOffset # Offset load of <32 x s64> (accregbank) with constant offset 320, selected as four <8 x s64> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: ACC2048_LOAD_overMaxOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY]], 320
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_imm_pseudo]], 0 :: (load (<8 x s64>), align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_imm_pseudo]], 64 :: (load (<8 x s64>) from unknown-address + 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_imm_pseudo]], 128 :: (load (<8 x s64>) from unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_imm_pseudo]], 192 :: (load (<8 x s64>) from unknown-address + 192)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: $dm0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 320 ; expected selection rebases the pointer with PADD_imm_pseudo 320 and loads the parts at immediates 0, 64, 128, 192 (presumably 320+192 no longer fits the immediate - confirm)
+    %0:accregbank(<32 x s64>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s64>))
+    $dm0 = COPY %0:accregbank(<32 x s64>)
+...
+
+---
+name:            ACC2048_LOAD_outOfMinRangeOffset # Offset load of <32 x s64> (accregbank) with constant offset -576, selected as four <8 x s64> loads.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: ACC2048_LOAD_outOfMinRangeOffset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[MOV_PD_imm11_pseudo]]
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_mod_pseudo]], 0 :: (load (<8 x s64>), align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_mod_pseudo]], 64 :: (load (<8 x s64>) from unknown-address + 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_mod_pseudo]], 128 :: (load (<8 x s64>) from unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[PADD_mod_pseudo]], 192 :: (load (<8 x s64>) from unknown-address + 192)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: $dm0 = COPY [[REG_SEQUENCE]]
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576 ; expected selection materializes -576 with MOV_PD_imm11_pseudo, adds it via PADD_mod_pseudo, then loads the parts at immediates 0, 64, 128, 192
+    %0:accregbank(<32 x s64>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<32 x s64>))
+    $dm0 = COPY %0:accregbank(<32 x s64>)
+...
+
+---
+name:            ACC2048_STORE_maxOffset # Offset store of <32 x s64> (accregbank) with constant offset 256, selected as four <8 x s64> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $dm0
+    ; CHECK-LABEL: name: ACC2048_STORE_maxOffset
+    ; CHECK: liveins: $p0, $dm0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc2048 = COPY $dm0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY1]], 256 :: (store (<8 x s64>), align 256)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY1]], 320 :: (store (<8 x s64>) into unknown-address + 64)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY1]], 384 :: (store (<8 x s64>) into unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY1]], 448 :: (store (<8 x s64>) into unknown-address + 192)
+    %0:accregbank(<32 x s64>) = COPY $dm0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 256 ; expected selection keeps the offset as store immediates: 256, 320, 384 and 448 for the four parts
+    G_AIE_OFFSET_STORE %0:accregbank(<32 x s64>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s64>))
+...
+
+---
+name:            ACC2048_STORE_minOffset # Offset store of <32 x s64> (accregbank) with constant offset -512, selected as four <8 x s64> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $dm0
+    ; CHECK-LABEL: name: ACC2048_STORE_minOffset
+    ; CHECK: liveins: $p0, $dm0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc2048 = COPY $dm0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[COPY1]], -512 :: (store (<8 x s64>), align 256)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY1]], -448 :: (store (<8 x s64>) into unknown-address + 64)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY1]], -384 :: (store (<8 x s64>) into unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY1]], -320 :: (store (<8 x s64>) into unknown-address + 192)
+    %0:accregbank(<32 x s64>) = COPY $dm0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -512 ; expected selection keeps the offset as store immediates: -512, -448, -384 and -320 for the four parts
+    G_AIE_OFFSET_STORE %0:accregbank(<32 x s64>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s64>))
+...
+
+---
+name:            ACC2048_STORE_overMaxOffset # Offset store of <32 x s64> (accregbank) with constant offset 320, selected as four <8 x s64> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $dm0
+    ; CHECK-LABEL: name: ACC2048_STORE_overMaxOffset
+    ; CHECK: liveins: $p0, $dm0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc2048 = COPY $dm0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY1]], 320
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[PADD_imm_pseudo]], 0 :: (store (<8 x s64>), align 256)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[PADD_imm_pseudo]], 64 :: (store (<8 x s64>) into unknown-address + 64)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[PADD_imm_pseudo]], 128 :: (store (<8 x s64>) into unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[PADD_imm_pseudo]], 192 :: (store (<8 x s64>) into unknown-address + 192)
+    %0:accregbank(<32 x s64>) = COPY $dm0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 320 ; expected selection rebases the pointer with PADD_imm_pseudo 320 and stores the parts at immediates 0, 64, 128, 192 (presumably 320+192 no longer fits the immediate - confirm)
+    G_AIE_OFFSET_STORE %0:accregbank(<32 x s64>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s64>))
+...
+
+---
+name:            ACC2048_STORE_outOfMinRangeOffset # Offset store of <32 x s64> (accregbank) with constant offset -576, selected as four <8 x s64> stores.
+alignment:       16
+legalized:       true
+regBankSelected: true
+body:             |
+  bb.1.entry:
+    liveins: $p0, $dm0
+    ; CHECK-LABEL: name: ACC2048_STORE_outOfMinRangeOffset
+    ; CHECK: liveins: $p0, $dm0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc2048 = COPY $dm0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo -576
+    ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY1]], [[MOV_PD_imm11_pseudo]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY2]], [[PADD_mod_pseudo]], 0 :: (store (<8 x s64>), align 256)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[PADD_mod_pseudo]], 64 :: (store (<8 x s64>) into unknown-address + 64)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[PADD_mod_pseudo]], 128 :: (store (<8 x s64>) into unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[PADD_mod_pseudo]], 192 :: (store (<8 x s64>) into unknown-address + 192)
+    %0:accregbank(<32 x s64>) = COPY $dm0
+    %1:ptrregbank(p0) = COPY $p0
+    %2:modregbank(s20) = G_CONSTANT i20 -576 ; expected selection materializes -576 with MOV_PD_imm11_pseudo, adds it via PADD_mod_pseudo, then stores the parts at immediates 0, 64, 128, 192
+    G_AIE_OFFSET_STORE %0:accregbank(<32 x s64>), %1:ptrregbank(p0), %2:modregbank(s20) :: (store (<32 x s64>))
+...
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-pre-post-increment.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-pre-post-increment.mir
index 80c5eee723f9..ed3f5a0769e1 100644
--- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-pre-post-increment.mir
+++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-vector-pre-post-increment.mir
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
-# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates
+# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
 # RUN: llc -mtriple aie2p -run-pass=instruction-select -o - -verify-machineinstrs %s | FileCheck %s
 
 ---
@@ -187,14 +187,14 @@ body: |
 ...
 
 ---
-name: post-inc-vector-load-am-512
+name: post-inc-vector-load-bm-512
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $p0, $r0, $r1
-    ; CHECK-LABEL: name: post-inc-vector-load-am-512
+    ; CHECK-LABEL: name: post-inc-vector-load-bm-512
     ; CHECK: liveins: $p0, $r0, $r1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -228,14 +228,14 @@ body: |
 
 
 ---
-name: post-inc-2d-vector-load-am-512
+name: post-inc-2d-vector-load-bm-512
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $p0
-    ; CHECK-LABEL: name: post-inc-2d-vector-load-am-512
+    ; CHECK-LABEL: name: post-inc-2d-vector-load-bm-512
     ; CHECK: liveins: $p0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -329,14 +329,14 @@ body: |
 ...
 
 ---
-name: post-inc-vector-store-am-512
+name: post-inc-vector-store-bm-512
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $p0, $r0, $r1, $bmll0
-    ; CHECK-LABEL: name: post-inc-vector-store-am-512
+    ; CHECK-LABEL: name: post-inc-vector-store-bm-512
     ; CHECK: liveins: $p0, $r0, $r1, $bmll0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -372,14 +372,14 @@ body: |
 
 
 ---
-name: post-inc-2d-vector-store-am-512
+name: post-inc-2d-vector-store-bm-512
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $p0, $bmll0
-    ; CHECK-LABEL: name: post-inc-2d-vector-store-am-512
+    ; CHECK-LABEL: name: post-inc-2d-vector-store-bm-512
     ; CHECK: liveins: $p0, $bmll0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -472,14 +472,14 @@ body: |
 
 
 ---
-name: post-inc-3d-vector-load-am-512
+name: post-inc-3d-vector-load-bm-512
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $p0
-    ; CHECK-LABEL: name: post-inc-3d-vector-load-am-512
+    ; CHECK-LABEL: name: post-inc-3d-vector-load-bm-512
     ; CHECK: liveins: $p0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -579,14 +579,14 @@ body: |
 
 
 ---
-name: post-inc-3d-vector-store-am-512
+name: post-inc-3d-vector-store-bm-512
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $p0, $bmll0
-    ; CHECK-LABEL: name: post-inc-3d-vector-store-am-512
+    ; CHECK-LABEL: name: post-inc-3d-vector-store-bm-512
     ; CHECK: liveins: $p0, $bmll0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
@@ -614,5 +614,1256 @@ body: |
     PseudoRET implicit $lr
 ...
 
+---
+name: post-inc-vector-load-fifo-512
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1
+    ; CHECK-LABEL: name: post-inc-vector-load-fifo-512
+    ; CHECK: liveins: $p0, $r0, $r1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack - 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm2:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack - 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm4:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack - 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm2:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm4:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[VLDA_dmx_lda_fifohl_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_dmx_lda_fifohl_pstm_nrm4]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:fiforegbank(<16 x s32>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s32>) from stack - 64)
+    %14:fiforegbank(<16 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s32>) from stack - 64)
+    %15:fiforegbank(<16 x s32>), %21:ptrregbank(p0) = G_AIE_POSTINC_LOAD %20, %9 :: (load (<16 x s32>) from stack - 64)
+    %16:fiforegbank(<16 x s32>), %22:ptrregbank(p0) = G_AIE_POSTINC_LOAD %21, %10 :: (load (<16 x s32>) from stack - 64)
+    %17:fiforegbank(<16 x s32>), %23:ptrregbank(p0) = G_AIE_POSTINC_LOAD %22, %11 :: (load (<16 x s32>) from stack - 64)
+    %18:fiforegbank(<16 x s32>), %24:ptrregbank(p0) = G_AIE_POSTINC_LOAD %23, %12 :: (load (<16 x s32>) from stack - 64)
+    PseudoRET implicit $lr, implicit %18
+...
+
+---
+name: post-inc-2d-vector-load-fifo-512
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-2d-vector-load-fifo-512
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[VLDA_2D_dmx_lda_fifohl:%[0-9]+]]:fifo512, [[VLDA_2D_dmx_lda_fifohl1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_fifohl2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_fifohl [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_2D_dmx_lda_fifohl]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:fiforegbank(<16 x s32>), %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_LOAD %0, %1, %2, %3, %4 :: (load (<16 x s32>))
+    PseudoRET implicit $lr, implicit %5
+...
+
+---
+name: post-inc-3d-vector-load-fifo-512
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-3d-vector-load-fifo-512
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[VLDA_3D_dmx_lda_fifohl:%[0-9]+]]:fifo512, [[VLDA_3D_dmx_lda_fifohl1:%[0-9]+]]:ep, [[VLDA_3D_dmx_lda_fifohl2:%[0-9]+]]:edcl, [[VLDA_3D_dmx_lda_fifohl3:%[0-9]+]]:edch = VLDA_3D_dmx_lda_fifohl [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s64>))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_3D_dmx_lda_fifohl]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:fiforegbank(<8 x s64>), %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_LOAD %0, %1, %2, %3, %4, %6, %5, %7 :: (load (<8 x s64>))
+    PseudoRET implicit $lr, implicit %8
+...
+
+---
+name: post-inc-vector-store-fifo-512
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1, $lfl0
+    ; CHECK-LABEL: name: post-inc-vector-store-fifo-512
+    ; CHECK: liveins: $p0, $r0, $r1, $lfl0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo512 = COPY $lfl0
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY3]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64)
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY3]], [[VST_dmx_sts_fifohl_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64)
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY3]], [[VST_dmx_sts_fifohl_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64)
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY3]], [[VST_dmx_sts_fifohl_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64)
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY3]], [[VST_dmx_sts_fifohl_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64)
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY3]], [[VST_dmx_sts_fifohl_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_fifohl_pstm_nrm2]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:fiforegbank(<16 x s32>) = COPY $lfl0
+    %19:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %0, %7 :: (store (<16 x s32>) into stack - 64)
+    %20:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %19, %8 :: (store (<16 x s32>) into stack - 64)
+    %21:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %20, %9 :: (store (<16 x s32>) into stack - 64)
+    %22:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %21, %10 :: (store (<16 x s32>) into stack - 64)
+    %23:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %22, %11 :: (store (<16 x s32>) into stack - 64)
+    %24:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %23, %12 :: (store (<16 x s32>) into stack - 64)
+    PseudoRET implicit $lr, implicit %24
+...
+
+---
+name: post-inc-2d-vector-store-fifo-512
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $lfl0
+    ; CHECK-LABEL: name: post-inc-2d-vector-store-fifo-512
+    ; CHECK: liveins: $p0, $lfl0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo512 = COPY $lfl0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:mxs = COPY [[COPY1]]
+    ; CHECK-NEXT: [[VST_2D_dmx_sts_x:%[0-9]+]]:ep, [[VST_2D_dmx_sts_x1:%[0-9]+]]:edc = VST_2D_dmx_sts_x [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>))
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:fiforegbank(<16 x s32>) = COPY $lfl0
+    %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_STORE %5, %0, %1, %2, %3, %4 :: (store (<16 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-3d-vector-store-fifo-512
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $lfl0
+    ; CHECK-LABEL: name: post-inc-3d-vector-store-fifo-512
+    ; CHECK: liveins: $p0, $lfl0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo512 = COPY $lfl0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:mxs = COPY [[COPY1]]
+    ; CHECK-NEXT: [[VST_3D_dmx_sts_x:%[0-9]+]]:ep, [[VST_3D_dmx_sts_x1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_x2:%[0-9]+]]:edch = VST_3D_dmx_sts_x [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>))
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:fiforegbank(<16 x s32>) = COPY $lfl0
+    %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_STORE %8, %0, %1, %2, %3, %4, %6, %5, %7 :: (store (<16 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-vector-load-fifo-1024
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1
+    ; CHECK-LABEL: name: post-inc-vector-load-fifo-1024
+    ; CHECK: liveins: $p0, $r0, $r1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm1:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm_imm]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm1]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm2:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm2:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm_imm2]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm2]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm_imm4:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm3:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm_imm4]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm3]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm2:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm4:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm2]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm4]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_pstm_nrm4:%[0-9]+]]:fifo512, [[VLDA_dmx_lda_fifohl_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_fifohl_pstm_nrm [[VLDA_dmx_lda_fifohl_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm5:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[VLDA_dmx_lda_fifohl_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_dmx_lda_fifohl_pstm_nrm4]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm5]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:fiforegbank(<32 x s32>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<32 x s32>) from stack - 64)
+    %14:fiforegbank(<32 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<32 x s32>) from stack - 64)
+    %15:fiforegbank(<32 x s32>), %21:ptrregbank(p0) = G_AIE_POSTINC_LOAD %20, %9 :: (load (<32 x s32>) from stack - 64)
+    %16:fiforegbank(<32 x s32>), %22:ptrregbank(p0) = G_AIE_POSTINC_LOAD %21, %10 :: (load (<32 x s32>) from stack - 64)
+    %17:fiforegbank(<32 x s32>), %23:ptrregbank(p0) = G_AIE_POSTINC_LOAD %22, %11 :: (load (<32 x s32>) from stack - 64)
+    %18:fiforegbank(<32 x s32>), %24:ptrregbank(p0) = G_AIE_POSTINC_LOAD %23, %12 :: (load (<32 x s32>) from stack - 64)
+    PseudoRET implicit $lr, implicit %18
+...
+
+---
+name: post-inc-2d-vector-load-fifo-1024
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-2d-vector-load-fifo-1024
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[VLDA_2D_dmx_lda_fifohl:%[0-9]+]]:fifo512, [[VLDA_2D_dmx_lda_fifohl1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_fifohl2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_fifohl [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_fifohl_idx_imm:%[0-9]+]]:fifo512 = VLDA_dmx_lda_fifohl_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:fifo1024 = REG_SEQUENCE [[VLDA_2D_dmx_lda_fifohl]], %subreg.sub_lo_fifo, [[VLDA_dmx_lda_fifohl_idx_imm]], %subreg.sub_hi_fifo
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:fiforegbank(<32 x s32>), %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_LOAD %0, %1, %2, %3, %4 :: (load (<32 x s32>))
+    PseudoRET implicit $lr, implicit %5
+...
+
+---
+name: post-inc-3d-vector-load-fifo-1024
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-3d-vector-load-fifo-1024
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[VLDA_3D_dmx_lda_fifohl:%[0-9]+]]:fifo512, [[VLDA_3D_dmx_lda_fifohl1:%[0-9]+]]:ep, [[VLDA_3D_dmx_lda_fifohl2:%[0-9]+]]:edcl, [[VLDA_3D_dmx_lda_fifohl3:%[0-9]+]]:edch = VLDA_3D_dmx_lda_fifohl [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s64>))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_3D_dmx_lda_fifohl]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:fiforegbank(<8 x s64>), %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_LOAD %0, %1, %2, %3, %4, %6, %5, %7 :: (load (<8 x s64>))
+    PseudoRET implicit $lr, implicit %8
+...
+
+---
+name: post-inc-vector-store-fifo-1024
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1, $lf0
+    ; CHECK-LABEL: name: post-inc-vector-store-fifo-1024
+    ; CHECK: liveins: $p0, $r0, $r1, $lf0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fifo1024 = COPY $lf0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY4]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY5]], [[COPY]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY6]], [[VST_dmx_sts_fifohl_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY7]], [[VST_dmx_sts_fifohl_pstm_nrm]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY8]], [[VST_dmx_sts_fifohl_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY9:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY9]], [[VST_dmx_sts_fifohl_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY10:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm_imm [[COPY10]], [[VST_dmx_sts_fifohl_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY11:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY11]], [[VST_dmx_sts_fifohl_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY12:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY12]], [[VST_dmx_sts_fifohl_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY13:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY13]], [[VST_dmx_sts_fifohl_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY14:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_lo_fifo
+    ; CHECK-NEXT: [[VST_dmx_sts_fifohl_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_fifohl_pstm_nrm [[COPY14]], [[VST_dmx_sts_fifohl_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY15:%[0-9]+]]:fifo512 = COPY [[COPY3]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY15]], [[VST_dmx_sts_fifohl_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_fifohl_pstm_nrm2]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:fiforegbank(<32 x s32>) = COPY $lf0
+    %19:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %0, %7 :: (store (<32 x s32>) into stack - 64)
+    %20:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %19, %8 :: (store (<32 x s32>) into stack - 64)
+    %21:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %20, %9 :: (store (<32 x s32>) into stack - 64)
+    %22:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %21, %10 :: (store (<32 x s32>) into stack - 64)
+    %23:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %22, %11 :: (store (<32 x s32>) into stack - 64)
+    %24:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %23, %12 :: (store (<32 x s32>) into stack - 64)
+    PseudoRET implicit $lr, implicit %24
+...
+
+---
+name: post-inc-2d-vector-store-fifo-1024
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $lf0
+    ; CHECK-LABEL: name: post-inc-2d-vector-store-fifo-1024
+    ; CHECK: liveins: $p0, $lf0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo1024 = COPY $lf0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:mxs = COPY [[COPY2]]
+    ; CHECK-NEXT: [[VST_2D_dmx_sts_x:%[0-9]+]]:ep, [[VST_2D_dmx_sts_x1:%[0-9]+]]:edc = VST_2D_dmx_sts_x [[COPY3]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:fiforegbank(<32 x s32>) = COPY $lf0
+    %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_STORE %5, %0, %1, %2, %3, %4 :: (store (<32 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-3d-vector-store-fifo-1024
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $lf0
+    ; CHECK-LABEL: name: post-inc-3d-vector-store-fifo-1024
+    ; CHECK: liveins: $p0, $lf0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fifo1024 = COPY $lf0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_lo_fifo
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:mxs = COPY [[COPY2]]
+    ; CHECK-NEXT: [[VST_3D_dmx_sts_x:%[0-9]+]]:ep, [[VST_3D_dmx_sts_x1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_x2:%[0-9]+]]:edch = VST_3D_dmx_sts_x [[COPY3]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fifo512 = COPY [[COPY1]].sub_hi_fifo
+    ; CHECK-NEXT: VST_dmx_sts_fifohl_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:fiforegbank(<32 x s32>) = COPY $lf0
+    %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_STORE %8, %0, %1, %2, %3, %4, %6, %5, %7 :: (store (<32 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-vector-load-vreg-1024 # Six chained <32 x s32> post-increment loads into vregbank; each is expected to become a 512-bit post-inc load, an idx_imm load at +64 and a REG_SEQUENCE.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true # The increments used below are a truncated $r0 value and the constants 448, -512, 0, 32 and 512.
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1
+    ; CHECK-LABEL: name: post-inc-vector-load-vreg-1024
+    ; CHECK: liveins: $p0, $r0, $r1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_pstm_nrm:%[0-9]+]]:vec512, [[VLDA_dmx_lda_x_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_x_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_pstm_nrm]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm]], %subreg.sub_512_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_pstm_nrm_imm:%[0-9]+]]:vec512, [[VLDA_dmx_lda_x_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_x_pstm_nrm_imm [[VLDA_dmx_lda_x_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm1:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[VLDA_dmx_lda_x_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_pstm_nrm_imm]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm1]], %subreg.sub_512_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_pstm_nrm_imm2:%[0-9]+]]:vec512, [[VLDA_dmx_lda_x_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_x_pstm_nrm_imm [[VLDA_dmx_lda_x_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm2:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[VLDA_dmx_lda_x_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_pstm_nrm_imm2]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm2]], %subreg.sub_512_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_pstm_nrm_imm4:%[0-9]+]]:vec512, [[VLDA_dmx_lda_x_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_x_pstm_nrm_imm [[VLDA_dmx_lda_x_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm3:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[VLDA_dmx_lda_x_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_pstm_nrm_imm4]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm3]], %subreg.sub_512_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_pstm_nrm2:%[0-9]+]]:vec512, [[VLDA_dmx_lda_x_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_x_pstm_nrm [[VLDA_dmx_lda_x_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm4:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[VLDA_dmx_lda_x_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_pstm_nrm2]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm4]], %subreg.sub_512_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_pstm_nrm4:%[0-9]+]]:vec512, [[VLDA_dmx_lda_x_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_x_pstm_nrm [[VLDA_dmx_lda_x_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_x_idx_imm5:%[0-9]+]]:vec512 = VLDA_dmx_lda_x_idx_imm [[VLDA_dmx_lda_x_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_dmx_lda_x_pstm_nrm4]], %subreg.sub_512_lo, [[VLDA_dmx_lda_x_idx_imm5]], %subreg.sub_512_hi
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:vregbank(<32 x s32>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<32 x s32>) from stack - 64)
+    %14:vregbank(<32 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<32 x s32>) from stack - 64)
+    %15:vregbank(<32 x s32>), %21:ptrregbank(p0) = G_AIE_POSTINC_LOAD %20, %9 :: (load (<32 x s32>) from stack - 64)
+    %16:vregbank(<32 x s32>), %22:ptrregbank(p0) = G_AIE_POSTINC_LOAD %21, %10 :: (load (<32 x s32>) from stack - 64)
+    %17:vregbank(<32 x s32>), %23:ptrregbank(p0) = G_AIE_POSTINC_LOAD %22, %11 :: (load (<32 x s32>) from stack - 64)
+    %18:vregbank(<32 x s32>), %24:ptrregbank(p0) = G_AIE_POSTINC_LOAD %23, %12 :: (load (<32 x s32>) from stack - 64)
+    PseudoRET implicit $lr, implicit %18
+...
+
+---
+name: post-inc-2d-vector-load-vreg-1024 # <32 x s32> 2D post-increment load into vregbank; expected to split into VLDA_2D_dmx_lda_x for the low half and an idx_imm load at +64 for the high half.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true # The high half is expected as VLDA_dmw_lda_w_idx_imm into mwa followed by a COPY to vec512, then both halves are joined by REG_SEQUENCE.
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-2d-vector-load-vreg-1024
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[VLDA_2D_dmx_lda_x:%[0-9]+]]:vec512, [[VLDA_2D_dmx_lda_x1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_x2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_x [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmw_lda_w_idx_imm:%[0-9]+]]:mwa = VLDA_dmw_lda_w_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[VLDA_dmw_lda_w_idx_imm]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec1024 = REG_SEQUENCE [[VLDA_2D_dmx_lda_x]], %subreg.sub_512_lo, [[COPY1]], %subreg.sub_512_hi
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:vregbank(<32 x s32>), %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_LOAD %0, %1, %2, %3, %4 :: (load (<32 x s32>))
+    PseudoRET implicit $lr, implicit %5
+...
+
+---
+name: post-inc-3d-vector-load-vreg-1024 # 3D post-increment load into vregbank; expected to select a single VLDA_3D_dmx_lda_x fed by an eds REG_SEQUENCE of the seven constants.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true # NOTE(review): the test name says 1024, but the load below is <8 x s64> (512 bits) and no split is expected; presumably <32 x s32> was intended - TODO confirm.
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-3d-vector-load-vreg-1024
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[VLDA_3D_dmx_lda_x:%[0-9]+]]:vec512, [[VLDA_3D_dmx_lda_x1:%[0-9]+]]:ep, [[VLDA_3D_dmx_lda_x2:%[0-9]+]]:edcl, [[VLDA_3D_dmx_lda_x3:%[0-9]+]]:edch = VLDA_3D_dmx_lda_x [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s64>))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_3D_dmx_lda_x]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:vregbank(<8 x s64>), %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_LOAD %0, %1, %2, %3, %4, %6, %5, %7 :: (load (<8 x s64>))
+    PseudoRET implicit $lr, implicit %8
+...
+
+---
+name: post-inc-vector-store-vreg-1024 # Six chained <32 x s32> post-increment stores of $y0; each is expected to become a post-inc store of sub_512_lo and an idx_imm store of sub_512_hi at +64.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true # The increments used below are a truncated $r0 value and the constants 448, -512, 0, 32 and 512; the last updated pointer is returned.
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1, $y0
+    ; CHECK-LABEL: name: post-inc-vector-store-vreg-1024
+    ; CHECK: liveins: $p0, $r0, $r1, $y0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec1024 = COPY $y0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY4]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY5]], [[COPY]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY6]], [[VST_dmx_sts_x_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY7]], [[VST_dmx_sts_x_pstm_nrm]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY8]], [[VST_dmx_sts_x_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY9]], [[VST_dmx_sts_x_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm_imm [[COPY10]], [[VST_dmx_sts_x_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY11]], [[VST_dmx_sts_x_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY12]], [[VST_dmx_sts_x_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY13]], [[VST_dmx_sts_x_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_x_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_x_pstm_nrm [[COPY14]], [[VST_dmx_sts_x_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = COPY [[COPY3]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY15]], [[VST_dmx_sts_x_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_x_pstm_nrm2]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:vregbank(<32 x s32>) = COPY $y0
+    %19:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %0, %7 :: (store (<32 x s32>) into stack - 64)
+    %20:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %19, %8 :: (store (<32 x s32>) into stack - 64)
+    %21:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %20, %9 :: (store (<32 x s32>) into stack - 64)
+    %22:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %21, %10 :: (store (<32 x s32>) into stack - 64)
+    %23:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %22, %11 :: (store (<32 x s32>) into stack - 64)
+    %24:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %23, %12 :: (store (<32 x s32>) into stack - 64)
+    PseudoRET implicit $lr, implicit %24
+...
+
+---
+name: post-inc-2d-vector-store-vreg-1024 # <32 x s32> 2D post-increment store of $y0; expected to split into VST_2D_dmx_sts_x for sub_512_lo and VST_dmx_sts_x_idx_imm at +64 for sub_512_hi.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $y0
+    ; CHECK-LABEL: name: post-inc-2d-vector-store-vreg-1024
+    ; CHECK: liveins: $p0, $y0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec1024 = COPY $y0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo
+    ; CHECK-NEXT: [[VST_2D_dmx_sts_x:%[0-9]+]]:ep, [[VST_2D_dmx_sts_x1:%[0-9]+]]:edc = VST_2D_dmx_sts_x [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:vregbank(<32 x s32>) = COPY $y0
+    %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_STORE %5, %0, %1, %2, %3, %4 :: (store (<32 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-3d-vector-store-vreg-1024 # <32 x s32> 3D post-increment store of $y0; expected to split into VST_3D_dmx_sts_x for sub_512_lo and VST_dmx_sts_x_idx_imm at +64 for sub_512_hi.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $y0
+    ; CHECK-LABEL: name: post-inc-3d-vector-store-vreg-1024
+    ; CHECK: liveins: $p0, $y0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec1024 = COPY $y0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_lo
+    ; CHECK-NEXT: [[VST_3D_dmx_sts_x:%[0-9]+]]:ep, [[VST_3D_dmx_sts_x1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_x2:%[0-9]+]]:edch = VST_3D_dmx_sts_x [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = COPY [[COPY1]].sub_512_hi
+    ; CHECK-NEXT: VST_dmx_sts_x_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:vregbank(<32 x s32>) = COPY $y0
+    %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_STORE %8, %0, %1, %2, %3, %4, %6, %5, %7 :: (store (<32 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-vector-load-acc-1024 # Six chained <32 x s32> post-increment loads into accregbank; each is expected to become a 512-bit post-inc load, an idx_imm load at +64 and a REG_SEQUENCE.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true # The increments used below are a truncated $r0 value and the constants 448, -512, 0, 32 and 512.
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1
+    ; CHECK-LABEL: name: post-inc-vector-load-acc-1024
+    ; CHECK: liveins: $p0, $r0, $r1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm4:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm4]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64, basealign 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm5:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack, align 128)
+    ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm5]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:accregbank(<32 x s32>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<32 x s32>) from stack - 64)
+    %14:accregbank(<32 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<32 x s32>) from stack - 64)
+    %15:accregbank(<32 x s32>), %21:ptrregbank(p0) = G_AIE_POSTINC_LOAD %20, %9 :: (load (<32 x s32>) from stack - 64)
+    %16:accregbank(<32 x s32>), %22:ptrregbank(p0) = G_AIE_POSTINC_LOAD %21, %10 :: (load (<32 x s32>) from stack - 64)
+    %17:accregbank(<32 x s32>), %23:ptrregbank(p0) = G_AIE_POSTINC_LOAD %22, %11 :: (load (<32 x s32>) from stack - 64)
+    %18:accregbank(<32 x s32>), %24:ptrregbank(p0) = G_AIE_POSTINC_LOAD %23, %12 :: (load (<32 x s32>) from stack - 64)
+    PseudoRET implicit $lr, implicit %18
+...
+
+---
+name: post-inc-2d-vector-load-acc-1024 # <32 x s32> 2D post-increment load into accregbank; expected to split into VLDA_2D_dmx_lda_bm for the low half and VLDA_dmx_lda_bm_idx_imm at +64 for the high half.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-2d-vector-load-acc-1024
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[VLDA_2D_dmx_lda_bm:%[0-9]+]]:acc512, [[VLDA_2D_dmx_lda_bm1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_bm2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_bm [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc1024 = REG_SEQUENCE [[VLDA_2D_dmx_lda_bm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:accregbank(<32 x s32>), %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_LOAD %0, %1, %2, %3, %4 :: (load (<32 x s32>))
+    PseudoRET implicit $lr, implicit %5
+...
+
+---
+name: post-inc-3d-vector-load-acc-1024 # 3D post-increment load into accregbank; expected to select a single VLDA_3D_dmx_lda_bm fed by an eds REG_SEQUENCE of the seven constants.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true # NOTE(review): the test name says 1024, but the load below is <8 x s64> (512 bits) and no split is expected; presumably <32 x s32> was intended - TODO confirm.
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-3d-vector-load-acc-1024
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[VLDA_3D_dmx_lda_bm:%[0-9]+]]:acc512, [[VLDA_3D_dmx_lda_bm1:%[0-9]+]]:ep, [[VLDA_3D_dmx_lda_bm2:%[0-9]+]]:edcl, [[VLDA_3D_dmx_lda_bm3:%[0-9]+]]:edch = VLDA_3D_dmx_lda_bm [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s64>))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_3D_dmx_lda_bm]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:accregbank(<8 x s64>), %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_LOAD %0, %1, %2, %3, %4, %6, %5, %7 :: (load (<8 x s64>))
+    PseudoRET implicit $lr, implicit %8
+...
+
+---
+name: post-inc-vector-store-acc-1024 # Six chained <32 x s32> post-increment stores of $cml0; each is expected to become a post-inc store of sub_512_acc_lo and an idx_imm store of sub_512_acc_hi at +64.
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true # The increments used below are a truncated $r0 value and the constants 448, -512, 0, 32 and 512; the last updated pointer is returned.
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1, $cml0
+    ; CHECK-LABEL: name: post-inc-vector-store-acc-1024
+    ; CHECK: liveins: $p0, $r0, $r1, $cml0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = COPY $cml0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY4]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY6]], [[VST_dmx_sts_bm_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY7]], [[VST_dmx_sts_bm_pstm_nrm]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY8]], [[VST_dmx_sts_bm_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY9]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY10:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY10]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY11:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY11]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY12:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY12]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY13:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY13]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY14]], [[VST_dmx_sts_bm_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64, basealign 128)
+    ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY15]], [[VST_dmx_sts_bm_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack, align 128)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_bm_pstm_nrm2]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:accregbank(<32 x s32>) = COPY $cml0
+    %19:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %0, %7 :: (store (<32 x s32>) into stack - 64)
+    %20:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %19, %8 :: (store (<32 x s32>) into stack - 64)
+    %21:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %20, %9 :: (store (<32 x s32>) into stack - 64)
+    %22:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %21, %10 :: (store (<32 x s32>) into stack - 64)
+    %23:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %22, %11 :: (store (<32 x s32>) into stack - 64)
+    %24:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %23, %12 :: (store (<32 x s32>) into stack - 64)
+    PseudoRET implicit $lr, implicit %24
+...
+
+---
+name: post-inc-2d-vector-store-acc-1024  # Selection of a 2D post-increment store of a <32 x s32> accumulator ($cml0).
+legalized:       true  # Expected: VST_2D stores the low 512 bits; the high 512 bits use an indexed store at offset 64 from the original pointer.
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $cml0
+    ; CHECK-LABEL: name: post-inc-2d-vector-store-acc-1024
+    ; CHECK: liveins: $p0, $cml0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = COPY $cml0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_2D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_2D_dmx_sts_bm1:%[0-9]+]]:edc = VST_2D_dmx_sts_bm [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:accregbank(<32 x s32>) = COPY $cml0
+    %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_STORE %5, %0, %1, %2, %3, %4 :: (store (<32 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-3d-vector-store-acc-1024  # Selection of a 3D post-increment store of a <32 x s32> accumulator ($cml0).
+legalized:       true  # Expected: VST_3D stores the low 512 bits; the high 512 bits use an indexed store at offset 64 from the original pointer.
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $cml0
+    ; CHECK-LABEL: name: post-inc-3d-vector-store-acc-1024
+    ; CHECK: liveins: $p0, $cml0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = COPY $cml0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_3D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_3D_dmx_sts_bm1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_bm2:%[0-9]+]]:edch = VST_3D_dmx_sts_bm [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 128)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:accregbank(<32 x s32>) = COPY $cml0
+    %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_STORE %8, %0, %1, %2, %3, %4, %6, %5, %7 :: (store (<32 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-vector-load-acc-2048  # Chain of six post-increment <64 x s32> loads, each split into one post-inc VLDA (low 512 bits) plus indexed loads at 64/128/192.
+legalized:       true  # Expected: increments 448, -512 and 0 select the _pstm_nrm_imm form; the G_TRUNC'd $r0 value and constants 32 and 512 use the register _pstm_nrm form.
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1
+    ; CHECK-LABEL: name: post-inc-vector-load-acc-2048
+    ; CHECK: liveins: $p0, $r0, $r1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s32>) from stack - 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from stack, align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 448 :: (load (<16 x s32>) from stack - 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm3:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 64 :: (load (<16 x s32>) from stack, align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm4:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm5:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm1]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm3]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm4]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm5]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], -512 :: (load (<16 x s32>) from stack - 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm6:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 64 :: (load (<16 x s32>) from stack, align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm7:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm8:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm1]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm6]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm7]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm8]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm_imm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 0 :: (load (<16 x s32>) from stack - 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm9:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 64 :: (load (<16 x s32>) from stack, align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm10:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm11:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm3]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm_imm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm9]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm10]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm11]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm2:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], [[MOV_PD_imm11_pseudo]] :: (load (<16 x s32>) from stack - 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm12:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 64 :: (load (<16 x s32>) from stack, align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm13:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm14:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm_imm5]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm2]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm12]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm13]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm14]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_pstm_nrm4:%[0-9]+]]:acc512, [[VLDA_dmx_lda_bm_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmx_lda_bm_pstm_nrm [[VLDA_dmx_lda_bm_pstm_nrm3]], [[MOV_PD_imm11_pseudo1]] :: (load (<16 x s32>) from stack - 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm15:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 64 :: (load (<16 x s32>) from stack, align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm16:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 128 :: (load (<16 x s32>) from stack + 64, basealign 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm17:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[VLDA_dmx_lda_bm_pstm_nrm3]], 192 :: (load (<16 x s32>) from stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_dmx_lda_bm_pstm_nrm4]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm15]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm16]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm17]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:accregbank(<64 x s32>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<64 x s32>) from stack - 64)
+    %14:accregbank(<64 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<64 x s32>) from stack - 64)
+    %15:accregbank(<64 x s32>), %21:ptrregbank(p0) = G_AIE_POSTINC_LOAD %20, %9 :: (load (<64 x s32>) from stack - 64)
+    %16:accregbank(<64 x s32>), %22:ptrregbank(p0) = G_AIE_POSTINC_LOAD %21, %10 :: (load (<64 x s32>) from stack - 64)
+    %17:accregbank(<64 x s32>), %23:ptrregbank(p0) = G_AIE_POSTINC_LOAD %22, %11 :: (load (<64 x s32>) from stack - 64)
+    %18:accregbank(<64 x s32>), %24:ptrregbank(p0) = G_AIE_POSTINC_LOAD %23, %12 :: (load (<64 x s32>) from stack - 64)
+    PseudoRET implicit $lr, implicit %18
+...
 
+---
+name: post-inc-2d-vector-load-acc-2048  # Selection of a 2D post-increment load of a <64 x s32> accumulator.
+legalized:       true  # Expected: VLDA_2D loads the low 512 bits; indexed loads at 64/128/192 from the original pointer fill the rest, joined by REG_SEQUENCE into acc2048.
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-2d-vector-load-acc-2048
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[VLDA_2D_dmx_lda_bm:%[0-9]+]]:acc512, [[VLDA_2D_dmx_lda_bm1:%[0-9]+]]:ep, [[VLDA_2D_dmx_lda_bm2:%[0-9]+]]:edc = VLDA_2D_dmx_lda_bm [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s32>), align 256)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 64 :: (load (<16 x s32>) from unknown-address + 64)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm1:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 128 :: (load (<16 x s32>) from unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[VLDA_dmx_lda_bm_idx_imm2:%[0-9]+]]:acc512 = VLDA_dmx_lda_bm_idx_imm [[COPY]], 192 :: (load (<16 x s32>) from unknown-address + 192)
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:acc2048 = REG_SEQUENCE [[VLDA_2D_dmx_lda_bm]], %subreg.sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm]], %subreg.sub_512_acc_hi, [[VLDA_dmx_lda_bm_idx_imm1]], %subreg.sub_1024_acc_hi_then_sub_512_acc_lo, [[VLDA_dmx_lda_bm_idx_imm2]], %subreg.sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:accregbank(<64 x s32>), %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_LOAD %0, %1, %2, %3, %4 :: (load (<64 x s32>))
+    PseudoRET implicit $lr, implicit %5
+...
+
+---
+name: post-inc-3d-vector-load-acc-2048  # Selection of a 3D post-increment accumulator load into a single VLDA_3D.
+legalized:       true  # NOTE(review): named acc-2048, but the load below is <8 x s64> (512 bits) and is expected to select unsplit, unlike post-inc-2d-vector-load-acc-2048 which loads <64 x s32>. Confirm whether <64 x s32> was intended; if so, change the type and regenerate the CHECK lines.
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post-inc-3d-vector-load-acc-2048
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[VLDA_3D_dmx_lda_bm:%[0-9]+]]:acc512, [[VLDA_3D_dmx_lda_bm1:%[0-9]+]]:ep, [[VLDA_3D_dmx_lda_bm2:%[0-9]+]]:edcl, [[VLDA_3D_dmx_lda_bm3:%[0-9]+]]:edch = VLDA_3D_dmx_lda_bm [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s64>))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_3D_dmx_lda_bm]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:accregbank(<8 x s64>), %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_LOAD %0, %1, %2, %3, %4, %6, %5, %7 :: (load (<8 x s64>))
+    PseudoRET implicit $lr, implicit %8
+...
+
+---
+name: post-inc-vector-store-acc-2048  # Chain of six post-increment stores of a <64 x s32> accumulator ($dm0), each split into one post-inc VST (low 512 bits) plus indexed stores at 64/128/192.
+legalized:       true  # Expected: increments 448, -512 and 0 select the _pstm_nrm_imm form; the G_TRUNC'd $r0 value and constants 32 and 512 use the register _pstm_nrm form.
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $r0, $r1, $dm0
+    ; CHECK-LABEL: name: post-inc-vector-store-acc-2048
+    ; CHECK: liveins: $p0, $r0, $r1, $dm0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]]
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 32
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:em = MOV_PD_imm11_pseudo 512
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc2048 = COPY $dm0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY4]], [[COPY]], [[COPY2]] :: (store (<16 x s32>) into stack - 64, basealign 256)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 64 :: (store (<16 x s32>) into stack, align 256)
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY6]], [[COPY]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256)
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY7]], [[COPY]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY8]], [[VST_dmx_sts_bm_pstm_nrm]], 448 :: (store (<16 x s32>) into stack - 64, basealign 256)
+    ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY9]], [[VST_dmx_sts_bm_pstm_nrm]], 64 :: (store (<16 x s32>) into stack, align 256)
+    ; CHECK-NEXT: [[COPY10:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY10]], [[VST_dmx_sts_bm_pstm_nrm]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256)
+    ; CHECK-NEXT: [[COPY11:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY11]], [[VST_dmx_sts_bm_pstm_nrm]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[COPY12:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY12]], [[VST_dmx_sts_bm_pstm_nrm_imm]], -512 :: (store (<16 x s32>) into stack - 64, basealign 256)
+    ; CHECK-NEXT: [[COPY13:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY13]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 64 :: (store (<16 x s32>) into stack, align 256)
+    ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY14]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256)
+    ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY15]], [[VST_dmx_sts_bm_pstm_nrm_imm]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[COPY16:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm_imm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm_imm [[COPY16]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 0 :: (store (<16 x s32>) into stack - 64, basealign 256)
+    ; CHECK-NEXT: [[COPY17:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY17]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 64 :: (store (<16 x s32>) into stack, align 256)
+    ; CHECK-NEXT: [[COPY18:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY18]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256)
+    ; CHECK-NEXT: [[COPY19:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY19]], [[VST_dmx_sts_bm_pstm_nrm_imm1]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[COPY20:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm1:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY20]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], [[MOV_PD_imm11_pseudo]] :: (store (<16 x s32>) into stack - 64, basealign 256)
+    ; CHECK-NEXT: [[COPY21:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY21]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 64 :: (store (<16 x s32>) into stack, align 256)
+    ; CHECK-NEXT: [[COPY22:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY22]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256)
+    ; CHECK-NEXT: [[COPY23:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY23]], [[VST_dmx_sts_bm_pstm_nrm_imm2]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: [[COPY24:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_dmx_sts_bm_pstm_nrm2:%[0-9]+]]:ep = VST_dmx_sts_bm_pstm_nrm [[COPY24]], [[VST_dmx_sts_bm_pstm_nrm1]], [[MOV_PD_imm11_pseudo1]] :: (store (<16 x s32>) into stack - 64, basealign 256)
+    ; CHECK-NEXT: [[COPY25:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY25]], [[VST_dmx_sts_bm_pstm_nrm1]], 64 :: (store (<16 x s32>) into stack, align 256)
+    ; CHECK-NEXT: [[COPY26:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY26]], [[VST_dmx_sts_bm_pstm_nrm1]], 128 :: (store (<16 x s32>) into stack + 64, basealign 256)
+    ; CHECK-NEXT: [[COPY27:%[0-9]+]]:acc512 = COPY [[COPY3]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY27]], [[VST_dmx_sts_bm_pstm_nrm1]], 192 :: (store (<16 x s32>) into stack + 128, align 128, basealign 256)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VST_dmx_sts_bm_pstm_nrm2]]
+    %0:ptrregbank(p0) = COPY $p0
+    %1:gprregbank(s32) = COPY $r0
+    %7:modregbank(s20) = G_TRUNC %1
+    %8:modregbank(s20) = G_CONSTANT i20 448
+    %9:modregbank(s20) = G_CONSTANT i20 -512
+    %10:modregbank(s20) = G_CONSTANT i20 0
+    %11:modregbank(s20) = G_CONSTANT i20 32
+    %12:modregbank(s20) = G_CONSTANT i20 512
+    %13:accregbank(<64 x s32>) = COPY $dm0
+    %19:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %0, %7 :: (store (<64 x s32>) into stack - 64)
+    %20:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %19, %8 :: (store (<64 x s32>) into stack - 64)
+    %21:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %20, %9 :: (store (<64 x s32>) into stack - 64)
+    %22:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %21, %10 :: (store (<64 x s32>) into stack - 64)
+    %23:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %22, %11 :: (store (<64 x s32>) into stack - 64)
+    %24:ptrregbank(p0) = G_AIE_POSTINC_STORE %13, %23, %12 :: (store (<64 x s32>) into stack - 64)
+    PseudoRET implicit $lr, implicit %24
+...
+
+---
+name: post-inc-2d-vector-store-acc-2048  # Selection of a 2D post-increment store of a <64 x s32> accumulator ($dm0).
+legalized:       true  # Expected: VST_2D stores the low 512 bits; the remaining three 512-bit parts use indexed stores at 64/128/192 from the original pointer.
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $dm0
+    ; CHECK-LABEL: name: post-inc-2d-vector-store-acc-2048
+    ; CHECK: liveins: $p0, $dm0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc2048 = COPY $dm0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_2D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_2D_dmx_sts_bm1:%[0-9]+]]:edc = VST_2D_dmx_sts_bm [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 256)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 128 :: (store (<16 x s32>) into unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 192 :: (store (<16 x s32>) into unknown-address + 192)
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edn(s20) = G_CONSTANT i20 3
+    %4:edc(s20) = G_CONSTANT i20 4
+    %5:accregbank(<64 x s32>) = COPY $dm0
+    %6:ptrregbank(p0), %7:modregbank(s20) = G_AIE_POSTINC_2D_STORE %5, %0, %1, %2, %3, %4 :: (store (<64 x s32>))
+    PseudoRET implicit $lr
+...
+
+---
+name: post-inc-3d-vector-store-acc-2048  # Selection of a 3D post-increment store of a <64 x s32> accumulator ($dm0).
+legalized:       true  # Expected: VST_3D stores the low 512 bits; the remaining three 512-bit parts use indexed stores at 64/128/192 from the original pointer.
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $p0, $dm0
+    ; CHECK-LABEL: name: post-inc-3d-vector-store-acc-2048
+    ; CHECK: liveins: $p0, $dm0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em = MOV_PD_imm11_pseudo 1
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 2
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:edj = MOV_PD_imm11_pseudo 3
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 4
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:edn = MOV_PD_imm11_pseudo 5
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 6
+    ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:edc = MOV_PD_imm11_pseudo 7
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc2048 = COPY $dm0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm11_pseudo]], %subreg.sub_mod, [[MOV_PD_imm11_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm11_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm11_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm11_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm11_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm11_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_lo
+    ; CHECK-NEXT: [[VST_3D_dmx_sts_bm:%[0-9]+]]:ep, [[VST_3D_dmx_sts_bm1:%[0-9]+]]:edcl, [[VST_3D_dmx_sts_bm2:%[0-9]+]]:edch = VST_3D_dmx_sts_bm [[COPY2]], [[COPY]], [[REG_SEQUENCE]] :: (store (<16 x s32>), align 256)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY3]], [[COPY]], 64 :: (store (<16 x s32>) into unknown-address + 64)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_lo
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY4]], [[COPY]], 128 :: (store (<16 x s32>) into unknown-address + 128, align 128)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc512 = COPY [[COPY1]].sub_1024_acc_hi_then_sub_512_acc_hi
+    ; CHECK-NEXT: VST_dmx_sts_bm_idx_imm [[COPY5]], [[COPY]], 192 :: (store (<16 x s32>) into unknown-address + 192)
+    ; CHECK-NEXT: PseudoRET implicit $lr
+    %0:ptrregbank(p0) = COPY $p0
+    %1:em(s20) = G_CONSTANT i20 1
+    %2:edj(s20) = G_CONSTANT i20 2
+    %3:edj(s20) = G_CONSTANT i20 3
+    %4:edn(s20) = G_CONSTANT i20 4
+    %5:edn(s20) = G_CONSTANT i20 5
+    %6:edc(s20) = G_CONSTANT i20 6
+    %7:edc(s20) = G_CONSTANT i20 7
+    %8:accregbank(<64 x s32>) = COPY $dm0
+    %9:ptrregbank(p0), %10:modregbank(s20), %11:modregbank(s20) = G_AIE_POSTINC_3D_STORE %8, %0, %1, %2, %3, %4, %6, %5, %7 :: (store (<64 x s32>))
+    PseudoRET implicit $lr
+...
 
diff --git a/llvm/test/CodeGen/AIE/aie2p/combine-loads-stores.mir b/llvm/test/CodeGen/AIE/aie2p/combine-loads-stores.mir
new file mode 100644
index 000000000000..514ba9c2bdec
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2p/combine-loads-stores.mir
@@ -0,0 +1,1661 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+# RUN: llc -mtriple aie2p -run-pass=aie2p-postlegalizer-custom-combiner %s -verify-machineinstrs -o - | FileCheck %s
+
+---
+name:            load_to_preinc
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: load_to_preinc
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[TRUNC]](s20) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_OFFSET_LOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %2:_(s20) = G_TRUNC %1
+    %3:_(p0) = G_PTR_ADD %0, %2
+    %4:_(s32) = G_LOAD %3 :: (load (s32))
+    $r0 = COPY %4
+    $p0 = COPY %3
+...
+
+---
+name:            load_to_preinc_with_copies
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: load_to_preinc_with_copies
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[TRUNC]](s20) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_OFFSET_LOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %2:_(s20) = G_TRUNC %1
+    %3:_(p0) = G_PTR_ADD %0, %2
+    %5:_(p0) = COPY %3
+    %4:_(s32) = G_LOAD %5 :: (load (s32))
+    $r0 = COPY %4
+    $p0 = COPY %3
+...
+
+---
+name:            load_to_preinc_dead_ptr_add
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: load_to_preinc_dead_ptr_add
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[TRUNC]](s20) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_OFFSET_LOAD]](s32)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %2:_(s20) = G_TRUNC %1
+    %4:_(p0) = G_PTR_ADD %0, %2
+    %3:_(s32) = G_LOAD %4 :: (load (s32))
+    $r0 = COPY %3
+...
+
+---
+name:            load_to_postinc
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: load_to_postinc
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %2:_(s20) = G_TRUNC %1
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    %4:_(p0) = G_PTR_ADD %0, %2
+    $r0 = COPY %3
+    $p0 = COPY %4
+...
+
+---
+name:            load_to_postinc_ptradd_before_load
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: load_to_postinc_ptradd_before_load
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %2:_(s20) = G_TRUNC %1
+    %4:_(p0) = G_PTR_ADD %0, %2
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    $r0 = COPY %3
+    $p0 = COPY %4
+...
+
+---
+name:            load_to_postinc_ptradd_before_usedinphi
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: load_to_postinc_ptradd_before_usedinphi
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(p0) = G_PHI [[COPY]](p0), %bb.0, %2(p0), %bb.1
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+  ; CHECK-NEXT:   [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[PHI]], [[TRUNC]](s20) :: (load (s32))
+  ; CHECK-NEXT:   $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+  ; CHECK-NEXT:   G_BR %bb.1
+  bb.0:
+    successors: %bb.1
+    liveins: $p0
+    %0:_(p0) = COPY $p0
+    G_BR %bb.1
+  bb.1:
+    successors: %bb.1
+    %1:_(p0) = G_PHI %0(p0), %bb.0, %4(p0), %bb.1
+    %2:_(s32) = G_CONSTANT i32 32
+    %3:_(s20) = G_TRUNC %2
+    %4:_(p0) = G_PTR_ADD %1, %3
+    %5:_(s32) = G_LOAD %1 :: (load (s32))
+    $r0 = COPY %5
+    G_BR %bb.1
+...
+
+
+# The G_PTR_ADD result is used ($p0 = COPY) before the load, so we cannot just
+# move the pointer add down to the load. Our combine code is not able to move
+# the memory operation up either, so we don't combine. This could be improved.
+---
+name:            load_not_to_postinc_ptradd_before_load
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: load_not_to_postinc_ptradd_before_load
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %2:_(s20) = G_TRUNC %1
+    %4:_(p0) = G_PTR_ADD %0, %2
+    $p0 = COPY %4
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    $r0 = COPY %3
+...
+
+---
+name:            load_to_postinc_move_offset
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: load_to_postinc_move_offset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+    %0:_(p0) = COPY $p0
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    %1:_(s32) = G_CONSTANT i32 12
+    %2:_(s20) = G_TRUNC %1
+    %4:_(p0) = G_PTR_ADD %0, %2
+    $r0 = COPY %3
+    $p0 = COPY %4
+...
+
+---
+name:            load_not_to_postinc_cannot_move_offset
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: load_not_to_postinc_cannot_move_offset
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    %0:_(p0) = COPY $p0
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    $r0 = COPY %3
+    %1:_(s32) = G_CONSTANT i32 12
+    %2:_(s20) = G_TRUNC %1
+    %4:_(p0) = G_PTR_ADD %0, %2
+    $p0 = COPY %4
+...
+
+---
+name:            load_to_postinc_arg_offset
+body:             |
+  bb.0:
+    liveins: $p0, $r1
+    ; CHECK-LABEL: name: load_to_postinc_arg_offset
+    ; CHECK: liveins: $p0, $r1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[LSHR]](s32)
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+    %0:_(p0) = COPY $p0
+    %5:_(s32) = COPY $r1
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    %1:_(s32) = G_CONSTANT i32 2
+    %4:_(s32) = G_LSHR %5, %1
+    %6:_(s20) = G_TRUNC %4
+    %2:_(p0) = G_PTR_ADD %0, %6
+    $r0 = COPY %3
+    $p0 = COPY %2
+...
+
+---
+name:            load_cannot_combine
+body:             |
+  bb.0:
+    liveins: $p0, $r1
+    ; CHECK-LABEL: name: load_cannot_combine
+    ; CHECK: liveins: $p0, $r1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[LOAD]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    %0:_(p0) = COPY $p0
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    %1:_(s20) = G_TRUNC %3
+    %2:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %3
+    $p0 = COPY %2
+...
+
+---
+name:            store_to_preinc
+body:             |
+  bb.0:
+    liveins: $p0, $r0
+    ; CHECK-LABEL: name: store_to_preinc
+    ; CHECK: liveins: $p0, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY1]](s32), [[COPY]](p0), [[TRUNC]](s20) :: (store (s32))
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %2:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %2
+    %3:_(p0) = G_PTR_ADD %0, %4
+    G_STORE %1, %3 :: (store (s32))
+    $p0 = COPY %3
+...
+
+---
+name:            store_to_postinc
+body:             |
+  bb.0:
+    liveins: $p0, $r0
+    ; CHECK-LABEL: name: store_to_postinc
+    ; CHECK: liveins: $p0, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY1]](s32), [[COPY]], [[TRUNC]](s20) :: (store (s32))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_STORE]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %2:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %2
+    G_STORE %1, %0 :: (store (s32))
+    %3:_(p0) = G_PTR_ADD %0, %4
+    $p0 = COPY %3
+...
+
+---
+name:            store_to_postinc_ptr_add_before
+body:             |
+  bb.0:
+    liveins: $p0, $r0
+    ; CHECK-LABEL: name: store_to_postinc_ptr_add_before
+    ; CHECK: liveins: $p0, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 24
+    ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY1]](s32), [[COPY]], [[C]](s20) :: (store (s32))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_STORE]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %2:_(s20) = G_CONSTANT i20 24
+    %3:_(p0) = G_PTR_ADD %0, %2
+    G_STORE %1, %0 :: (store (s32))
+    $p0 = COPY %3
+...
+
+# The G_PTR_ADD result is used ($p0 = COPY) before the store, so we cannot just
+# move the pointer add down to the store. Our combine code is not able to move
+# the memory operation up either, so we don't combine. This could be improved.
+---
+name:            store_not_to_postinc_ptr_add_before
+body:             |
+  bb.0:
+    liveins: $p0, $r0
+    ; CHECK-LABEL: name: store_not_to_postinc_ptr_add_before
+    ; CHECK: liveins: $p0, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 24
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p0) :: (store (s32))
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %2:_(s20) = G_CONSTANT i20 24
+    %3:_(p0) = G_PTR_ADD %0, %2
+    $p0 = COPY %3
+    G_STORE %1, %0 :: (store (s32))
+...
+
+---
+name:            store_not_to_postinc_def_use_dependency
+body:             |
+  bb.0:
+    liveins: $p0, $r0
+    ; CHECK-LABEL: name: store_not_to_postinc_def_use_dependency
+    ; CHECK: liveins: $p0, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 24
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+    ; CHECK-NEXT: G_STORE [[PTR_ADD]](p0), [[COPY]](p0) :: (store (s20), align 4)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s20) = G_CONSTANT i20 24
+    %2:_(p0) = G_PTR_ADD %0, %1
+    G_STORE %2, %0 :: (store (s20))
+    $p0 = COPY %2
+...
+
+---
+name:            store_to_postinc_move_offset
+body:             |
+  bb.0:
+    liveins: $p0, $r0
+    ; CHECK-LABEL: name: store_to_postinc_move_offset
+    ; CHECK: liveins: $p0, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY1]](s32), [[COPY]], [[TRUNC]](s20) :: (store (s32))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_STORE]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    G_STORE %1, %0 :: (store (s32))
+    %2:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %2
+    %3:_(p0) = G_PTR_ADD %0, %4
+    $p0 = COPY %3
+...
+
+---
+name:            store_not_to_postinc_cannot_move_offset
+body:             |
+  bb.0:
+    liveins: $p0, $r0
+    ; CHECK-LABEL: name: store_not_to_postinc_cannot_move_offset
+    ; CHECK: liveins: $p0, $r0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+    ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY2]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    G_STORE %1, %0 :: (store (s32))
+    %10:_(p0) = COPY $p1
+    G_STORE %1, %10 :: (store (s32))
+    %2:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %2
+    %3:_(p0) = G_PTR_ADD %0, %4
+    $p0 = COPY %3
+...
+
+---
+name:            zextload_to_postinc
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: zextload_to_postinc
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[AIE_POSTINC_ZEXTLOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_ZEXTLOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_ZEXTLOAD [[COPY]], [[TRUNC]](s20) :: (load (s8))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_ZEXTLOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_ZEXTLOAD1]](p0)
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %1
+    %3:_(s32) = G_ZEXTLOAD %0 :: (load (s8))
+    %2:_(p0) = G_PTR_ADD %0, %4
+    $r0 = COPY %3
+    $p0 = COPY %2
+...
+
+---
+name:            different_bb_memop_dominating
+body:             |
+  ; CHECK-LABEL: name: different_bb_memop_dominating
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   $p0 = COPY [[PTR_ADD]](p0)
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  bb.0:
+    liveins: $p0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %1
+    %2:_(p0) = G_PTR_ADD %0, %4
+
+  bb.1:
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    $p0 = COPY %2
+    $r0 = COPY %3
+...
+
+---
+name:            same_bb_ptradd_uses_different_bb
+body:             |
+  ; CHECK-LABEL: name: same_bb_ptradd_uses_different_bb
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32))
+  ; CHECK-NEXT:   $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+  bb.0:
+    liveins: $p0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %1
+
+  bb.1:
+    %2:_(p0) = G_PTR_ADD %0, %4
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    $r0 = COPY %3
+
+  bb.2:
+    $p0 = COPY %2
+...
+
+---
+name:            different_bb_memop_dominating_offset_different_bb
+body:             |
+  ; CHECK-LABEL: name: different_bb_memop_dominating_offset_different_bb
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   $p0 = COPY [[PTR_ADD]](p0)
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  bb.0:
+    %1:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %1
+
+  bb.1:
+    liveins: $p0
+    %0:_(p0) = COPY $p0
+    %2:_(p0) = G_PTR_ADD %0, %4
+
+  bb.2:
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    $p0 = COPY %2
+    $r0 = COPY %3
+...
+
+---
+name:            different_bb_ptradd_dominating
+body:             |
+  ; CHECK-LABEL: name: different_bb_ptradd_dominating
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+  ; CHECK-NEXT:   $p0 = COPY [[PTR_ADD]](p0)
+  bb.0:
+    G_BR %bb.2
+
+  bb.1:
+    liveins: $p0
+    %0:_(p0) = COPY $p0
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    $r0 = COPY %3
+
+  bb.2:
+    %1:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %1
+    %2:_(p0) = G_PTR_ADD %0, %4
+    $p0 = COPY %2
+...
+
+---
+name:            dominatin_ptradd_use
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: dominatin_ptradd_use
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32)
+    %1:_(s32) = G_CONSTANT i32 24
+    %4:_(s20) = G_TRUNC %1
+    %0:_(p0) = COPY $p0
+    %2:_(p0) = G_PTR_ADD %0, %4
+    $p0 = COPY %2
+    %3:_(s32) = G_LOAD %0 :: (load (s32))
+    $r0 = COPY %3
+...
+
+---
+name:            two_ptradds
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: two_ptradds
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s20) = G_TRUNC [[C1]](s32)
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC1]](s20) :: (load (s32))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+    ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0)
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+    %2:_(p0) = COPY $p0
+    %7:_(s32) = G_LOAD %2 :: (load (s32))
+    %0:_(s32) = G_CONSTANT i32 24
+    %1:_(s20) = G_TRUNC %0
+    %3:_(p0) = G_PTR_ADD %2, %1
+    %4:_(s32) = G_CONSTANT i32 24
+    %5:_(s20) = G_TRUNC %4
+    %6:_(p0) = G_PTR_ADD %2, %5
+    $p0 = COPY %6
+    $p1 = COPY %3
+    $r0 = COPY %7
+...
+
+---
+name:            post_inc_ignore_dead_ptr_add
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: post_inc_ignore_dead_ptr_add
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32)
+    %0:_(p0) = COPY $p0
+    %1:_(s20) = G_CONSTANT i20 24
+    %2:_(s32) = G_LOAD %0 :: (load (s32))
+    %3:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %2
+...
+
+# Try moving ptr_add up for post-increment combines
+---
+name: move_ptr_add_up
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: move_ptr_add_up
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32))
+    ; CHECK-NEXT: G_STORE [[AIE_POSTINC_LOAD]](s32), [[COPY1]](p0) :: (store (s32))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $wl0, implicit [[AIE_POSTINC_LOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+      G_STORE %4, %6 :: (store (s32))
+      %3:_(p0) = G_PTR_ADD %0, %1
+      %10:_(p0) = COPY %0
+      PseudoRET implicit $lr, implicit $wl0, implicit %3
+...
+
+# When moving PTR_ADDs up we have to be able to move the G_CONSTANT up too
+---
+name: move_g_constant
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: move_g_constant
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32))
+    ; CHECK-NEXT: G_STORE [[AIE_POSTINC_LOAD]](s32), [[COPY1]](p0) :: (store (s32))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $wl0, implicit [[AIE_POSTINC_LOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+      G_STORE %4, %6 :: (store (s32))
+      %1:_(s20) = G_CONSTANT i20 64
+      %3:_(p0) = G_PTR_ADD %0, %1
+      %10:_(p0) = COPY %0
+      PseudoRET implicit $lr, implicit $wl0, implicit %3
+...
+
+# When moving PTR_ADDs up we have to be able to move the G_CONSTANT up too. The
+# G_CONSTANT defining the intervening store's pointer does not block combining.
+---
+name: move_g_constant_ptr_g_constant
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: move_g_constant_ptr_g_constant
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(p0) = G_CONSTANT i20 123
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C1]](s20) :: (load (s32))
+    ; CHECK-NEXT: G_STORE [[AIE_POSTINC_LOAD]](s32), [[C]](p0) :: (store (s32))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $wl0, implicit [[AIE_POSTINC_LOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = G_CONSTANT i20 123
+      %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+      G_STORE %4, %6 :: (store (s32))
+      %1:_(s20) = G_CONSTANT i20 64
+      %3:_(p0) = G_PTR_ADD %0, %1
+      %10:_(p0) = COPY %0
+      PseudoRET implicit $lr, implicit $wl0, implicit %3
+...
+
+# Test G_CONSTANT move up with postinc_2d
+---
+name: postinc_2d_move_g_constant
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: postinc_2d_move_g_constant
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C1]], [[C2]], [[C3]] :: (load (<32 x s8>))
+    ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_2D_LOAD]](<32 x s8>)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %6:_(<32 x s8>) = G_LOAD %0(p0) :: (load (<32 x s8>))
+      $wl0 = COPY %6(<32 x s8>)
+      %2:_(s20) = G_CONSTANT i20 128
+      %3:_(s20) = G_CONSTANT i20 0
+      %4:_(s20) = G_CONSTANT i20 32
+      %5:_(p0), %8:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %0:_(p0), %1:_(s20), %2:_(s20), %3:_(s20), %4:_(s20)
+      $p0 = COPY %5
+...
+
+---
+name: zextload_postinc_2d
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: zextload_postinc_2d
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[AIE_POSTINC_2D_ZEXTLOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_2D_ZEXTLOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_ZEXTLOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_ZEXTLOAD [[COPY]], [[C]], [[C1]], [[C2]], [[C3]] :: (load (s8))
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_2D_ZEXTLOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_ZEXTLOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %6:_(s32) = G_ZEXTLOAD %0(p0) :: (load (s8))
+      $r0 = COPY %6(s32)
+      %2:_(s20) = G_CONSTANT i20 128
+      %3:_(s20) = G_CONSTANT i20 0
+      %4:_(s20) = G_CONSTANT i20 32
+      %5:_(p0), %8:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %0:_(p0), %1:_(s20), %2:_(s20), %3:_(s20), %4:_(s20)
+      $p0 = COPY %5
+...
+
+---
+name: sextload_postinc_2d
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sextload_postinc_2d
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[AIE_POSTINC_2D_SEXTLOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_2D_SEXTLOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_SEXTLOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_SEXTLOAD [[COPY]], [[C]], [[C1]], [[C2]], [[C3]] :: (load (s20), align 4)
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_2D_SEXTLOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_SEXTLOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %6:_(s32) = G_SEXTLOAD %0(p0) :: (load (s20))
+      $r0 = COPY %6(s32)
+      %2:_(s20) = G_CONSTANT i20 128
+      %3:_(s20) = G_CONSTANT i20 0
+      %4:_(s20) = G_CONSTANT i20 32
+      %5:_(p0), %8:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %0:_(p0), %1:_(s20), %2:_(s20), %3:_(s20), %4:_(s20)
+      $p0 = COPY %5
+...
+
+---
+name: sextload_postinc_3d
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sextload_postinc_3d
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[AIE_POSTINC_3D_SEXTLOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_SEXTLOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_SEXTLOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_SEXTLOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_SEXTLOAD [[COPY]], [[C]], [[C1]], [[C2]], [[C3]], [[C4]], [[C5]], [[C6]] :: (load (s20), align 4)
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_3D_SEXTLOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_SEXTLOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %9:_(s32) = G_SEXTLOAD %0(p0) :: (load (s20))
+      $r0 = COPY %9(s32)
+      %2:_(s20) = G_CONSTANT i20 128
+      %3:_(s20) = G_CONSTANT i20 0
+      %4:_(s20) = G_CONSTANT i20 32
+      %5:_(s20) = G_CONSTANT i20 128
+      %6:_(s20) = G_CONSTANT i20 0
+      %7:_(s20) = G_CONSTANT i20 32
+      %8:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %2:_(s20), %3:_(s20), %4:_(s20), %5:_(s20), %6:_(s20), %7:_(s20)
+      $p0 = COPY %8
+...
+
+---
+name: zextload_postinc_3d
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: zextload_postinc_3d
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[AIE_POSTINC_3D_ZEXTLOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_ZEXTLOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_ZEXTLOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_ZEXTLOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_ZEXTLOAD [[COPY]], [[C]], [[C1]], [[C2]], [[C3]], [[C4]], [[C5]], [[C6]] :: (load (s20), align 4)
+    ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_3D_ZEXTLOAD]](s32)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_ZEXTLOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %9:_(s32) = G_ZEXTLOAD %0(p0) :: (load (s20))
+      $r0 = COPY %9(s32)
+      %2:_(s20) = G_CONSTANT i20 128
+      %3:_(s20) = G_CONSTANT i20 0
+      %4:_(s20) = G_CONSTANT i20 32
+      %5:_(s20) = G_CONSTANT i20 128
+      %6:_(s20) = G_CONSTANT i20 0
+      %7:_(s20) = G_CONSTANT i20 32
+      %8:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %2:_(s20), %3:_(s20), %4:_(s20), %5:_(s20), %6:_(s20), %7:_(s20)
+      $p0 = COPY %8
+...
+
+# Test that G_CONSTANTs below the load are moved up when combining postinc_3d.
+---
+name: postinc_3d_move_g_constant
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: postinc_3d_move_g_constant
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
+    ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C1]], [[C2]], [[C3]], [[C4]], [[C5]], [[C6]] :: (load (<32 x s8>))
+    ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_3D_LOAD]](<32 x s8>)
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %9:_(<32 x s8>) = G_LOAD %0(p0) :: (load (<32 x s8>))
+      $wl0 = COPY %9(<32 x s8>)
+      %2:_(s20) = G_CONSTANT i20 128
+      %3:_(s20) = G_CONSTANT i20 0
+      %4:_(s20) = G_CONSTANT i20 32
+      %5:_(s20) = G_CONSTANT i20 128
+      %6:_(s20) = G_CONSTANT i20 0
+      %7:_(s20) = G_CONSTANT i20 32
+      %8:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %2:_(s20), %3:_(s20), %4:_(s20), %5:_(s20), %6:_(s20), %7:_(s20)
+      $p0 = COPY %8
+...
+
+---
+name: offset_combine_vectors
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: offset_combine_vectors
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s8>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<32 x s8>))
+    ; CHECK-NEXT: $wl0 = COPY [[AIE_OFFSET_LOAD]](<32 x s8>)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %3:_(p0) = G_PTR_ADD %0, %1
+      %4:_(<32 x s8>) = G_LOAD %3(p0) :: (load (<32 x s8>))
+      $wl0 = COPY %4(<32 x s8>)
+...
+
+---
+name: preinc_combine_vectors_512_bits
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: preinc_combine_vectors_512_bits
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<32 x s16>))
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: G_AIE_OFFSET_STORE [[AIE_OFFSET_LOAD]](<32 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<32 x s16>))
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %3:_(p0) = G_PTR_ADD %0, %1
+      %4:_(<32 x s16>) = G_LOAD %3(p0) :: (load (<32 x s16>))
+      %5:_(p0) = COPY $p0
+      %6:_(s20) = G_CONSTANT i20 64
+      %7:_(p0) = G_PTR_ADD %5, %6
+      G_STORE %4:_(<32 x s16>), %7(p0) :: (store (<32 x s16>))
+...
+
+---
+name: preinc_combine_vectors_1024_bits
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: preinc_combine_vectors_1024_bits
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<32 x s32>))
+    ; CHECK-NEXT: $y2 = COPY [[AIE_OFFSET_LOAD]](<32 x s32>)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %3:_(p0) = G_PTR_ADD %0, %1
+      %4:_(<32 x s32>) = G_LOAD %3(p0) :: (load (<32 x s32>))
+      $y2 = COPY %4(<32 x s32>)
+...
+
+---
+name: preinc_combine_vectors_2048_bits
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: preinc_combine_vectors_2048_bits
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s64>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<32 x s64>))
+    ; CHECK-NEXT: $dm0 = COPY [[AIE_OFFSET_LOAD]](<32 x s64>)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %3:_(p0) = G_PTR_ADD %0, %1
+      %4:_(<32 x s64>) = G_LOAD %3(p0) :: (load (<32 x s64>))
+      $dm0 = COPY %4(<32 x s64>)
+...
+
+# If the original pointer is used after the ptr_add (in this case the implicit
+# use of %0 in bb.1), do not combine into a post-increment, because that would
+# require an additional COPY to preserve the original pointer.
+# Note: If the copy is inevitable (as it is here), we might as well combine.
+# Our current implementation does not consider this case.
+---
+name: not_combine_postinc_later_use
+body: |
+  ; CHECK-LABEL: name: not_combine_postinc_later_use
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s20)
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit [[PTR_ADD]](p0), implicit [[PTR_ADD1]](p0), implicit [[COPY]](p0)
+  bb.0:
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(s20) = G_CONSTANT i20 64
+      %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+      %3:_(p0) = G_PTR_ADD %0, %1
+      %5:_(p0) = G_PTR_ADD %0, %2
+      $r0 = COPY %4(s32)
+
+  bb.1:
+      PseudoRET implicit $lr, implicit %3(p0), implicit %5, implicit %0
+...
+
+# The following tests check the behaviour of the combiner when there are uses
+# in basic blocks other than the one holding the instruction to be combined.
+---
+name: postinc_bb_0_use_bb_1
+body: |
+  ; CHECK-LABEL: name: postinc_bb_0_use_bb_1
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  ; CHECK-NEXT:   PseudoJNZ $r1, %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $p0 = COPY [[COPY]](p0)
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit [[PTR_ADD]](p0)
+  bb.0:
+    %0:_(p0) = COPY $p0
+    %1:_(s20) = G_CONSTANT i20 64
+    %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+    %3:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %4(s32)
+    PseudoJNZ $r1, %bb.2
+
+  bb.1:
+    $p0 = COPY %0
+    PseudoJ_jump_imm %bb.3
+
+  bb.2:
+    PseudoJ_jump_imm %bb.3
+
+  bb.3:
+    PseudoRET implicit $lr, implicit %3
+...
+
+---
+name: postinc_bb_0_use_bb_3
+body: |
+  ; CHECK-LABEL: name: postinc_bb_0_use_bb_3
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  ; CHECK-NEXT:   PseudoJNZ $r1, %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   $p0 = COPY [[COPY]](p0)
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit [[PTR_ADD]](p0)
+  bb.0:
+    %0:_(p0) = COPY $p0
+    %1:_(s20) = G_CONSTANT i20 64
+    %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+    %3:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %4(s32)
+    PseudoJNZ $r1, %bb.2
+
+  bb.1:
+    PseudoJ_jump_imm %bb.3
+
+  bb.2:
+    PseudoJ_jump_imm %bb.3
+
+  bb.3:
+    $p0 = COPY %0
+    PseudoRET implicit $lr, implicit %3
+...
+
+---
+name: postinc_bb_1_use_bb_0
+body: |
+  ; CHECK-LABEL: name: postinc_bb_1_use_bb_0
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   $p0 = COPY [[COPY]](p0)
+  ; CHECK-NEXT:   PseudoJNZ $r1, %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32))
+  ; CHECK-NEXT:   $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0)
+  bb.0:
+    %0:_(p0) = COPY $p0
+    $p0 = COPY %0
+    PseudoJNZ $r1, %bb.2
+
+  bb.1:
+    %1:_(s20) = G_CONSTANT i20 64
+    %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+    %3:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %4(s32)
+    PseudoJ_jump_imm %bb.3
+
+  bb.2:
+    PseudoJ_jump_imm %bb.3
+
+  bb.3:
+    PseudoRET implicit $lr, implicit %3
+...
+
+# In this case we would want to combine into a post-increment load.
+# However, the current heuristic is too conservative: since the use in bb.2
+# does not dominate the combined instruction in bb.1, it aborts the combine.
+---
+name: postinc_bb_1_use_bb_2
+body: |
+  ; CHECK-LABEL: name: postinc_bb_1_use_bb_2
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   PseudoJNZ $r1, %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $p0 = COPY [[COPY]](p0)
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit [[PTR_ADD]](p0)
+  bb.0:
+    %0:_(p0) = COPY $p0
+    PseudoJNZ $r1, %bb.2
+
+  bb.1:
+    %1:_(s20) = G_CONSTANT i20 64
+    %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+    %3:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %4(s32)
+    PseudoJ_jump_imm %bb.3
+
+  bb.2:
+    $p0 = COPY %0
+    PseudoJ_jump_imm %bb.3
+
+  bb.3:
+    PseudoRET implicit $lr, implicit %3
+...
+
+---
+name: postinc_bb_1_use_bb_3
+body: |
+  ; CHECK-LABEL: name: postinc_bb_1_use_bb_3
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   PseudoJNZ $r1, %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   $p0 = COPY [[COPY]](p0)
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit [[PTR_ADD]](p0)
+  bb.0:
+    %0:_(p0) = COPY $p0
+    PseudoJNZ $r1, %bb.2
+
+  bb.1:
+    %1:_(s20) = G_CONSTANT i20 64
+    %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+    %3:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %4(s32)
+    PseudoJ_jump_imm %bb.3
+
+  bb.2:
+    PseudoJ_jump_imm %bb.3
+
+  bb.3:
+    $p0 = COPY %0
+    PseudoRET implicit $lr, implicit %3
+...
+
+---
+name: postinc_bb_3_use_bb_0
+body: |
+  ; CHECK-LABEL: name: postinc_bb_3_use_bb_0
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   $p0 = COPY [[COPY]](p0)
+  ; CHECK-NEXT:   PseudoJNZ $r1, %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32))
+  ; CHECK-NEXT:   $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0)
+  bb.0:
+    %0:_(p0) = COPY $p0
+    $p0 = COPY %0
+    PseudoJNZ $r1, %bb.2
+
+  bb.1:
+    PseudoJ_jump_imm %bb.3
+
+  bb.2:
+    PseudoJ_jump_imm %bb.3
+
+  bb.3:
+    %1:_(s20) = G_CONSTANT i20 64
+    %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+    %3:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %4(s32)
+    PseudoRET implicit $lr, implicit %3
+...
+
+# In this case we would want to combine into a post-increment load.
+# However, the current heuristic is too conservative: since the use in bb.1
+# does not dominate the combined instruction in bb.3, it aborts the combine.
+---
+name: postinc_bb_3_use_bb_1
+body: |
+  ; CHECK-LABEL: name: postinc_bb_3_use_bb_1
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   PseudoJNZ $r1, %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $p0 = COPY [[COPY]](p0)
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoJ_jump_imm %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   $r0 = COPY [[LOAD]](s32)
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit [[PTR_ADD]](p0)
+  bb.0:
+    %0:_(p0) = COPY $p0
+    PseudoJNZ $r1, %bb.2
+
+  bb.1:
+    $p0 = COPY %0
+    PseudoJ_jump_imm %bb.3
+
+  bb.2:
+    PseudoJ_jump_imm %bb.3
+
+  bb.3:
+    %1:_(s20) = G_CONSTANT i20 64
+    %4:_(s32) = G_LOAD %0(p0) :: (load (s32))
+    %3:_(p0) = G_PTR_ADD %0, %1
+    $r0 = COPY %4(s32)
+    PseudoRET implicit $lr, implicit %3
+...
+
+---
+name: vector_256_combine_postinc
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vector_256_combine_postinc
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s8>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20)
+    ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s8>), [[COPY1]], [[C1]](s20) :: (store (<32 x s8>))
+    ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0)
+    ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0)
+    ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_LOAD]](<32 x s8>)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(s20) = G_CONSTANT i20 64
+      %4:_(<32 x s8>) = G_LOAD %0(p0) :: (load (<32 x s8>))
+      G_STORE %4, %6 :: (store (<32 x s8>))
+      %3:_(p0) = G_PTR_ADD %0, %1
+      $p0 = COPY %3
+      %7:_(p0) = G_PTR_ADD %6, %1
+      $p1 = COPY %7
+      %5:_(p0) = G_PTR_ADD %6, %2
+      $p2 = COPY %5
+      $wl0 = COPY %4(<32 x s8>)
+...
+
+---
+name: vector_256_combine_postinc_2d
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vector_256_combine_postinc_2d
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s8>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0)
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), [[COPY1]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20)
+    ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>))
+    ; CHECK-NEXT: $p1 = COPY [[INT]](p0)
+    ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_2D_STORE]](p0)
+    ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_2D_LOAD]](<32 x s8>)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %4:_(<32 x s8>) = G_LOAD %0(p0) :: (load (<32 x s8>))
+      G_STORE %4, %6 :: (store (<32 x s8>))
+      %3:_(p0), %8:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p0 = COPY %3
+      %7:_(p0), %9:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p1 = COPY %7
+      %5:_(p0), %10:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p2 = COPY %5
+      $wl0 = COPY %4(<32 x s8>)
+...
+
+---
+name: vector_256_combine_postinc_3d
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vector_256_combine_postinc_3d
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s8>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0)
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY1]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20)
+    ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>))
+    ; CHECK-NEXT: $p1 = COPY [[INT]](p0)
+    ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_3D_STORE]](p0)
+    ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_3D_LOAD]](<32 x s8>)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %4:_(<32 x s8>) = G_LOAD %0(p0) :: (load (<32 x s8>))
+      G_STORE %4, %6 :: (store (<32 x s8>))
+      %3:_(p0), %8:_(s20), %9:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p0 = COPY %3
+      %7:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p1 = COPY %7
+      %5:_(p0), %12:_(s20), %13:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p2 = COPY %5
+      $wl0 = COPY %4(<32 x s8>)
+...
+
+---
+name: vector_256_combine_postinc_move_ptr_add
+body: |
+  ; CHECK-LABEL: name: vector_256_combine_postinc_move_ptr_add
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s8>))
+  ; CHECK-NEXT:   G_STORE [[AIE_POSTINC_LOAD]](<32 x s8>), [[COPY1]](p0) :: (store (<32 x s8>))
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit $wl0, implicit [[AIE_POSTINC_LOAD1]](p0), implicit [[AIE_POSTINC_LOAD]](<32 x s8>)
+  bb.0:
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %4:_(<32 x s8>) = G_LOAD %0(p0) :: (load (<32 x s8>))
+      G_STORE %4, %6 :: (store (<32 x s8>))
+      %3:_(p0) = G_PTR_ADD %0, %1
+
+  bb.1:
+      PseudoRET implicit $lr, implicit $wl0, implicit %3, implicit %4
+...
+
+---
+name: vector_512_combine_postinc
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vector_512_combine_postinc
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s16>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20)
+    ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s16>), [[COPY1]], [[C1]](s20) :: (store (<32 x s16>))
+    ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0)
+    ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(s20) = G_CONSTANT i20 64
+      %4:_(<32 x s16>) = G_LOAD %0(p0) :: (load (<32 x s16>))
+      G_STORE %4, %6 :: (store (<32 x s16>))
+      %3:_(p0) = G_PTR_ADD %0, %1
+      $p0 = COPY %3
+      %7:_(p0) = G_PTR_ADD %6, %1
+      $p1 = COPY %7
+      %5:_(p0) = G_PTR_ADD %6, %2
+      $p2 = COPY %5
+...
+
+---
+name: vector_512_combine_postinc_2d
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vector_512_combine_postinc_2d
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>))
+    ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0)
+    ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_2D_STORE]](p0)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %4:_(<32 x s16>) = G_LOAD %0(p0) :: (load (<32 x s16>))
+      G_STORE %4, %6 :: (store (<32 x s16>))
+      %3:_(p0), %8:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p0 = COPY %3
+      %7:_(p0), %9:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p1 = COPY %7
+...
+
+---
+name: vector_512_combine_postinc_3d
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vector_512_combine_postinc_3d
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>))
+    ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0)
+    ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_3D_STORE]](p0)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %4:_(<32 x s16>) = G_LOAD %0(p0) :: (load (<32 x s16>))
+      G_STORE %4, %6 :: (store (<32 x s16>))
+      %3:_(p0), %8:_(s20), %9:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p0 = COPY %3
+      %7:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %6:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p1 = COPY %7
+...
+
+---
+name: vector_1024_combine_postinc
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vector_1024_combine_postinc
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s32>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s32>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20)
+    ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s32>), [[COPY1]], [[C1]](s20) :: (store (<32 x s32>))
+    ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0)
+    ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(s20) = G_CONSTANT i20 64
+      %4:_(<32 x s32>) = G_LOAD %0(p0) :: (load (<32 x s32>))
+      G_STORE %4, %6 :: (store (<32 x s32>))
+      %3:_(p0) = G_PTR_ADD %0, %1
+      $p0 = COPY %3
+      %7:_(p0) = G_PTR_ADD %6, %1
+      $p1 = COPY %7
+      %5:_(p0) = G_PTR_ADD %6, %2
+      $p2 = COPY %5
+...
+
+---
+name: vector_2048_combine_postinc
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vector_2048_combine_postinc
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<64 x s32>) = G_LOAD [[COPY]](p0) :: (load (<64 x s32>))
+    ; CHECK-NEXT: G_STORE [[LOAD]](<64 x s32>), [[COPY1]](p0) :: (store (<64 x s32>))
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+    ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0)
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20)
+    ; CHECK-NEXT: $p1 = COPY [[PTR_ADD1]](p0)
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C1]](s20)
+    ; CHECK-NEXT: $p2 = COPY [[PTR_ADD2]](p0)
+      %0:_(p0) = COPY $p0
+      %6:_(p0) = COPY $p1
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(s20) = G_CONSTANT i20 64
+      %4:_(<64 x s32>) = G_LOAD %0(p0) :: (load (<64 x s32>))
+      G_STORE %4, %6 :: (store (<64 x s32>))
+      %3:_(p0) = G_PTR_ADD %0, %1
+      $p0 = COPY %3
+      %7:_(p0) = G_PTR_ADD %6, %1
+      $p1 = COPY %7
+      %5:_(p0) = G_PTR_ADD %6, %2
+      $p2 = COPY %5
+...
+
+---
+
+name: offset_combine_128bit_load
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: offset_combine_128bit_load
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<4 x s32>))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_OFFSET_LOAD]](<4 x s32>)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(p0) = G_PTR_ADD %0, %1
+      %3:_(<4 x s32>) = G_LOAD %2(p0) :: (load (<4 x s32>))
+      PseudoRET implicit $lr, implicit %3
+...
+
+---
+name: postinc_combine_128bit_load
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: postinc_combine_128bit_load
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<4 x s32>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<4 x s32>))
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $wl0, implicit [[AIE_POSTINC_LOAD]](<4 x s32>), implicit [[AIE_POSTINC_LOAD1]](p0)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(<4 x s32>) = G_LOAD %0(p0) :: (load (<4 x s32>))
+      %3:_(p0) = G_PTR_ADD %0, %1
+      PseudoRET implicit $lr, implicit $wl0, implicit %2, implicit %3
+...
+
+---
+name: postinc_2d_combine_128bit_load
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: postinc_2d_combine_128bit_load
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<16 x s8>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<16 x s8>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0)
+    ; CHECK-NEXT: $q0 = COPY [[AIE_POSTINC_2D_LOAD]](<16 x s8>)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(<16 x s8>) = G_LOAD %0(p0) :: (load (<16 x s8>))
+      %3:_(p0), %4:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p0 = COPY %3
+      $q0 = COPY %2(<16 x s8>)
+...
+
+---
+name: postinc_3d_combine_128bit_load
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: postinc_3d_combine_128bit_load
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<16 x s8>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<16 x s8>))
+    ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0)
+    ; CHECK-NEXT: $q0 = COPY [[AIE_POSTINC_3D_LOAD]](<16 x s8>)
+      %0:_(p0) = COPY $p0
+      %1:_(s20) = G_CONSTANT i20 64
+      %2:_(<16 x s8>) = G_LOAD %0(p0) :: (load (<16 x s8>))
+      %3:_(p0), %4:_(s20), %5:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20)
+      $p0 = COPY %3
+      $q0 = COPY %2(<16 x s8>)
+...
diff --git a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll
index db8a52529a4f..0203db64b11e 100644
--- a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll
@@ -15,8 +15,8 @@ define dso_local void @_Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t(ptr nocap
 ; CHECK-LABEL: _Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
-; CHECK-NEXT:    lda r24, [p1, dj0]; nopx
+; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -28,10 +28,13 @@ define dso_local void @_Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t(ptr nocap
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r24, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x1, lfh0
+; CHECK-NEXT:    ret lr; vmov x0, lfl0
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -53,8 +56,8 @@ define dso_local noundef <64 x i8> @_Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state
 ; CHECK-LABEL: _Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state_t:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
-; CHECK-NEXT:    lda r24, [p1, dj0]; nopx
+; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -66,10 +69,13 @@ define dso_local noundef <64 x i8> @_Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r24, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -92,7 +98,7 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fi
 ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fifo_state_ti:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128
 ; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -105,10 +111,13 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fi
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r24, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -132,9 +141,9 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fi
 ; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda dc0, [p2, #0]; nopxm
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r24, [p1, dj1]
+; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mov m0, r0
@@ -146,10 +155,13 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fi
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p2, #0]; ret lr
-; CHECK-NEXT:    st r24, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    st dc0, [p2, #0]; ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p3, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -181,24 +193,27 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fi
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv
-; CHECK-NEXT:    lda dc4, [p3, #0]
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r24, [p1, dj1]
+; CHECK-NEXT:    lda dc4, [p3, #0]; nopx
+; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    mov m0, r0
 ; CHECK-NEXT:    mov dn0, r1
-; CHECK-NEXT:    mov dj0, r2
+; CHECK-NEXT:    mov dn4, r3
 ; CHECK-NEXT:    mov p4, p0
-; CHECK-NEXT:    vlda lfl0, [p1, #0]; mov dn4, r3
-; CHECK-NEXT:    vlda lfh0, [p1, #64]; mov dj4, r4
+; CHECK-NEXT:    vlda lfl0, [p1, #0]; mov dj4, r4
+; CHECK-NEXT:    vlda lfh0, [p1, #64]; mov dj0, r2
 ; CHECK-NEXT:    vldb.pop.512.3d x0, [p0, lf0, r24, d0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p2, #0]
-; CHECK-NEXT:    st dc4, [p3, #0]; ret lr
-; CHECK-NEXT:    st r24, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st dc0, [p2, #0]; vmov x3, lfh0
+; CHECK-NEXT:    st dc4, [p3, #0]; ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p4, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -237,8 +252,8 @@ define dso_local %struct.v64bfp16ebs8 @_Z16test_fifo_ld_popRP22v64bfp16ebs8_unal
 ; CHECK-LABEL: _Z16test_fifo_ld_popRP22v64bfp16ebs8_unalignedR12fifo_state_t:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
-; CHECK-NEXT:    lda r24, [p1, dj0]; nopx
+; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -250,10 +265,13 @@ define dso_local %struct.v64bfp16ebs8 @_Z16test_fifo_ld_popRP22v64bfp16ebs8_unal
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r24, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -279,7 +297,7 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16e
 ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_ti:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128
 ; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -292,10 +310,13 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16e
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r24, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -322,9 +343,9 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRP22v64bfp16e
 ; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_tiiRii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda dc0, [p2, #0]; nopxm
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r24, [p1, dj1]
+; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mov m0, r0
@@ -336,10 +357,13 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRP22v64bfp16e
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p2, #0]; ret lr
-; CHECK-NEXT:    st r24, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    st dc0, [p2, #0]; ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p3, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -374,24 +398,27 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRP22v64bfp16e
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv
-; CHECK-NEXT:    lda dc4, [p3, #0]
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r24, [p1, dj1]
+; CHECK-NEXT:    lda dc4, [p3, #0]; nopx
+; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    mov m0, r0
 ; CHECK-NEXT:    mov dn0, r1
-; CHECK-NEXT:    mov dj0, r2
+; CHECK-NEXT:    mov dn4, r3
 ; CHECK-NEXT:    mov p4, p0
-; CHECK-NEXT:    vlda lfl0, [p1, #0]; mov dn4, r3
-; CHECK-NEXT:    vlda lfh0, [p1, #64]; mov dj4, r4
+; CHECK-NEXT:    vlda lfl0, [p1, #0]; mov dj4, r4
+; CHECK-NEXT:    vlda lfh0, [p1, #64]; mov dj0, r2
 ; CHECK-NEXT:    vldb.pop.576.3d ex0, [p0, lf0, r24, d0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p2, #0]
-; CHECK-NEXT:    st dc4, [p3, #0]; ret lr
-; CHECK-NEXT:    st r24, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st dc0, [p2, #0]; vmov x3, lfh0
+; CHECK-NEXT:    st dc4, [p3, #0]; ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p4, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -433,8 +460,8 @@ define dso_local %struct.v64bfp16ebs16 @_Z16test_fifo_ld_popRP23v64bfp16ebs16_un
 ; CHECK-LABEL: _Z16test_fifo_ld_popRP23v64bfp16ebs16_unalignedR12fifo_state_t:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
-; CHECK-NEXT:    lda r24, [p1, dj0]; nopx
+; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -446,10 +473,13 @@ define dso_local %struct.v64bfp16ebs16 @_Z16test_fifo_ld_popRP23v64bfp16ebs16_un
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r24, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -475,7 +505,7 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_1d_byteRP23v64bfp16
 ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_ti:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128
 ; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -488,10 +518,13 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_1d_byteRP23v64bfp16
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r24, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -518,9 +551,9 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_2d_byteRP23v64bfp16
 ; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_tiiRii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda dc0, [p2, #0]; nopxm
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r24, [p1, dj1]
+; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mov m0, r0
@@ -532,10 +565,13 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_2d_byteRP23v64bfp16
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p2, #0]; ret lr
-; CHECK-NEXT:    st r24, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    st dc0, [p2, #0]; ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p3, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -570,24 +606,27 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_3d_byteRP23v64bfp16
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv
-; CHECK-NEXT:    lda dc4, [p3, #0]
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r24, [p1, dj1]
+; CHECK-NEXT:    lda dc4, [p3, #0]; nopx
+; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    mov m0, r0
 ; CHECK-NEXT:    mov dn0, r1
-; CHECK-NEXT:    mov dj0, r2
+; CHECK-NEXT:    mov dn4, r3
 ; CHECK-NEXT:    mov p4, p0
-; CHECK-NEXT:    vlda lfl0, [p1, #0]; mov dn4, r3
-; CHECK-NEXT:    vlda lfh0, [p1, #64]; mov dj4, r4
+; CHECK-NEXT:    vlda lfl0, [p1, #0]; mov dj4, r4
+; CHECK-NEXT:    vlda lfh0, [p1, #64]; mov dj0, r2
 ; CHECK-NEXT:    vldb.pop.544.3d ex0, [p0, lf0, r24, d0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p2, #0]
-; CHECK-NEXT:    st dc4, [p3, #0]; ret lr
-; CHECK-NEXT:    st r24, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst lfl0, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    st dc0, [p2, #0]; vmov x3, lfh0
+; CHECK-NEXT:    st dc4, [p3, #0]; ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p4, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -628,7 +667,7 @@ define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t
 ; CHECK-LABEL: _Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops
 ; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -643,10 +682,12 @@ define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t
 ; CHECK-NEXT:    vmov lfe, x0
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vst lfl0, [p1, #0]; ret lr
-; CHECK-NEXT:    st r24, [p1, dj0]; vmov x2, lfe // Delay Slot 5
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 4
-; CHECK-NEXT:    vst x2, [p1, #192] // Delay Slot 3
+; CHECK-NEXT:    vmov x2, lfe
+; CHECK-NEXT:    vmov x3, lfh0
+; CHECK-NEXT:    vst x2, [p1, #192]; ret lr; vmov x2, lfl0
+; CHECK-NEXT:    vst x3, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x2, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -673,8 +714,8 @@ define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_s
 ; CHECK-LABEL: _Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_state_tii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p0, [p0, #0]; nopb ; nops ; nopx ; mov dj0, #128; nopv
-; CHECK-NEXT:    lda r24, [p1, dj0]; nopx
+; CHECK-NEXT:    lda p0, [p0, #0]; nopx ; mov dj0, #128
+; CHECK-NEXT:    lda r24, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    vlda lfl0, [p1, #0]
@@ -688,10 +729,12 @@ define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_s
 ; CHECK-NEXT:    vmov lfe, x0
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    vst lfl0, [p1, #0]; ret lr
-; CHECK-NEXT:    st r24, [p1, dj0]; vmov x0, lfe // Delay Slot 5
-; CHECK-NEXT:    vst lfh0, [p1, #64] // Delay Slot 4
-; CHECK-NEXT:    vst x0, [p1, #192] // Delay Slot 3
+; CHECK-NEXT:    vmov x0, lfe
+; CHECK-NEXT:    vmov x1, lfh0
+; CHECK-NEXT:    vst x0, [p1, #192]; ret lr; vmov x0, lfl0
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r24, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p0, [p2, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
diff --git a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll
index a717eeb05134..1866580f47d8 100644
--- a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll
@@ -46,7 +46,7 @@ define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr no
 ; CHECK-LABEL: _Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopx
 ; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
 ; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]
@@ -56,10 +56,12 @@ define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr no
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    vst.push.512 x0, [p2, sf, r26]
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -83,7 +85,7 @@ define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr noca
 ; CHECK-LABEL: _Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopx
 ; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
 ; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]
@@ -93,10 +95,12 @@ define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr noca
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    vst.flush.512 [p2, sf, r26]
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -119,7 +123,7 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti
 ; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm
+; CHECK-NEXT:    vlda sfl, [p1, #0]
 ; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
 ; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]
@@ -129,10 +133,12 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mov m0, r0
 ; CHECK-NEXT:    vst.flush.512 [p2, sf, r26, m0]
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -156,9 +162,9 @@ define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_ti
 ; CHECK-LABEL: _Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nopxm
-; CHECK-NEXT:    lda p2, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r26, [p1, dj1]
+; CHECK-NEXT:    lda dc0, [p2, #0]
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    vlda sfl, [p1, #0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]; mov m0, r0
@@ -169,10 +175,11 @@ define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_ti
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p3, #0]; ret lr
-; CHECK-NEXT:    st r26, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    st dc0, [p3, #0]; ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -203,24 +210,24 @@ define dso_local void @_Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_ti
 ; CHECK-LABEL: _Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv
-; CHECK-NEXT:    lda dc0, [p2, #0]; nopx
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopxm
+; CHECK-NEXT:    lda dc0, [p2, #0]
 ; CHECK-NEXT:    lda dc4, [p3, #0]
-; CHECK-NEXT:    lda p2, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r26, [p1, dj1]
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]; mov m0, r0
 ; CHECK-NEXT:    mov dn0, r1
-; CHECK-NEXT:    mov dj0, r2
-; CHECK-NEXT:    mov p4, p2
 ; CHECK-NEXT:    mov dn4, r3
+; CHECK-NEXT:    mov p4, p2
 ; CHECK-NEXT:    mov dj4, r4
+; CHECK-NEXT:    mov dj0, r2
 ; CHECK-NEXT:    vst.flush.512.3d [p2, sf, r26, d0]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p4, #0]
-; CHECK-NEXT:    st dc4, [p3, #0]; ret lr
-; CHECK-NEXT:    st r26, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st dc0, [p4, #0]; vmov x1, sfh
+; CHECK-NEXT:    st dc4, [p3, #0]; ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -258,7 +265,7 @@ define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr
 ; CHECK-LABEL: _Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
 ; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    vlda sfl, [p1, #0]
@@ -269,10 +276,12 @@ define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr
 ; CHECK-NEXT:    vst.flush.512.conv [p2, sf, r26]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -295,8 +304,8 @@ define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_sta
 ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_state_ti:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128
-; CHECK-NEXT:    lda r26, [p1, dj0]
+; CHECK-NEXT:    lda p2, [p0, #0]; nopb ; nops ; nopx ; mov dj0, #128; nopv
+; CHECK-NEXT:    lda r26, [p1, dj0]; nopb ; nopx
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    vlda sfl, [p1, #0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]
@@ -306,10 +315,12 @@ define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_sta
 ; CHECK-NEXT:    vst.flush.512.conv [p2, sf, r26, m0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -333,10 +344,10 @@ define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_sta
 ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopx
 ; CHECK-NEXT:    lda dc0, [p2, #0]
-; CHECK-NEXT:    lda p2, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r26, [p1, dj1]
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mov m0, r0
@@ -345,10 +356,11 @@ define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_sta
 ; CHECK-NEXT:    mov dj0, r2
 ; CHECK-NEXT:    vst.flush.512.2d [p2, sf, r26, d0]
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p3, #0]; ret lr
-; CHECK-NEXT:    st r26, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    st dc0, [p3, #0]; ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -379,25 +391,25 @@ define dso_local void @_Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_sta
 ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nops ; nopxm ; nopv
+; CHECK-NEXT:    lda dc0, [p2, #0]; nopb ; nopx
 ; CHECK-NEXT:    lda dc4, [p3, #0]
-; CHECK-NEXT:    lda p2, [p0, #0]; mov dj1, #128
-; CHECK-NEXT:    lda r26, [p1, dj1]
+; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
+; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    mov m0, r0
 ; CHECK-NEXT:    vlda sfl, [p1, #0]; mov dn0, r1
-; CHECK-NEXT:    vlda sfh, [p1, #64]; mov dj0, r2
+; CHECK-NEXT:    vlda sfh, [p1, #64]; mov dn4, r3
 ; CHECK-NEXT:    mov p4, p2
-; CHECK-NEXT:    mov dn4, r3
 ; CHECK-NEXT:    mov dj4, r4
+; CHECK-NEXT:    mov dj0, r2
 ; CHECK-NEXT:    vst.flush.512.conv.3d [p2, sf, r26, d0]
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    st dc0, [p4, #0]
-; CHECK-NEXT:    st dc4, [p3, #0]; ret lr
-; CHECK-NEXT:    st r26, [p1, dj1] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    st dc0, [p4, #0]; vmov x1, sfh
+; CHECK-NEXT:    st dc4, [p3, #0]; ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -468,7 +480,7 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull ali
 ; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs16:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopx
 ; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
 ; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]
@@ -478,10 +490,12 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull ali
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    vst.push.544 ex0, [p2, sf, r26]
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry:
@@ -538,7 +552,7 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull alig
 ; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs8:
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    vlda sfl, [p1, #0]; nopb ; nopxm ; nops
+; CHECK-NEXT:    vlda sfl, [p1, #0]; nopx
 ; CHECK-NEXT:    lda p2, [p0, #0]; mov dj0, #128
 ; CHECK-NEXT:    lda r26, [p1, dj0]
 ; CHECK-NEXT:    vlda sfh, [p1, #64]
@@ -548,10 +562,12 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull alig
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    vst.push.576 ex0, [p2, sf, r26]
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    st r26, [p1, dj0] // Delay Slot 5
-; CHECK-NEXT:    vst sfl, [p1, #0] // Delay Slot 4
-; CHECK-NEXT:    vst sfh, [p1, #64] // Delay Slot 3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x1, sfh
+; CHECK-NEXT:    ret lr; vmov x0, sfl
+; CHECK-NEXT:    vst x1, [p1, #64] // Delay Slot 5
+; CHECK-NEXT:    vst x0, [p1], #128 // Delay Slot 4
+; CHECK-NEXT:    st r26, [p1, #0] // Delay Slot 3
 ; CHECK-NEXT:    st p2, [p0, #0] // Delay Slot 2
 ; CHECK-NEXT:    nop // Delay Slot 1
 entry: