diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index 024ecbd09b48..fb5431c74242 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -58,6 +58,10 @@ static cl::opt<int> PostPipelinerMaxII(
     "aie-postpipeliner-maxii", cl::init(40),
     cl::desc("[AIE] Maximum II to be tried in the post-ra pipeliner"));
 
+static cl::opt<int> PostPipelinerMaxTryII(
+    "aie-postpipeliner-maxtry-ii", cl::init(10),
+    cl::desc("[AIE] Maximum II steps to be tried in the post-ra pipeliner"));
+
 namespace llvm::AIE {
 
 void dumpInterBlock(const InterBlockEdges &Edges) {
@@ -600,6 +604,7 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) {
     auto &PostSWP = BS.getPostSWP();
     if (PostSWP.canAccept(*BS.TheBlock)) {
       BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock);
+      BS.FixPoint.IITries = 1;
       return BS.FixPoint.Stage = SchedulingStage::Pipelining;
     }
   }
@@ -614,7 +619,8 @@ SchedulingStage InterBlockScheduling::updatePipelining(BlockState &BS) {
 
   // Otherwise try a larger II.
   // We cut off at larger IIs to prevent excessive compilation time.
-  if (++BS.FixPoint.II <= PostPipelinerMaxII) {
+  if (++BS.FixPoint.II <= PostPipelinerMaxII &&
+      ++BS.FixPoint.IITries <= PostPipelinerMaxTryII) {
     return BS.FixPoint.Stage = SchedulingStage::Pipelining;
   }
 
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
index 25da239326c0..5b9b293031b4 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
@@ -148,6 +148,8 @@ class FixedpointState {
   int ResourceMargin = 0;
   // The II of the modulo schedule we are trying.
   int II = 0;
+  // The number of IIs tried so far, counting the initial (minimum) II
+  int IITries = 0;
   // Results from the convergence test
   int MaxLatencyExtent = 0;
   int MaxResourceExtent = 0;
diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
index 48f194d0878b..13368dff39c1 100644
--- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
+++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
@@ -17,6 +17,8 @@
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/Support/MathExtras.h"
+#include <limits>
+#include <string>
 
 #define DEBUG_TYPE "postpipeliner"
 #define DEBUG_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X)
@@ -200,6 +202,11 @@ void PostPipeliner::computeForward() {
   for (int K = 0; K < NInstr; K++) {
     auto &Me = Info[K];
     SUnit &SU = DAG->SUnits[K];
+    Me.Slots = getSlotCounts(*SU.getInstr(), TII);
+    // Accumulate the slots of Me and all data predecessors.
+    SlotCounts Slots(Me.Slots);
+    int PredEarliest = std::numeric_limits<int>::max();
+    int Count = 0;
     for (auto &Dep : SU.Preds) {
       if (Dep.getKind() != SDep::Data) {
         continue;
@@ -208,10 +215,18 @@ void PostPipeliner::computeForward() {
       assert(P < K);
       Me.Ancestors.insert(P);
       auto &Pred = Info[P];
+      Slots += Pred.Slots;
+      Count++;
+      PredEarliest = std::min(PredEarliest, Pred.Earliest);
       for (int Anc : Pred.Ancestors) {
         Me.Ancestors.insert(Anc);
       }
     }
+    // When we need more slots than we have data predecessors, we have local
+    // resource contention that we can safely account for in Earliest.
+    if (Count > 0 && Slots.max() > Count) {
+      Me.Earliest = std::max(Me.Earliest, PredEarliest + Slots.max() - 1);
+    }
     for (auto &Dep : SU.Succs) {
       auto *Succ = Dep.getSUnit();
       if (Succ->isBoundaryNode()) {
@@ -221,7 +236,6 @@ void PostPipeliner::computeForward() {
       const int NewEarliest = Me.Earliest + Dep.getSignedLatency();
       SInfo.Earliest = std::max(SInfo.Earliest, NewEarliest);
     }
-    Me.Slots = getSlotCounts(*SU.getInstr(), TII);
   }
 }
 
@@ -323,11 +337,15 @@ bool PostPipeliner::computeLoopCarriedParameters() {
   }
 
   // Save the static values for ease of reset
-  for (auto &N : Info) {
+  for (auto &N : Info.Nodes) {
     N.StaticEarliest = N.Earliest;
     N.StaticLatest = N.Latest;
   }
-  return true;
+  Info.compute();
+
+  // If no node can be scheduled in cycle 0, we must have a circuit that
+  // is longer than II
+  return Info.MinEarliest == 0;
 }
 
 int PostPipeliner::computeMinScheduleLength() const {
@@ -343,24 +361,23 @@ int PostPipeliner::computeMinScheduleLength() const {
   return MinLength;
 }
 
-void dumpGraph(int NInstr, const std::vector<NodeInfo> &Info,
-               ScheduleDAGInstrs *DAG) {
+void dumpGraph(const ScheduleInfo &Info, ScheduleDAGInstrs *DAG) {
   dbgs() << "digraph {\n";
 
-  for (int K = 0; K < NInstr; K++) {
+  for (int K = 0; K < Info.NInstr; K++) {
     auto &SU = DAG->SUnits[K];
     for (auto &Dep : SU.Succs) {
       auto *Succ = Dep.getSUnit();
       int S = Succ->NodeNum;
-      if (S % NInstr == K) {
+      if (S % Info.NInstr == K || Succ->isBoundaryNode()) {
         continue;
       }
 
       dbgs() << "\tSU" << K << " -> "
              << "SU" << S;
 
-      if (S >= NInstr) {
-        dbgs() << "_" << S % NInstr;
+      if (S >= Info.NInstr) {
+        dbgs() << "_" << S % Info.NInstr;
       }
       if (Dep.getKind() == SDep::Data) {
         dbgs() << " [color=red] ";
@@ -380,6 +397,25 @@ void dumpGraph(int NInstr, const std::vector<NodeInfo> &Info,
   dbgs() << "}\n";
 }
 
+void dumpIntervals(const ScheduleInfo &Info, int MinLength) {
+  dbgs() << "Intervals:\n";
+  for (int K = 0; K < Info.NInstr; K++) {
+    std::string Head = "SU" + std::to_string(K);
+    dbgs() << Head;
+    for (int I = static_cast<int>(Head.length()) - 6; I < MinLength; I++) {
+      if (I == 0) { // Separator in front of the I == 0 column.
+        dbgs() << "|";
+      }
+      if (I >= Info[K].Earliest && I <= MinLength + Info[K].Latest) {
+        dbgs() << "*";
+      } else {
+        dbgs() << " ";
+      }
+    }
+    dbgs() << "\n";
+  }
+}
+
 int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) {
   assert(FirstUnscheduled <= LastUnscheduled);
   while (Info[FirstUnscheduled].Scheduled) {
@@ -476,7 +512,7 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) {
 }
 
 namespace {
-void dumpEarliestChain(const std::vector<NodeInfo> &Info, int N) {
+void dumpEarliestChain(const ScheduleInfo &Info, int N) {
   auto Prev = Info[N].LastEarliestPusher;
   if (Prev) {
     dumpEarliestChain(Info, *Prev);
@@ -515,8 +551,7 @@ bool PostPipeliner::scheduleOtherIterations() {
 
 class DefaultStrategy : public PostPipelinerStrategy {
 public:
-  DefaultStrategy(ScheduleDAGMI &DAG, std::vector<NodeInfo> &Info,
-                  int LatestBias)
+  DefaultStrategy(ScheduleDAGMI &DAG, ScheduleInfo &Info, int LatestBias)
       : PostPipelinerStrategy(DAG, Info, LatestBias) {}
   bool better(const SUnit &A, const SUnit &B) override {
     return Info[A.NodeNum].Latest < Info[B.NodeNum].Latest;
@@ -524,7 +559,12 @@ class DefaultStrategy : public PostPipelinerStrategy {
 };
 
 class ConfigStrategy : public PostPipelinerStrategy {
+protected:
+  int II = 0;
+
+private:
   bool TopDown = true;
+  bool Alternate = false;
 
 public:
   enum PriorityComponent {
@@ -552,6 +592,13 @@ class ConfigStrategy : public PostPipelinerStrategy {
     }
     return "Size - Illegal";
   }
+  struct Configuration {
+    int ExtraStages = 0;
+    bool TopDown = true;
+    bool Alternate = false;
+    int Runs = 0;
+    ArrayRef<PriorityComponent> Components;
+  };
 
 private:
   std::string Name;
@@ -597,6 +644,12 @@ class ConfigStrategy : public PostPipelinerStrategy {
     return false;
   }
 
+  int earliest(const SUnit &N) override { return Info[N.NodeNum].Earliest; }
+
+  int latest(const SUnit &N) override {
+    return Info[N.NodeNum].Latest + LatestBias;
+  }
+
   void selected(const SUnit &N) override {
     // Promote the critical path
     NodeInfo *Pushed = &Info[N.NodeNum];
@@ -628,15 +681,20 @@ class ConfigStrategy : public PostPipelinerStrategy {
         PredSiblingScheduled.insert(PDep.getSUnit()->NodeNum);
       }
     }
+    if (Alternate) {
+      TopDown = !TopDown;
+    }
   }
 
 public:
   std::string name() override { return Name; }
-  ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector<NodeInfo> &Info,
-                 int Length, bool TopDown,
+  ConfigStrategy(ScheduleDAGInstrs &DAG, ScheduleInfo &Info, int Length, int II,
+                 bool TopDown, bool Alternate,
                  ArrayRef<PriorityComponent> Components)
-      : PostPipelinerStrategy(DAG, Info, Length), TopDown(TopDown) {
-    Name = "Config_" + std::to_string(Length) + "_" + std::to_string(TopDown);
+      : PostPipelinerStrategy(DAG, Info, Length), II(II), TopDown(TopDown),
+        Alternate(Alternate) {
+    Name = "Config_" + std::to_string(Length) + "_" + std::to_string(TopDown) +
+           "_" + std::to_string(Alternate);
     for (auto Comp : Components) {
       Name += "_" + getPriorityName(Comp);
       Priority.emplace_back(Comp);
@@ -644,59 +702,50 @@ class ConfigStrategy : public PostPipelinerStrategy {
   }
 };
 
-static const struct {
-  int ExtraStages;
-  bool TopDown;
-  bool Rerun;
-  ConfigStrategy::PriorityComponent Components[3];
-} Strategies[] = {
+static const ConfigStrategy::PriorityComponent
+    NodeNum[] = {ConfigStrategy::NodeNum},
+    Latest[] = {ConfigStrategy::Latest},
+    Critical[] = {ConfigStrategy::Critical},
+    CriticalLCDLatest[] = {ConfigStrategy::Critical, ConfigStrategy::LCDLatest};
+
+static const ConfigStrategy::Configuration Strategies[] = {
     // Loosely speaking, a lower value of the first parameter targets
     // a lower stage count, which benefits code size.
-    // Rerurn is only useful for heuristics that use it, e.g. Critical
-    {1, true, false, {ConfigStrategy::NodeNum}},
-    {1, true, false, {ConfigStrategy::Latest}},
-    {1, true, true, {ConfigStrategy::Critical}},
-    {1, true, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}},
-    {0, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}},
-    {1, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}},
+    // Runs>1 is only useful for heuristics that use it, e.g. Critical
+    // {ExtraStages, TopDown, Alternate, Runs, Components}
+    {1, true, false, 1, NodeNum},
+    {1, true, false, 1, Latest},
+    {1, true, false, 2, Critical},
+    {1, true, false, 2, CriticalLCDLatest},
+    {0, false, false, 2, CriticalLCDLatest},
+    {1, false, false, 2, CriticalLCDLatest},
     // This is pure bottom up
-    {1, false, false, {ConfigStrategy::NodeNum}},
+    {1, false, false, 1, NodeNum},
 };
 
 bool PostPipeliner::tryHeuristics() {
   int MinLength = computeMinScheduleLength();
-
   DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n");
 
   int HeuristicIndex = 0;
-  for (auto &[ExtraStages, TopDown, Rerun, Components] : Strategies) {
+  for (const auto &Config : Strategies) {
     if (Heuristic >= 0 && Heuristic != HeuristicIndex++) {
       continue;
     }
-    ConfigStrategy S(*DAG, Info, MinLength + ExtraStages * II, TopDown,
-                     Components);
+    ConfigStrategy S(*DAG, Info, MinLength + Config.ExtraStages * II, II,
+                     Config.TopDown, Config.Alternate, Config.Components);
     resetSchedule(/*FullReset=*/true);
-    DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << "\n");
-    if (scheduleFirstIteration(S) && scheduleOtherIterations()) {
-      DEBUG_SUMMARY(dbgs() << "    Strategy " << S.name() << " found II=" << II
+    for (int Run = 0; Run < Config.Runs; Run++) {
+      DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << " run=" << Run
                            << "\n");
-      return true;
-    }
-
-    DEBUG_SUMMARY(dbgs() << " failed\n");
-    if (!Rerun) {
-      continue;
-    }
-
-    // Rerun with dynamic information retained
-    resetSchedule(/*FullReset=*/false);
-    DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name()
-                         << " with critical path");
-    if (scheduleFirstIteration(S) && scheduleOtherIterations()) {
-      DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n");
-      return true;
+      if (scheduleFirstIteration(S) && scheduleOtherIterations()) {
+        DEBUG_SUMMARY(dbgs() << "    Strategy " << S.name() << " run=" << Run
+                             << " found II=" << II << "\n");
+        return true;
+      }
+      resetSchedule(/*FullReset=*/false);
     }
-    DEBUG_SUMMARY(dbgs() << " failed\n");
+    DEBUG_SUMMARY(dbgs() << "    Strategy " << S.name() << " failed\n");
   }
   DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n");
   return false;
@@ -707,7 +756,7 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) {
   assert(NTotalInstrs % NInstr == 0);
   NCopies = NTotalInstrs / NInstr;
   if (NCopies == 1) {
-    LLVM_DEBUG(dbgs() << "PostPipeliner: Not feasible\n");
+    LLVM_DEBUG(dbgs() << "PostPipeliner: Not feasible - Too few stages\n");
     return false;
   }
   II = InitiationInterval;
@@ -718,15 +767,19 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) {
   Depth = NCopies * II + HR.getPipelineDepth();
   Scoreboard.reset(Depth);
 
-  Info.clear();
-  Info.resize(NTotalInstrs);
+  Info.init(NInstr, NCopies);
 
   LLVM_DEBUG(for (int I = 0; I < NInstr;
                   I++) { dbgs() << I << " " << *DAG->SUnits[I].getInstr(); });
-  LLVM_DEBUG(dumpGraph(NInstr, Info, DAG));
+  LLVM_DEBUG(dumpGraph(Info, DAG));
 
-  computeLoopCarriedParameters();
+  bool Feasible = computeLoopCarriedParameters();
+  if (!Feasible) {
+    LLVM_DEBUG(dbgs() << "PostPipeliner: Not feasible - RecMII\n");
+    return false;
+  }
 
+  LLVM_DEBUG(dumpIntervals(Info, computeMinScheduleLength()));
   if (!tryHeuristics()) {
     LLVM_DEBUG(dbgs() << "PostPipeliner: No schedule found\n");
     return false;
@@ -735,6 +788,12 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) {
   computeStages();
   LLVM_DEBUG(dbgs() << "PostPipeliner: Schedule found, NS=" << NStages
                     << " II=" << II << "\n");
+  // Let's not risk adding prologue and epilogue noise, but note it would
+  // represent a valid 'regular' loop schedule.
+  if (NStages == 1) {
+    LLVM_DEBUG(dbgs() << "PostPipeliner: Degenerate pipeline, NStages=1\n");
+    return false;
+  }
 
   // Check that we don't exceed the number of copies in the DAG. In that case
   // we didn't reach steady state, and we may have missed conflicts.
@@ -838,4 +897,15 @@ void NodeInfo::reset(bool FullReset) {
   }
 }
 
+void ScheduleInfo::compute() {
+  MinEarliest = NInstr > 0 ? std::numeric_limits<int>::max() : 0;
+  MaxEarliest = 0;
+  MinLatest = -1;
+  for (int K = 0; K < NInstr; K++) {
+    MinEarliest = std::min(MinEarliest, Nodes[K].Earliest);
+    MaxEarliest = std::max(MaxEarliest, Nodes[K].Earliest);
+    MinLatest = std::min(MinLatest, Nodes[K].Latest);
+  }
+}
+
 } // namespace llvm::AIE
diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h
index 5fa8ca8d7f49..c065ef34f348 100644
--- a/llvm/lib/Target/AIE/AIEPostPipeliner.h
+++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h
@@ -90,14 +90,46 @@ class NodeInfo {
   void reset(bool FullReset);
 };
 
+class ScheduleInfo {
+public:
+  // The main administration of the PostPipeliner. Every node represents
+  // an instruction. The array can be indexed with the NodeNum of the
+  // corresponding SUnit.
+  std::vector<NodeInfo> Nodes;
+
+  // The number of instructions in the original loop body (0 before init)
+  int NInstr = 0;
+
+  // Some global statistics over Earliest and Latest of all nodes.
+  // MaxEarliest and MinLatest relate to the length of the schedule.
+  int MaxEarliest = 0;
+  int MinLatest = -1;
+
+  // After propagating Earliest of the second iteration back to the first
+  // iteration, the minimum over Earliest is directly related to the
+  // Recurrence Minimum Initiation Interval.
+  int MinEarliest = 0;
+
+  void init(int NOrig, int NCopies) {
+    NInstr = NOrig;
+    Nodes.clear();
+    Nodes.resize(NInstr * NCopies);
+  }
+
+  NodeInfo &operator[](int N) { return Nodes[N]; }
+  const NodeInfo &operator[](int N) const { return Nodes[N]; }
+  // Recompute the statistics above from the first NInstr nodes
+  void compute();
+};
+
 class PostPipelinerStrategy {
 protected:
   ScheduleDAGInstrs &DAG;
-  std::vector<NodeInfo> &Info;
+  ScheduleInfo &Info;
   int LatestBias = 0;
 
 public:
-  PostPipelinerStrategy(ScheduleDAGInstrs &DAG, std::vector<NodeInfo> &Info,
+  PostPipelinerStrategy(ScheduleDAGInstrs &DAG, ScheduleInfo &Info,
                         int LatestBias)
       : DAG(DAG), Info(Info), LatestBias(LatestBias) {};
   virtual ~PostPipelinerStrategy() {};
@@ -139,9 +171,10 @@ class PostPipeliner {
   int FirstUnscheduled = 0;
   int LastUnscheduled = -1;
 
-  /// Holds the cycle of each SUnit. The following should hold:
+  /// Holds the scheduling information for each instruction. The following
+  /// should hold:
   /// Cycle(N) mod II == Cycle(N % NInstr) mod II
-  std::vector<NodeInfo> Info;
+  ScheduleInfo Info;
 
   // The scoreboard and its depth
   ResourceScoreboard<FuncUnitWrapper> Scoreboard;
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir
new file mode 100644
index 000000000000..7e4dd73fa457
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir
@@ -0,0 +1,231 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+
+# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \
+# RUN:   --debug-only=postpipeliner-summary -o - | FileCheck %s
+
+
+# Derived from conv2d_bf16_0. The register allocation is tweaked manually to 
+# make it pipelinable with II=16
+
+--- |
+  define dso_local void @conv2d(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 {
+  ; CHECK-LABEL: conv2d:
+  ; CHECK:         .p2align 4
+  ; CHECK-NEXT:  // %bb.0: // %entry
+  ; CHECK-NEXT:    mova r1, #0; nopb ; nopxm
+  ; CHECK-NEXT:    ge r1, r1, r0
+  ; CHECK-NEXT:    jnz r1, #.LBB0_4
+  ; CHECK-NEXT:    nop // Delay Slot 5
+  ; CHECK-NEXT:    nop // Delay Slot 4
+  ; CHECK-NEXT:    nop // Delay Slot 3
+  ; CHECK-NEXT:    nop // Delay Slot 2
+  ; CHECK-NEXT:    nop // Delay Slot 1
+  ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+  ; CHECK-NEXT:    vldb wh8, [p0, #32]; nopx
+  ; CHECK-NEXT:    vldb wl8, [p0], m4
+  ; CHECK-NEXT:    vldb wh6, [p0, #32]
+  ; CHECK-NEXT:    vldb wl6, [p0], m4
+  ; CHECK-NEXT:    vldb wh4, [p0, #32]
+  ; CHECK-NEXT:    vldb wl4, [p0], m4; add.nc lc, r0, #-1
+  ; CHECK-NEXT:    vldb wh11, [p0, #32]; movxm ls, #.LBB0_2
+  ; CHECK-NEXT:    vldb.3d wl11, [p0], d1; movxm le, #.L_LEnd0
+  ; CHECK-NEXT:    vldb wh8, [p1, #32]; nopa ; nops ; nopx ; vshift.align x3, x3, s0, x8, r3; nopv
+  ; CHECK-NEXT:    vldb wl8, [p1, #0]; nopa ; nops ; nopx ; mov r5, p0; nopv
+  ; CHECK-NEXT:    vldb wh6, [p1, #96]; nopa ; nops ; and r4, r5, r0; vshift.align x2, x2, s0, x6, r3; nopv
+  ; CHECK-NEXT:    vldb wl6, [p1, #64]; nopa ; nops ; nopx ; vshuffle x9, x3, x2, r11; nopv
+  ; CHECK-NEXT:    nopb ; vlda wh10, [p1, #160]; nops ; nopx ; vshift.align x1, x1, s0, x4, r3; nopv
+  ; CHECK-NEXT:    nopb ; vlda wl10, [p1, #128]; nops ; nopx ; vshuffle x4, x3, x2, r12; nopv
+  ; CHECK-NEXT:    vlda wh7, [p1, #224]; add r3, r4, #34; vshift.align x0, x0, s0, x11, r3
+  ; CHECK-NEXT:    vlda wl7, [p1, #192]; paddb [p1], #256; vshuffle x11, x1, x0, r11
+  ; CHECK-NEXT:    vshuffle x5, x11, x9, r12; vmac.f bmh0, bmh0, x4, x8, r31
+  ; CHECK-NEXT:    vshuffle x11, x11, x9, r15
+  ; CHECK-NEXT:    vshuffle x9, x1, x0, r26; vmac.f bmh3, bmh3, x4, x6, r31
+  ; CHECK-NEXT:    vmac.f bmh6, bmh6, x5, x6, r31
+  ; CHECK-NEXT:    vmac.f bml3, bml3, x9, x8, r31
+  ; CHECK-NEXT:    .p2align 4
+  ; CHECK-NEXT:  .LBB0_2: // %for.body
+  ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+  ; CHECK-NEXT:    vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x9, x6, r31
+  ; CHECK-NEXT:    nopa ; vldb wl8, [p0], m4; nopxm ; vmac.f bmh5, bmh5, x9, x10, r31
+  ; CHECK-NEXT:    vldb wh6, [p0, #32]; vmac.f bml0, bml0, x9, x7, r31
+  ; CHECK-NEXT:    vldb wl6, [p0], m4; vmac.f bmh4, bmh4, x4, x7, r31
+  ; CHECK-NEXT:    vldb wh4, [p0, #32]; vmac.f bml1, bml1, x4, x10, r31
+  ; CHECK-NEXT:    vldb wl4, [p0], m4; vmac.f bmh2, bmh2, x5, x8, r31
+  ; CHECK-NEXT:    vldb wh11, [p0, #32]; vmac.f bml4, bml4, x11, x8, r31
+  ; CHECK-NEXT:    vldb.3d wl11, [p0], d1; vmac.f bml6, bml6, x11, x6, r31
+  ; CHECK-NEXT:    vldb wh8, [p1, #32]; vshift.align x3, x3, s0, x8, r3; vmac.f bml2, bml2, x5, x10, r31
+  ; CHECK-NEXT:    vldb wl8, [p1, #0]; mov r5, p0; vmac.f bmh1, bmh1, x5, x7, r31
+  ; CHECK-NEXT:    vldb wh6, [p1, #96]; and r4, r5, r0; vshift.align x2, x2, s0, x6, r3; vmac.f bmh7, bmh7, x11, x7, r31
+  ; CHECK-NEXT:    vldb wl6, [p1, #64]; vshuffle x9, x3, x2, r11; vmac.f bmh8, bmh8, x11, x10, r31
+  ; CHECK-NEXT:    vlda wh10, [p1, #160]; vshift.align x1, x1, s0, x4, r3
+  ; CHECK-NEXT:    vlda wl10, [p1, #128]; vshuffle x4, x3, x2, r12
+  ; CHECK-NEXT:    vlda wh7, [p1, #224]; add r3, r4, #34; vshift.align x0, x0, s0, x11, r3
+  ; CHECK-NEXT:    vlda wl7, [p1, #192]; paddb [p1], #256; vshuffle x11, x1, x0, r11
+  ; CHECK-NEXT:    vshuffle x5, x11, x9, r12; vmac.f bmh0, bmh0, x4, x8, r31
+  ; CHECK-NEXT:    vshuffle x11, x11, x9, r15
+  ; CHECK-NEXT:    vshuffle x9, x1, x0, r26; vmac.f bmh3, bmh3, x4, x6, r31
+  ; CHECK-NEXT:    vmac.f bmh6, bmh6, x5, x6, r31
+  ; CHECK-NEXT:  .L_LEnd0:
+  ; CHECK-NEXT:    nopb ; nopa ; nops ; nopxm ; vmac.f bml3, bml3, x9, x8, r31
+  ; CHECK-NEXT:  // %bb.3: // %for.cond.cleanup
+  ; CHECK-NEXT:    nopb ; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x9, x6, r31
+  ; CHECK-NEXT:    vmac.f bmh5, bmh5, x9, x10, r31
+  ; CHECK-NEXT:    vmac.f bml0, bml0, x9, x7, r31
+  ; CHECK-NEXT:    vmac.f bmh4, bmh4, x4, x7, r31
+  ; CHECK-NEXT:    vmac.f bml1, bml1, x4, x10, r31
+  ; CHECK-NEXT:    vmac.f bmh2, bmh2, x5, x8, r31
+  ; CHECK-NEXT:    vmac.f bml4, bml4, x11, x8, r31
+  ; CHECK-NEXT:    vmac.f bml6, bml6, x11, x6, r31
+  ; CHECK-NEXT:    vmac.f bml2, bml2, x5, x10, r31
+  ; CHECK-NEXT:    vmac.f bmh1, bmh1, x5, x7, r31
+  ; CHECK-NEXT:    vmac.f bmh7, bmh7, x11, x7, r31
+  ; CHECK-NEXT:    vmac.f bmh8, bmh8, x11, x10, r31
+  ; CHECK-NEXT:    nopx
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    .p2align 4
+  ; CHECK-NEXT:  .LBB0_4: // %for.cond.cleanup
+  ; CHECK-NEXT:    nopa ; ret lr
+  ; CHECK-NEXT:    nop // Delay Slot 5
+  ; CHECK-NEXT:    nop // Delay Slot 4
+  ; CHECK-NEXT:    nop // Delay Slot 3
+  ; CHECK-NEXT:    nop // Delay Slot 2
+  ; CHECK-NEXT:    nop // Delay Slot 1
+  entry:
+    %cmp5 = icmp sgt i32 %n, 0
+    br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:                               ; preds = %entry
+    call void @llvm.set.loop.iterations.i32(i32 %n)
+    br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.body, %entry
+    ret void
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %p5 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ]
+    %p6 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ]
+    %0 = load i32, ptr addrspace(6) %p6, align 4, !tbaa !2
+    %add = add nsw i32 %0, 1
+    store i32 %add, ptr addrspace(5) %p5, align 4, !tbaa !2
+    %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p5, i20 1
+    %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %p6, i20 1
+    %1 = call i1 @llvm.loop.decrement.i32(i32 1)
+    br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6
+  }
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare void @llvm.set.loop.iterations.i32(i32) #1
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare i1 @llvm.loop.decrement.i32(i32) #1
+
+  attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+  attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn }
+
+  !llvm.module.flags = !{!0}
+  !llvm.ident = !{!1}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"}
+  !2 = !{!3, !3, i64 0}
+  !3 = !{!"int", !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C/C++ TBAA"}
+  !6 = distinct !{!6, !7, !8}
+  !7 = !{!"llvm.loop.mustprogress"}
+  !8 = !{!"llvm.loop.itercount.range", i64 10}
+
+...
+---
+name:            conv2d
+alignment:       16
+tracksRegLiveness: true
+body:             |
+  bb.0.entry (align 16):
+    successors: %bb.1(0x50000000), %bb.3(0x30000000)
+    liveins: $p0, $p1, $r0
+
+    $r1 = MOV_RLC_imm10_pseudo 0
+    $r1 = GE $r1, $r0
+    JNZ $r1, %bb.3
+    DelayedSchedBarrier
+
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+    liveins: $p0, $p1, $r0
+
+    $lc = ADD_NC $r0, 0
+    $ls = MOVXM_lng_cg %bb.2
+    $le = MOVXM_lng_cg <mcsymbol .L_LEnd0>
+
+  bb.2.for.body (align 16):
+    liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x1, $x2, $x3, $x4, $x6, $d1_3d:0x000000000003C870, $dn3, $dn0, $dn4, $dj4
+
+    $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5)
+    $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.p5, addrspace 5)
+    $x3 = VSHIFT_ALIGN $x3, $s0, $x8, $r3
+    $wh6 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5)
+    $wl6, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.p5, addrspace 5)
+    $x2 = VSHIFT_ALIGN $x2, $s0, $x6, $r3
+    $wh4 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5)
+    $wl4, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.p5, addrspace 5)
+    $wh11 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5)
+    $wl11, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.p5, addrspace 5)
+    $x1 = VSHIFT_ALIGN $x1, $s0, $x4, $r3
+    $x0 = VSHIFT_ALIGN $x0, $s0, $x11, $r3
+    $wh8 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5)
+    $wl8 = VLD_idx_imm_3x32_pseudo $p1, 0 :: (load (<16 x s16>) from %ir.p5, addrspace 5)
+    $wh6 = VLD_idx_imm_3x32_pseudo $p1, 96 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5)
+    $wl6 = VLD_idx_imm_3x32_pseudo $p1, 64 :: (load (<16 x s16>) from %ir.p5, addrspace 5)
+
+    $x4 = VSHUFFLE $x3, $x2, $r12
+    $x9 = VSHUFFLE $x3, $x2, $r11
+    $x11 = VSHUFFLE $x1, $x0, $r11
+    $x5 = VSHUFFLE $x11, $x9, $r12
+    $x11 = VSHUFFLE $x11, $x9, $r15
+    $x9 = VSHUFFLE $x1, $x0, $r26
+
+    $wh10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 160 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5)
+    $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 128 :: (load (<16 x s16>) from %ir.p5, addrspace 5)
+    $wh7 = VLDA_dmw_lda_w_ag_idx_imm $p1, 224 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5)
+    $wl7 = VLDA_dmw_lda_w_ag_idx_imm $p1, 192 :: (load (<16 x s16>) from %ir.p5, addrspace 5)
+    $p1 = nuw PADD_imm9_pseudo $p1, 256
+
+    $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x9, $x8, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x8, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x4, $x6, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x6, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x9, $x10, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x9, $x7, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x4, $x7, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x4, $x10, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x5, $x6, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x8, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x11, $x8, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x11, $x6, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x5, $x10, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x5, $x7, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x11, $x7, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x11, $x10, $r31, implicit-def $srfpflags, implicit $crfpmask
+    $r5 = MOV_mv_scl $p0
+    $r4 = AND $r5, $r0
+    $r3 = nuw nsw ADD_add_r_ri $r4, 34, implicit-def $srcarry
+    PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2
+
+  bb.3.for.cond.cleanup (align 16):
+    RET implicit $lr
+    DelayedSchedBarrier
+
+...
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir
index a5dae2d34a2a..655cdee89a7a 100644
--- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir
@@ -29,13 +29,13 @@
   ; CHECK-NEXT:    .p2align 4
   ; CHECK-NEXT:  .LBB0_1: // %for.body
   ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-  ; CHECK-NEXT:    nopb ; lda r0, [p2, #0]; nops ; nopx ; mov p2, p1; nopv
-  ; CHECK-NEXT:    nopa ; nopx
-  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nopb ; lda r0, [p2, #0]; nops ; nopxm ; nopv
+  ; CHECK-NEXT:    nopx
   ; CHECK-NEXT:    nop
   ; CHECK-NEXT:    nop
   ; CHECK-NEXT:    nop
   ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    mov p2, p1
   ; CHECK-NEXT:  .L_LEnd0:
   ; CHECK-NEXT:    nopb ; nopa ; st r0, [p0, #0]; nopxm ; nopv
   ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup