From 99d82771b37b008657acef993a9d72ef4b0ed588 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu@amd.com>
Date: Fri, 4 Oct 2024 15:32:23 +0100
Subject: [PATCH] [AIEX] Basic heuristics for scheduling loops with LCDs

---
 llvm/lib/Target/AIE/AIEInterBlockScheduling.h |  1 +
 llvm/lib/Target/AIE/AIEMachineScheduler.cpp   | 51 +++++++++++++++++++
 .../CodeGen/AIE/aie2/end-to-end/Add2D-red.ll  | 24 +++++----
 3 files changed, 65 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
index 0ff32e68b665..c1d69c605ab9 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h
@@ -100,6 +100,7 @@ class InterBlockEdges {
   /// Retrieve the SUnit that represents MI's instance before the
   /// boundary, null if not found.
   const SUnit *getPreBoundaryNode(MachineInstr *MI) const;
+
   /// Check whether SU represents an instruction after the boundary
   bool isPostBoundaryNode(SUnit *SU) const;
 };
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index dee5fa2df4d0..5b27c303933f 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -83,6 +83,10 @@ static cl::opt<bool>
     InterBlockAlignment("aie-interblock-alignment", cl::init(true),
                         cl::desc("Allow for alignment of successor blocks"));
 
+static cl::opt<bool> UseLoopHeuristics(
+    "aie-loop-sched-heuristics", cl::init(true),
+    cl::desc("Use special picking heuristics when scheduling a loop region"));
+
 namespace {
 // A sentinel value to represent an unknown SUnit.
 const constexpr unsigned UnknownSUNum = ~0;
@@ -694,6 +698,30 @@ void AIEPostRASchedStrategy::handleRegionConflicts(
   }
 }
 
+/// Return the smallest height among the loop-carried users of \p SU, i.e. the
+/// successors of its pre-boundary node in \p LoopEdges that lie after the
+/// boundary, or std::numeric_limits<int>::max() if there are none.
+/// NOTE(review): heights are "bottom-up" cycles, so a larger one means earlier
+/// in topological order; confirm that taking the minimum is the intended
+/// "earliest use". Also confirm that callers adding an offset to the result
+/// handle the max() sentinel without wrapping.
+static int getEarliestLoopCarriedUse(const SUnit &SU,
+                                     const InterBlockEdges &LoopEdges) {
+  const SUnit *SUInCurrentIteration =
+      LoopEdges.getPreBoundaryNode(SU.getInstr());
+  assert(SUInCurrentIteration && "SU has no pre-boundary node in LoopEdges");
+  assert(SUInCurrentIteration->getHeight() >= SU.getHeight() &&
+         "Pre-boundary node must be at least as high as SU");
+  int EarliestCycle = std::numeric_limits<int>::max();
+  for (const SDep &Succ : SUInCurrentIteration->Succs) {
+    // Only successors after the boundary count as loop-carried uses.
+    if (LoopEdges.isPostBoundaryNode(Succ.getSUnit()))
+      EarliestCycle = std::min(EarliestCycle,
+                               static_cast<int>(Succ.getSUnit()->getHeight()));
+  }
+  return EarliestCycle;
+}
+
 /// Apply a set of heuristics to a new candidate for PostRA scheduling.
 ///
 /// \param Cand provides the policy and current best candidate.
@@ -739,6 +767,29 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
       return TryCand.Reason != NoCand;
     }
 
+    // Special heuristics for loops.
+    // Note that they aren't used for the first fixpoint iteration: this is
+    // currently a workaround because we want a very optimistic schedule in that
+    // first iteration. That is because it decides the slot assignments for
+    // multi-slot instructions. This rule can probably be deleted once the
+    // loop-aware scheduler knows how to reassign those.
+    const BlockState &BS = getInterBlock().getBlockState(CurMBB);
+    if (UseLoopHeuristics && BS.Kind == AIE::BlockType::Loop &&
+        BS.getRegions().size() == 1 && BS.FixPoint.NumIters > 0) {
+      const InterBlockEdges &LoopEdges = BS.getBoundaryEdges();
+
+      // For instructions with equal dependence chains, prioritize scheduling
+      // instructions that are used later in the next iteration. The point is
+      // to teach our heuristics a tiny bit about loop-carried dependencies.
+      if (tryLess(getEarliestLoopCarriedUse(*TryCand.SU, LoopEdges) +
+                      TryCand.SU->BotReadyCycle,
+                  getEarliestLoopCarriedUse(*Cand.SU, LoopEdges) +
+                      Cand.SU->BotReadyCycle,
+                  TryCand, Cand, BotPathReduce)) {
+        return TryCand.Reason != NoCand;
+      }
+    }
+
     // Prefer the instruction whose dependent chain is estimated to
     // finish executing later. This can help reducing the overall height
     // of the region.
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
index e21e9c38b183..8ba21e031402 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
@@ -124,19 +124,21 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
 ; ASM-NEXT:    .p2align 4
 ; ASM-NEXT:  .LBB0_3: // %for.body
 ; ASM-NEXT:    // =>This Inner Loop Header: Depth=1
-; ASM-NEXT:    vlda.ups.s32.d8 cm2, s1, [p1], m1; nopb ; nopxm ; nops
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm6, s1, [p2], d0
-; ASM-NEXT:    vlda.ups.s32.d8 cm5, s1, [p1], m1; vst.srs.d8.s32 cm7, s0, [p3], #32; vadd cm3, cm4, cm3, r0
-; ASM-NEXT:    vlda.ups.s32.d8 cm3, s1, [p1], m1; vadd cm5, cm6, cm5, r0
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r1, r1, #-4; vadd cm7, cm1, cm0, r0
-; ASM-NEXT:    vlda.ups.s32.d8 cm0, s1, [p1], m1; jnz r1, #.LBB0_3
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm4, s1, [p2], d0 // Delay Slot 5
-; ASM-NEXT:    vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm8, s0, [p3], #32 // Delay Slot 4
+; ASM-NEXT:    nopb ; vlda.ups.s32.d8 cm2, s1, [p1], m1; nops ; nopxm ; nopv
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; nopx ; vadd cm5, cm6, cm5, r0
+; ASM-NEXT:    vlda.ups.s32.d8 cm5, s1, [p1], m1
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r1, r1, #-4
+; ASM-NEXT:    vst.srs.d8.s32 cm7, s0, [p3], #32; jnz r1, #.LBB0_3; vadd cm3, cm4, cm3, r0
+; ASM-NEXT:    vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm8, s0, [p3], #32 // Delay Slot 5
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm4, s1, [p2], d0 // Delay Slot 4
 ; ASM-NEXT:    vst.srs.d8.s32 cm5, s0, [p3], #32 // Delay Slot 3
-; ASM-NEXT:    nop // Delay Slot 2
-; ASM-NEXT:    vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm8, cm6, cm2, r0 // Delay Slot 1
+; ASM-NEXT:    vlda.ups.s32.d8 cm0, s1, [p1], m1; vadd cm7, cm1, cm0, r0 // Delay Slot 2
+; ASM-NEXT:    vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm8, cm6, cm2, r0 // Delay Slot 1
 ; ASM-NEXT:  // %bb.4:
-; ASM-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
+; ASM-NEXT:    nopa ; nopxm
+; ASM-NEXT:    nop
+; ASM-NEXT:    nop
+; ASM-NEXT:    nop
 ; ASM-NEXT:    .p2align 4
 ; ASM-NEXT:  .LBB0_5:
 ; ASM-NEXT:    nopb ; nopa ; nops ; nopxm ; vadd cm5, cm6, cm5, r0