From 99d82771b37b008657acef993a9d72ef4b0ed588 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Fri, 4 Oct 2024 15:32:23 +0100 Subject: [PATCH] [AIEX] Basic heuristics for scheduling loops with LCDs --- llvm/lib/Target/AIE/AIEInterBlockScheduling.h | 1 + llvm/lib/Target/AIE/AIEMachineScheduler.cpp | 51 +++++++++++++++++++ .../CodeGen/AIE/aie2/end-to-end/Add2D-red.ll | 24 +++++---- 3 files changed, 65 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index 0ff32e68b665..c1d69c605ab9 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -100,6 +100,7 @@ class InterBlockEdges { /// Retrieve the SUnit that represents MI's instance before the /// boundary, null if not found. const SUnit *getPreBoundaryNode(MachineInstr *MI) const; + /// Check whether SU represents an instruction after the boundary bool isPostBoundaryNode(SUnit *SU) const; }; diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index dee5fa2df4d0..5b27c303933f 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -83,6 +83,10 @@ static cl::opt<bool> InterBlockAlignment("aie-interblock-alignment", cl::init(true), cl::desc("Allow for alignment of successor blocks")); +static cl::opt<bool> UseLoopHeuristics( + "aie-loop-sched-heuristics", cl::init(true), + cl::desc("Use special picking heuristics when scheduling a loop region")); + namespace { // A sentinel value to represent an unknown SUnit. const constexpr unsigned UnknownSUNum = ~0; @@ -694,6 +698,30 @@ void AIEPostRASchedStrategy::handleRegionConflicts( } } +/// The earliest use of this instruction in the next iteration. +/// Note that we reason with "bottom-up" cycle, so a larger cycle means it's +/// used earlier in topological order. 
If the SU has no loop-carried dependency, +/// this will be INT_MAX. NOTE(review): tryCandidate adds to it - may overflow. +static int getEarliestLoopCarriedUse(const SUnit &SU, + const InterBlockEdges &LoopEdges) { + const SUnit *SUInCurrentIteration = + LoopEdges.getPreBoundaryNode(SU.getInstr()); + assert(SUInCurrentIteration); + assert(SUInCurrentIteration->getHeight() >= SU.getHeight()); + + // Look at loop-carried dependencies to see how early the instruction will be + // needed in the next iteration. + int EarliestCycle = std::numeric_limits<int>::max(); + for (const SDep &Succ : SUInCurrentIteration->Succs) { + if (!LoopEdges.isPostBoundaryNode(Succ.getSUnit())) + continue; + + EarliestCycle = std::min(EarliestCycle, int(Succ.getSUnit()->getHeight())); + } + + return EarliestCycle; +} + /// Apply a set of heuristics to a new candidate for PostRA scheduling. /// /// \param Cand provides the policy and current best candidate. @@ -739,6 +767,29 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, return TryCand.Reason != NoCand; } + // Special heuristics for loops. + // Note that they aren't used for the first fixpoint iteration: this is + // currently a workaround because we want a very optimistic schedule in that + // first iteration. That is because it decides the slot assignments for + // multi-slot instructions. This rule can probably be deleted once the + // loop-aware scheduler knows how to reassign those. + const BlockState &BS = getInterBlock().getBlockState(CurMBB); + if (UseLoopHeuristics && BS.Kind == AIE::BlockType::Loop && + BS.getRegions().size() == 1 && BS.FixPoint.NumIters > 0) { + const InterBlockEdges &LoopEdges = BS.getBoundaryEdges(); + + // For instructions with equal dependence chains, prioritize scheduling + // instructions that are used later in the next iteration. The point is + // to teach our heuristics a tiny bit about LCDs. 
+ if (tryLess(getEarliestLoopCarriedUse(*TryCand.SU, LoopEdges) + + TryCand.SU->BotReadyCycle, + getEarliestLoopCarriedUse(*Cand.SU, LoopEdges) + + Cand.SU->BotReadyCycle, + TryCand, Cand, BotPathReduce)) { + return TryCand.Reason != NoCand; + } + } + // Prefer the instruction whose dependent chain is estimated to // finish executing later. This can help reducing the overall height // of the region. diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll index e21e9c38b183..8ba21e031402 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll @@ -124,19 +124,21 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_3: // %for.body ; ASM-NEXT: // =>This Inner Loop Header: Depth=1 -; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; nopb ; nopxm ; nops -; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0 -; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1; vst.srs.d8.s32 cm7, s0, [p3], #32; vadd cm3, cm4, cm3, r0 -; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vadd cm5, cm6, cm5, r0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r1, r1, #-4; vadd cm7, cm1, cm0, r0 -; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; jnz r1, #.LBB0_3 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0 // Delay Slot 5 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm8, s0, [p3], #32 // Delay Slot 4 +; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm2, s1, [p1], m1; nops ; nopxm ; nopv +; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; nopx ; vadd cm5, cm6, cm5, r0 +; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r1, r1, #-4 +; ASM-NEXT: vst.srs.d8.s32 cm7, s0, [p3], #32; jnz r1, #.LBB0_3; vadd cm3, cm4, cm3, r0 +; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm8, s0, [p3], #32 // Delay Slot 5 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, 
[p2], d0 // Delay Slot 4 ; ASM-NEXT: vst.srs.d8.s32 cm5, s0, [p3], #32 // Delay Slot 3 -; ASM-NEXT: nop // Delay Slot 2 -; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm8, cm6, cm2, r0 // Delay Slot 1 +; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; vadd cm7, cm1, cm0, r0 // Delay Slot 2 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm8, cm6, cm2, r0 // Delay Slot 1 ; ASM-NEXT: // %bb.4: -; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv +; ASM-NEXT: nopa ; nopxm +; ASM-NEXT: nop +; ASM-NEXT: nop +; ASM-NEXT: nop ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_5: ; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; vadd cm5, cm6, cm5, r0