diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 024ecbd09b48..fb5431c74242 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -58,6 +58,10 @@ static cl::opt PostPipelinerMaxII( "aie-postpipeliner-maxii", cl::init(40), cl::desc("[AIE] Maximum II to be tried in the post-ra pipeliner")); +static cl::opt PostPipelinerMaxTryII( + "aie-postpipeliner-maxtry-ii", cl::init(10), + cl::desc("[AIE] Maximum II steps to be tried in the post-ra pipeliner")); + namespace llvm::AIE { void dumpInterBlock(const InterBlockEdges &Edges) { @@ -600,6 +604,7 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) { auto &PostSWP = BS.getPostSWP(); if (PostSWP.canAccept(*BS.TheBlock)) { BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock); + BS.FixPoint.IITries = 1; return BS.FixPoint.Stage = SchedulingStage::Pipelining; } } @@ -614,7 +619,8 @@ SchedulingStage InterBlockScheduling::updatePipelining(BlockState &BS) { // Otherwise try a larger II. // We cut off at larger IIs to prevent excessive compilation time. - if (++BS.FixPoint.II <= PostPipelinerMaxII) { + if (++BS.FixPoint.II <= PostPipelinerMaxII && + ++BS.FixPoint.IITries <= PostPipelinerMaxTryII) { return BS.FixPoint.Stage = SchedulingStage::Pipelining; } diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index 25da239326c0..5b9b293031b4 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -148,6 +148,8 @@ class FixedpointState { int ResourceMargin = 0; // The II of the modulo schedule we are trying. 
int II = 0; + // The number of II steps we've made from the minimum + int IITries = 0; // Results from the convergence test int MaxLatencyExtent = 0; int MaxResourceExtent = 0; diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 48f194d0878b..13368dff39c1 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -17,6 +17,8 @@ #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/MathExtras.h" +#include +#include #define DEBUG_TYPE "postpipeliner" #define DEBUG_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X) @@ -200,6 +202,11 @@ void PostPipeliner::computeForward() { for (int K = 0; K < NInstr; K++) { auto &Me = Info[K]; SUnit &SU = DAG->SUnits[K]; + Me.Slots = getSlotCounts(*SU.getInstr(), TII); + // Accumulate the slots of Me and all data predecessors. + SlotCounts Slots(Me.Slots); + int PredEarliest = std::numeric_limits::max(); + int Count = 0; for (auto &Dep : SU.Preds) { if (Dep.getKind() != SDep::Data) { continue; @@ -208,10 +215,18 @@ void PostPipeliner::computeForward() { assert(P < K); Me.Ancestors.insert(P); auto &Pred = Info[P]; + Slots += Pred.Slots; + Count++; + PredEarliest = std::min(PredEarliest, Pred.Earliest); for (int Anc : Pred.Ancestors) { Me.Ancestors.insert(Anc); } } + // When we need more slots than we have data predecessors, we have local + // resource contention that we can safely account for in Earliest. 
+ if (Count > 0 && Slots.max() > Count) { + Me.Earliest = std::max(Me.Earliest, PredEarliest + Slots.max() - 1); + } for (auto &Dep : SU.Succs) { auto *Succ = Dep.getSUnit(); if (Succ->isBoundaryNode()) { @@ -221,7 +236,6 @@ void PostPipeliner::computeForward() { const int NewEarliest = Me.Earliest + Dep.getSignedLatency(); SInfo.Earliest = std::max(SInfo.Earliest, NewEarliest); } - Me.Slots = getSlotCounts(*SU.getInstr(), TII); } } @@ -323,11 +337,15 @@ bool PostPipeliner::computeLoopCarriedParameters() { } // Save the static values for ease of reset - for (auto &N : Info) { + for (auto &N : Info.Nodes) { N.StaticEarliest = N.Earliest; N.StaticLatest = N.Latest; } - return true; + Info.compute(); + + // If no node can be scheduled in cycle 0, we must have a circuit that + // is longer than II + return Info.MinEarliest == 0; } int PostPipeliner::computeMinScheduleLength() const { @@ -343,24 +361,23 @@ int PostPipeliner::computeMinScheduleLength() const { return MinLength; } -void dumpGraph(int NInstr, const std::vector &Info, - ScheduleDAGInstrs *DAG) { +void dumpGraph(const ScheduleInfo &Info, ScheduleDAGInstrs *DAG) { dbgs() << "digraph {\n"; - for (int K = 0; K < NInstr; K++) { + for (int K = 0; K < Info.NInstr; K++) { auto &SU = DAG->SUnits[K]; for (auto &Dep : SU.Succs) { auto *Succ = Dep.getSUnit(); int S = Succ->NodeNum; - if (S % NInstr == K) { + if (S % Info.NInstr == K || Succ->isBoundaryNode()) { continue; } dbgs() << "\tSU" << K << " -> " << "SU" << S; - if (S >= NInstr) { - dbgs() << "_" << S % NInstr; + if (S >= Info.NInstr) { + dbgs() << "_" << S % Info.NInstr; } if (Dep.getKind() == SDep::Data) { dbgs() << " [color=red] "; @@ -380,6 +397,25 @@ void dumpGraph(int NInstr, const std::vector &Info, dbgs() << "}\n"; } +void dumpIntervals(const ScheduleInfo &Info, int MinLength) { + dbgs() << "Intervals:\n"; + for (int K = 0; K < Info.NInstr; K++) { + std::string Head = "SU" + std::to_string(K); + dbgs() << Head; + for (int I = Head.length() - 6; I < 
MinLength; I++) { + if (I == 0) { + dbgs() << "|"; + } + if (I >= Info[K].Earliest && I <= MinLength + Info[K].Latest) { + dbgs() << "*"; + } else { + dbgs() << " "; + } + } + dbgs() << "\n"; + } +} + int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) { assert(FirstUnscheduled <= LastUnscheduled); while (Info[FirstUnscheduled].Scheduled) { @@ -476,7 +512,7 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { } namespace { -void dumpEarliestChain(const std::vector &Info, int N) { +void dumpEarliestChain(const ScheduleInfo &Info, int N) { auto Prev = Info[N].LastEarliestPusher; if (Prev) { dumpEarliestChain(Info, *Prev); @@ -515,8 +551,7 @@ bool PostPipeliner::scheduleOtherIterations() { class DefaultStrategy : public PostPipelinerStrategy { public: - DefaultStrategy(ScheduleDAGMI &DAG, std::vector &Info, - int LatestBias) + DefaultStrategy(ScheduleDAGMI &DAG, ScheduleInfo &Info, int LatestBias) : PostPipelinerStrategy(DAG, Info, LatestBias) {} bool better(const SUnit &A, const SUnit &B) override { return Info[A.NodeNum].Latest < Info[B.NodeNum].Latest; @@ -524,7 +559,12 @@ class DefaultStrategy : public PostPipelinerStrategy { }; class ConfigStrategy : public PostPipelinerStrategy { +protected: + int II = 0; + +private: bool TopDown = true; + bool Alternate = false; public: enum PriorityComponent { @@ -552,6 +592,13 @@ class ConfigStrategy : public PostPipelinerStrategy { } return "Size - Illegal"; } + struct Configuration { + int ExtraStages = 0; + bool TopDown = true; + bool Alternate = false; + int Runs = 0; + ArrayRef Components; + }; private: std::string Name; @@ -597,6 +644,12 @@ class ConfigStrategy : public PostPipelinerStrategy { return false; } + int earliest(const SUnit &N) override { return Info[N.NodeNum].Earliest; } + + int latest(const SUnit &N) override { + return Info[N.NodeNum].Latest + LatestBias; + } + void selected(const SUnit &N) override { // Promote the critical path NodeInfo *Pushed = &Info[N.NodeNum]; 
@@ -628,15 +681,20 @@ class ConfigStrategy : public PostPipelinerStrategy { PredSiblingScheduled.insert(PDep.getSUnit()->NodeNum); } } + if (Alternate) { + TopDown = !TopDown; + } } public: std::string name() override { return Name; } - ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, - int Length, bool TopDown, + ConfigStrategy(ScheduleDAGInstrs &DAG, ScheduleInfo &Info, int Length, int II, + bool TopDown, bool Alternate, ArrayRef Components) - : PostPipelinerStrategy(DAG, Info, Length), TopDown(TopDown) { - Name = "Config_" + std::to_string(Length) + "_" + std::to_string(TopDown); + : PostPipelinerStrategy(DAG, Info, Length), II(II), TopDown(TopDown), + Alternate(Alternate) { + Name = "Config_" + std::to_string(Length) + "_" + std::to_string(TopDown) + + "_" + std::to_string(Alternate); for (auto Comp : Components) { Name += "_" + getPriorityName(Comp); Priority.emplace_back(Comp); @@ -644,59 +702,50 @@ class ConfigStrategy : public PostPipelinerStrategy { } }; -static const struct { - int ExtraStages; - bool TopDown; - bool Rerun; - ConfigStrategy::PriorityComponent Components[3]; -} Strategies[] = { +static const ConfigStrategy::PriorityComponent + NodeNum[] = {ConfigStrategy::NodeNum}, + Latest[] = {ConfigStrategy::Latest}, + Critical[] = {ConfigStrategy::Critical}, + CriticalLCDLatest[] = {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}; + +static const ConfigStrategy::Configuration Strategies[] = { // Loosely speaking, a lower value of the first parameter targets // a lower stage count, which benefits code size. - // Rerurn is only useful for heuristics that use it, e.g. 
Critical - {1, true, false, {ConfigStrategy::NodeNum}}, - {1, true, false, {ConfigStrategy::Latest}}, - {1, true, true, {ConfigStrategy::Critical}}, - {1, true, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, - {0, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, - {1, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + // Runs>1 is only useful for heuristics that use it, e.g. Critical + // {ExtraStages, TopDown, Alternate, Runs, Components} + {1, true, false, 1, NodeNum}, + {1, true, false, 1, Latest}, + {1, true, false, 2, Critical}, + {1, true, false, 2, CriticalLCDLatest}, + {0, false, false, 2, CriticalLCDLatest}, + {1, false, false, 2, CriticalLCDLatest}, // This is pure bottom up - {1, false, false, {ConfigStrategy::NodeNum}}, + {1, false, false, 1, NodeNum}, }; bool PostPipeliner::tryHeuristics() { int MinLength = computeMinScheduleLength(); - DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); int HeuristicIndex = 0; - for (auto &[ExtraStages, TopDown, Rerun, Components] : Strategies) { + for (const auto &Config : Strategies) { if (Heuristic >= 0 && Heuristic != HeuristicIndex++) { continue; } - ConfigStrategy S(*DAG, Info, MinLength + ExtraStages * II, TopDown, - Components); + ConfigStrategy S(*DAG, Info, MinLength + Config.ExtraStages * II, II, + Config.TopDown, Config.Alternate, Config.Components); resetSchedule(/*FullReset=*/true); - DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << "\n"); - if (scheduleFirstIteration(S) && scheduleOtherIterations()) { - DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " found II=" << II + for (int Run = 0; Run < Config.Runs; Run++) { + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << " run=" << Run << "\n"); - return true; - } - - DEBUG_SUMMARY(dbgs() << " failed\n"); - if (!Rerun) { - continue; - } - - // Rerun with dynamic information retained - resetSchedule(/*FullReset=*/false); - DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() - << 
" with critical path"); - if (scheduleFirstIteration(S) && scheduleOtherIterations()) { - DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n"); - return true; + if (scheduleFirstIteration(S) && scheduleOtherIterations()) { + DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " run=" << Run + << " found II=" << II << "\n"); + return true; + } + resetSchedule(/*FullReset=*/false); } - DEBUG_SUMMARY(dbgs() << " failed\n"); + DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " failed\n"); } DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n"); return false; @@ -707,7 +756,7 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { assert(NTotalInstrs % NInstr == 0); NCopies = NTotalInstrs / NInstr; if (NCopies == 1) { - LLVM_DEBUG(dbgs() << "PostPipeliner: Not feasible\n"); + LLVM_DEBUG(dbgs() << "PostPipeliner: Not feasible - Too few stages\n"); return false; } II = InitiationInterval; @@ -718,15 +767,19 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { Depth = NCopies * II + HR.getPipelineDepth(); Scoreboard.reset(Depth); - Info.clear(); - Info.resize(NTotalInstrs); + Info.init(NInstr, NCopies); LLVM_DEBUG(for (int I = 0; I < NInstr; I++) { dbgs() << I << " " << *DAG->SUnits[I].getInstr(); }); - LLVM_DEBUG(dumpGraph(NInstr, Info, DAG)); + LLVM_DEBUG(dumpGraph(Info, DAG)); - computeLoopCarriedParameters(); + bool Feasible = computeLoopCarriedParameters(); + if (!Feasible) { + LLVM_DEBUG(dbgs() << "PostPipeliner: Not feasible - RecMII\n"); + return false; + } + LLVM_DEBUG(dumpIntervals(Info, computeMinScheduleLength())); if (!tryHeuristics()) { LLVM_DEBUG(dbgs() << "PostPipeliner: No schedule found\n"); return false; @@ -735,6 +788,12 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { computeStages(); LLVM_DEBUG(dbgs() << "PostPipeliner: Schedule found, NS=" << NStages << " II=" << II << "\n"); + // Let's not risk adding prologue and epilogue noise, but note it would + // 
represent a valid 'regular' loop schedule. + if (NStages == 1) { + LLVM_DEBUG(dbgs() << "PostPipeliner: Degenerate pipeline, NStages=1\n"); + return false; + } // Check that we don't exceed the number of copies in the DAG. In that case // we didn't reach steady state, and we may have missed conflicts. @@ -838,4 +897,15 @@ void NodeInfo::reset(bool FullReset) { } } +void ScheduleInfo::compute() { + MinEarliest = 0; + MaxEarliest = 0; + MinLatest = -1; + for (int K = 0; K < NInstr; K++) { + MinEarliest = std::min(MinEarliest, Nodes[K].Earliest); + MaxEarliest = std::max(MaxEarliest, Nodes[K].Earliest); + MinLatest = std::min(MinLatest, Nodes[K].Latest); + } +} + } // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index 5fa8ca8d7f49..c065ef34f348 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -90,14 +90,46 @@ class NodeInfo { void reset(bool FullReset); }; +class ScheduleInfo { +public: + // The main administration of the PostPipeliner. Every node represents + // an instruction. The array can be indexed with the NodeNum of the + // corresponding SUnit + std::vector Nodes; + + // The number of instructions in the original loop body + int NInstr; + + // Some global statistics over Earliest and Latest of all nodes. + // MaxEarliest and MinLatest relate to the length of the schedule. + int MaxEarliest = 0; + int MinLatest = -1; + + // After propagating Earliest of the second iteration back to the first + // iteration, the minimum over Earliest is directly related to the + // Recurrence Minimum Initiation Interval. 
+ int MinEarliest = 0; + + void init(int NOrig, int NCopies) { + NInstr = NOrig; + Nodes.clear(); + Nodes.resize(NInstr * NCopies); + } + + NodeInfo &operator[](int N) { return Nodes[N]; } + const NodeInfo &operator[](int N) const { return Nodes[N]; } + // Compute some useful derived values + void compute(); +}; + class PostPipelinerStrategy { protected: ScheduleDAGInstrs &DAG; - std::vector &Info; + ScheduleInfo &Info; int LatestBias = 0; public: - PostPipelinerStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, + PostPipelinerStrategy(ScheduleDAGInstrs &DAG, ScheduleInfo &Info, int LatestBias) : DAG(DAG), Info(Info), LatestBias(LatestBias) {}; virtual ~PostPipelinerStrategy() {}; @@ -139,9 +171,10 @@ class PostPipeliner { int FirstUnscheduled = 0; int LastUnscheduled = -1; - /// Holds the cycle of each SUnit. The following should hold: + /// Holds the scheduling information for each instruction. The following + /// should hold: /// Cycle(N) mod II == Cycle(N % NInstr) mod II - std::vector Info; + ScheduleInfo Info; // The scoreboard and its depth ResourceScoreboard Scoreboard; diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir new file mode 100644 index 000000000000..7e4dd73fa457 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir @@ -0,0 +1,231 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# Derived from conv2d_bf16_0. 
The register allocation is tweaked manually to +# make it pipelinable with II=16 + +--- | + define dso_local void @conv2d(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: conv2d: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx + ; CHECK-NEXT: vldb wl8, [p0], m4 + ; CHECK-NEXT: vldb wh6, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0], m4 + ; CHECK-NEXT: vldb wh4, [p0, #32] + ; CHECK-NEXT: vldb wl4, [p0], m4; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh11, [p0, #32]; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wl11, [p0], d1; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wh8, [p1, #32]; nopa ; nops ; nopx ; vshift.align x3, x3, s0, x8, r3; nopv + ; CHECK-NEXT: vldb wl8, [p1, #0]; nopa ; nops ; nopx ; mov r5, p0; nopv + ; CHECK-NEXT: vldb wh6, [p1, #96]; nopa ; nops ; and r4, r5, r0; vshift.align x2, x2, s0, x6, r3; nopv + ; CHECK-NEXT: vldb wl6, [p1, #64]; nopa ; nops ; nopx ; vshuffle x9, x3, x2, r11; nopv + ; CHECK-NEXT: nopb ; vlda wh10, [p1, #160]; nops ; nopx ; vshift.align x1, x1, s0, x4, r3; nopv + ; CHECK-NEXT: nopb ; vlda wl10, [p1, #128]; nops ; nopx ; vshuffle x4, x3, x2, r12; nopv + ; CHECK-NEXT: vlda wh7, [p1, #224]; add r3, r4, #34; vshift.align x0, x0, s0, x11, r3 + ; CHECK-NEXT: vlda wl7, [p1, #192]; paddb [p1], #256; vshuffle x11, x1, x0, r11 + ; CHECK-NEXT: vshuffle x5, x11, x9, r12; vmac.f bmh0, bmh0, x4, x8, r31 + ; CHECK-NEXT: vshuffle x11, x11, x9, r15 + ; CHECK-NEXT: vshuffle x9, x1, x0, r26; vmac.f bmh3, bmh3, x4, x6, r31 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x5, x6, r31 + ; 
CHECK-NEXT: vmac.f bml3, bml3, x9, x8, r31 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x9, x6, r31 + ; CHECK-NEXT: nopa ; vldb wl8, [p0], m4; nopxm ; vmac.f bmh5, bmh5, x9, x10, r31 + ; CHECK-NEXT: vldb wh6, [p0, #32]; vmac.f bml0, bml0, x9, x7, r31 + ; CHECK-NEXT: vldb wl6, [p0], m4; vmac.f bmh4, bmh4, x4, x7, r31 + ; CHECK-NEXT: vldb wh4, [p0, #32]; vmac.f bml1, bml1, x4, x10, r31 + ; CHECK-NEXT: vldb wl4, [p0], m4; vmac.f bmh2, bmh2, x5, x8, r31 + ; CHECK-NEXT: vldb wh11, [p0, #32]; vmac.f bml4, bml4, x11, x8, r31 + ; CHECK-NEXT: vldb.3d wl11, [p0], d1; vmac.f bml6, bml6, x11, x6, r31 + ; CHECK-NEXT: vldb wh8, [p1, #32]; vshift.align x3, x3, s0, x8, r3; vmac.f bml2, bml2, x5, x10, r31 + ; CHECK-NEXT: vldb wl8, [p1, #0]; mov r5, p0; vmac.f bmh1, bmh1, x5, x7, r31 + ; CHECK-NEXT: vldb wh6, [p1, #96]; and r4, r5, r0; vshift.align x2, x2, s0, x6, r3; vmac.f bmh7, bmh7, x11, x7, r31 + ; CHECK-NEXT: vldb wl6, [p1, #64]; vshuffle x9, x3, x2, r11; vmac.f bmh8, bmh8, x11, x10, r31 + ; CHECK-NEXT: vlda wh10, [p1, #160]; vshift.align x1, x1, s0, x4, r3 + ; CHECK-NEXT: vlda wl10, [p1, #128]; vshuffle x4, x3, x2, r12 + ; CHECK-NEXT: vlda wh7, [p1, #224]; add r3, r4, #34; vshift.align x0, x0, s0, x11, r3 + ; CHECK-NEXT: vlda wl7, [p1, #192]; paddb [p1], #256; vshuffle x11, x1, x0, r11 + ; CHECK-NEXT: vshuffle x5, x11, x9, r12; vmac.f bmh0, bmh0, x4, x8, r31 + ; CHECK-NEXT: vshuffle x11, x11, x9, r15 + ; CHECK-NEXT: vshuffle x9, x1, x0, r26; vmac.f bmh3, bmh3, x4, x6, r31 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x5, x6, r31 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bml3, bml3, x9, x8, r31 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x9, x6, r31 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x9, x10, r31 + ; CHECK-NEXT: vmac.f bml0, bml0, x9, x7, 
r31 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x4, x7, r31 + ; CHECK-NEXT: vmac.f bml1, bml1, x4, x10, r31 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x5, x8, r31 + ; CHECK-NEXT: vmac.f bml4, bml4, x11, x8, r31 + ; CHECK-NEXT: vmac.f bml6, bml6, x11, x6, r31 + ; CHECK-NEXT: vmac.f bml2, bml2, x5, x10, r31 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x5, x7, r31 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x11, x7, r31 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x11, x10, r31 + ; CHECK-NEXT: nopx + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %p5 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %p6 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %p6, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %p5, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p5, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %p6, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync 
nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: conv2d +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x1, $x2, $x3, $x4, $x6, $d1_3d:0x000000000003C870, $dn3, $dn0, $dn4, $dj4 + + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5) + $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.p5, addrspace 5) + $x3 = VSHIFT_ALIGN $x3, $s0, $x8, $r3 + $wh6 = VLD_idx_imm_3x32_pseudo 
$p0, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5) + $wl6, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.p5, addrspace 5) + $x2 = VSHIFT_ALIGN $x2, $s0, $x6, $r3 + $wh4 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5) + $wl4, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.p5, addrspace 5) + $wh11 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5) + $wl11, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.p5, addrspace 5) + $x1 = VSHIFT_ALIGN $x1, $s0, $x4, $r3 + $x0 = VSHIFT_ALIGN $x0, $s0, $x11, $r3 + $wh8 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5) + $wl8 = VLD_idx_imm_3x32_pseudo $p1, 0 :: (load (<16 x s16>) from %ir.p5, addrspace 5) + $wh6 = VLD_idx_imm_3x32_pseudo $p1, 96 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5) + $wl6 = VLD_idx_imm_3x32_pseudo $p1, 64 :: (load (<16 x s16>) from %ir.p5, addrspace 5) + + $x4 = VSHUFFLE $x3, $x2, $r12 + $x9 = VSHUFFLE $x3, $x2, $r11 + $x11 = VSHUFFLE $x1, $x0, $r11 + $x5 = VSHUFFLE $x11, $x9, $r12 + $x11 = VSHUFFLE $x11, $x9, $r15 + $x9 = VSHUFFLE $x1, $x0, $r26 + + $wh10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 160 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5) + $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 128 :: (load (<16 x s16>) from %ir.p5, addrspace 5) + $wh7 = VLDA_dmw_lda_w_ag_idx_imm $p1, 224 :: (load (<16 x s16>) from %ir.p5 + 32, addrspace 5) + $wl7 = VLDA_dmw_lda_w_ag_idx_imm $p1, 192 :: (load (<16 x s16>) from %ir.p5, addrspace 5) + $p1 = nuw PADD_imm9_pseudo $p1, 256 + + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x9, $x8, $r31, implicit-def $srfpflags, implicit $crfpmask + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x8, $r31, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x4, $x6, $r31, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x6, $r31, 
implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x9, $x10, $r31, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x9, $x7, $r31, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x4, $x7, $r31, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x4, $x10, $r31, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x5, $x6, $r31, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x8, $r31, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x11, $x8, $r31, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x11, $x6, $r31, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x5, $x10, $r31, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x5, $x7, $r31, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x11, $x7, $r31, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x11, $x10, $r31, implicit-def $srfpflags, implicit $crfpmask + $r5 = MOV_mv_scl $p0 + $r4 = AND $r5, $r0 + $r3 = nuw nsw ADD_add_r_ri $r4, 34, implicit-def $srcarry + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir index a5dae2d34a2a..655cdee89a7a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir @@ -29,13 +29,13 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopx ; mov p2, p1; nopv - ; CHECK-NEXT: nopa ; nopx - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop + ; CHECK-NEXT: mov p2, p1 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; st r0, [p0, #0]; nopxm ; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup