[AIEX] Prevent libcalls for scalar floating point operations in ZOL
Refactor TTI to have options and common code in one place
Martien de Jong committed Feb 24, 2025
1 parent 7d9048c commit 1564db6
Showing 8 changed files with 260 additions and 264 deletions.
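The commit title concerns scalar floating-point operations that may be lowered to runtime library calls on AIE; a call inside the loop body defeats the zero-overhead-loop (ZOL) profitability check visible in the diff below. As a rough illustration only (a hypothetical helper, not the actual code this commit adds to AIETTICommon), such a guard could look like:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical sketch: detect scalar FP operations that a target without
// native scalar FP support is likely to lower to a libcall (e.g. fdiv ->
// __divsf3). A hardware-loop check could reject such loops up front, since
// the resulting call would invalidate the ZOL assumptions.
static bool mayContainScalarFPLibcall(const Loop *L) {
  for (const BasicBlock *BB : L->blocks())
    for (const Instruction &I : *BB) {
      if (!I.getType()->isFloatTy() && !I.getType()->isDoubleTy())
        continue;
      // Arithmetic and conversions on scalar FP types are typical libcall
      // candidates; vector FP usually maps to native AIE instructions.
      if (isa<BinaryOperator>(I) || isa<CastInst>(I))
        return true;
    }
  return false;
}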
165 changes: 19 additions & 146 deletions llvm/lib/Target/AIE/AIE2TargetTransformInfo.cpp
@@ -20,152 +20,6 @@ using namespace llvm;

#define DEBUG_TYPE "aie2tti"

cl::opt<bool> EnableAIEHardwareLoops("enable-aie-hardware-loops",
                                     cl::desc("Enable hardware loops on AIE"),
                                     cl::init(true), cl::Hidden);

cl::opt<bool>
    AllowAIEZOL("enable-aie-zero-overhead-loops",
                cl::desc("Enable true zero overhead hardware loops on AIE"),
                cl::init(true), cl::Hidden);

cl::opt<int> MinIterCountHLReject(
    "aie-hardware-loops-minitercount", cl::Hidden, cl::init(3),
    cl::desc("Minimum trip count threshold for HL rejection"));

cl::opt<bool>
    ForceHLGeneration("aie-force-hl-gen", cl::Hidden, cl::init(false),
                      cl::desc("Force HL generation ignoring metadata info."));

cl::opt<bool>
    ConsiderLSROuterLoops("aie-lsr-consider-outer", cl::Hidden, cl::init(false),
                          cl::desc("Whether to consider outer loops for LSR"));

cl::opt<bool>
    EnableAutoUnroll("aie-unroll-auto", cl::Hidden, cl::init(true),
                     cl::desc("Whether to unroll loops without pragmas"));
cl::opt<unsigned>
    MaxUnrollCount("aie-unroll-max-count", cl::Hidden, cl::init(4),
                   cl::desc("Maximum partial unroll count for loops"));
cl::opt<int> MaxUnrollLoads("aie-unroll-max-loads", cl::Hidden, cl::init(-1),
                            cl::desc("Maximum partial unroll count for loops"));
cl::opt<unsigned>
    MaxUnrollCost("aie-unroll-max-cost", cl::Hidden, cl::init(200),
                  cl::desc("Maximum partial unroll cost for loops"));
cl::opt<unsigned> PreferSwpOverUnroll(
    "aie-prefer-swp-over-unroll", cl::Hidden, cl::init(9),
    cl::desc("Aim for pipelining if MinIterCount is at least this value."));

void AIE2TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP,
                                          OptimizationRemarkEmitter *ORE) {
  UP.Partial = UP.Runtime = true;
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
  UP.Partial &= UP.Runtime &= EnableAutoUnroll;
  UP.MaxCount = MaxUnrollCount;
  UP.FullUnrollMaxCount = 32;
  UP.Threshold = MaxUnrollCost;
  UP.AllowExpensiveTripCount = true;

  if (L->getNumBlocks() == 1) {
    BasicBlock *LoopBlock = L->getHeader();
    if (MaxUnrollLoads >= 0) {
      int NumLoads = count_if(*LoopBlock, [](const Instruction &I) {
        return I.mayReadFromMemory();
      });
      if (NumLoads)
        UP.MaxCount =
            std::min(UP.MaxCount, unsigned(MaxUnrollLoads / NumLoads));
    }
    auto MinIterCount = getMinTripCount(L->getLoopID());
    if (MinIterCount && *MinIterCount >= PreferSwpOverUnroll) {
      UP.Partial = false;
      UP.Runtime = false;
    }
  }
}
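To make the load-based clamp in getUnrollingPreferences above concrete, here is a small standalone sketch reproducing the arithmetic (the value 4 is the aie-unroll-max-count default from above; the other numbers are made up):

#include <algorithm>
#include <cstdio>

// Illustrative only: mirrors the UP.MaxCount clamping done above when
// -aie-unroll-max-loads is set, using hypothetical values.
int main() {
  unsigned MaxUnrollCount = 4; // -aie-unroll-max-count default
  int MaxUnrollLoads = 8;      // hypothetical -aie-unroll-max-loads value
  int NumLoads = 3;            // instructions that may read from memory
  unsigned MaxCount = MaxUnrollCount;
  if (MaxUnrollLoads >= 0 && NumLoads)
    MaxCount = std::min(MaxCount, unsigned(MaxUnrollLoads / NumLoads));
  std::printf("clamped unroll count: %u\n", MaxCount); // prints 2
  return 0;
}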

bool AIE2TTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                           AssumptionCache &AC,
                                           TargetLibraryInfo *LibInfo,
                                           HardwareLoopInfo &HWLoopInfo) {

  if (!EnableAIEHardwareLoops) {
    LLVM_DEBUG(dbgs() << "AIE Loops: Disabled\n");
    return false;
  }

  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
    LLVM_DEBUG(dbgs() << "AIE Loops: No static backedge taken count\n");
    return false;
  }

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount));
  const SCEV *TripCountSCEV = SE.getAddExpr(
      BackedgeTakenCount, SE.getOne(BackedgeTakenCount->getType()));

  // We need to store the trip count in GPR/LC, a 32-bit register.
  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
    LLVM_DEBUG(dbgs() << "AIE Loops: Trip count does not fit into 32bits\n");
    return false;
  }

  // For now, we'll handle only single BB loops for AIE
  // zero-overhead loop.
  if (L->getNumBlocks() > 1)
    return false;

  if (!ForceHLGeneration) {
    std::optional<int64_t> MinTripCount = getMinTripCount(L, &SE);
    if (MinTripCount) {
      // Reject HL for this case.
      if (*MinTripCount <= MinIterCountHLReject) {
        return false;
      }
    } else {
      // We have metadata, but not iteration information.
      return false;
    }
  }

  // Scan the loop: loops with calls - make it unprofitable
  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &I : *BB) {
      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return false;
      }
    }
  }
  // We don't want to use ZOL in these cases
  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  // We always allow nested hardware loops, but only the innermost loop
  // can use actual zero overhead loop instructions
  HWLoopInfo.IsNestingLegal = true;
  if (L->isInnermost() && AllowAIEZOL) {
    LLVM_DEBUG(dbgs() << "AIE Loops: Loop is ZOL candidate\n");
    HWLoopInfo.CounterInReg = false;
  } else {
    LLVM_DEBUG(dbgs() << "AIE Loops: Loop is JNZD candidate\n");
    HWLoopInfo.CounterInReg = true;
  }
  return true;
}

bool AIE2TTIImpl::isProfitableOuterLSR(const Loop &L) const {
  // Down-counting loops are essentially always profitable for AIE.
  // They typically need a single GPR for counting down, while "up-counting"
  // loop need one for the IV, and one for the upper bound.
  return ConsiderLSROuterLoops.getNumOccurrences() > 0 ? ConsiderLSROuterLoops
                                                       : true;
}

static std::optional<Instruction *>
instCombineDemandedBits(InstCombiner &IC, IntrinsicInst &II, unsigned numBits) {
  KnownBits ScalarKnown(32);
@@ -190,3 +44,22 @@ AIE2TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  }
  return std::nullopt;
}

void AIE2TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP,
                                          OptimizationRemarkEmitter *ORE) {
  UP.Partial = UP.Runtime = true;
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
  Common.adjustUnrollingPreferences(L, SE, UP, ORE);
}

bool AIE2TTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                           AssumptionCache &AC,
                                           TargetLibraryInfo *LibInfo,
                                           HardwareLoopInfo &HWLoopInfo) {
  return Common.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}

bool AIE2TTIImpl::isProfitableOuterLSR(const Loop &L) const {
  return Common.isProfitableOuterLSR(L);
}
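The Common member used by the three delegating calls above is not declared in the two files shown; it presumably lives in one of the other changed files. A hedged sketch of the interface those call sites imply (names inferred from the calls, not taken from the actual header):

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Hypothetical interface sketch; the real AIETTICommon declaration is not
// part of the excerpt above.
class AIETTICommon {
public:
  // Applies the AIE-wide unrolling policy (auto-unroll flag, load-based
  // clamping, software-pipelining preference) on top of the defaults.
  void adjustUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                  TargetTransformInfo::UnrollingPreferences &UP,
                                  OptimizationRemarkEmitter *ORE);

  // Shared hardware-loop legality/profitability check, including rejection
  // of loops whose bodies may end up containing (library) calls.
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo);

  // Whether LSR should also consider outer loops on AIE.
  bool isProfitableOuterLSR(const Loop &L) const;
};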
17 changes: 8 additions & 9 deletions llvm/lib/Target/AIE/AIE2TargetTransformInfo.h
@@ -28,21 +28,13 @@ class AIE2TTIImpl : public AIEBaseTTIImpl<AIE2TTIImpl> {
  typedef AIEBaseTTIImpl<AIE2TTIImpl> BaseT;
  typedef TargetTransformInfo TTI;
  friend BaseT;
  AIETTICommon Common;

public:
  explicit AIE2TTIImpl(const AIE2TargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout(),
              (const AIESubtarget *)TM->getSubtargetImpl(F)) {}

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo);

  bool isProfitableOuterLSR(const Loop &L) const;
  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) const;

@@ -54,6 +46,13 @@ class AIE2TTIImpl : public AIEBaseTTIImpl<AIE2TTIImpl> {
  // This type of code can lead to additional pointer arithmetics and
  // and pointer moves (especially due to the pre-pipeliner).
  bool isProfitableFoldGEPIntoPHI() const { return false; }
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo);
  bool isProfitableOuterLSR(const Loop &L) const;
};

} // end namespace llvm
