MachineLICM to hoist instructions with constant inputs #220
Changes from all commits: 69a33fd, b9f375d, 3b47796, d1bf511, e522fc9, 3cf56f6, d8ee417, 05b6fe2, 7a93613, dab3e95, 5daaf71, da8d3d6
@@ -226,8 +226,8 @@ namespace {
   void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
                    MachineBasicBlock *CurPreheader);

-  void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                 BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
+  void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
+                 SmallSet<int, 32> &StoredFIs,
                  SmallVectorImpl<CandidateInfo> &Candidates,
                  MachineLoop *CurLoop);
@@ -356,7 +356,6 @@ bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   SchedModel.init(&ST);

   PreRegAlloc = MRI->isSSA();
   HasProfileData = MF.getFunction().hasProfileData();

   if (PreRegAlloc)
@@ -427,10 +426,63 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
   return false;
 }

+static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
+                                                BitVector &RUs,
+                                                const uint32_t *Mask) {
+  // FIXME: This intentionally works in reverse due to some issues with the
+  // Register Units infrastructure.
+  //
+  // This is used to apply callee-saved-register masks to the clobbered
+  // regunits mask.
+  //
+  // The right way to approach this is to start with a BitVector full of ones,
+  // then reset all the bits of the regunits of each register that is set in
+  // the mask (registers preserved), then OR the resulting bits with the
+  // Clobbers mask. This correctly prioritizes the saved registers, so if a RU
+  // is shared between a register that is preserved and one that is NOT
+  // preserved, that RU will not be set in the output vector (the clobbers).
+  //
+  // What we have to do for now is the opposite: we have to assume that the
+  // regunits of all registers that are NOT preserved are clobbered, even if
+  // those regunits are preserved by another register. So if a RU is shared
+  // as described previously, that RU will be set.
+  //
+  // This works around an issue that appears on AArch64, but isn't exclusive
+  // to that target: AArch64's Qn registers (128 bits) have Dn registers as
+  // their lower 64 bits. A few Dn registers are preserved by some calling
+  // conventions, but Qn and Dn share exactly the same reg units.
+  //
+  // If we did this the right way, Qn would be marked as NOT clobbered even
+  // though its upper 64 bits are NOT preserved. The conservative approach
+  // handles this correctly, at the cost of some missed optimizations on
+  // other targets.
+  //
+  // This is caused by how RegUnits are handled within TableGen. Ideally, Qn
+  // should have an extra RegUnit to model the "unknown" bits not covered by
+  // the subregs.
+  BitVector RUsFromRegsNotInMask(TRI.getNumRegUnits());
+  const unsigned NumRegs = TRI.getNumRegs();
+  const unsigned MaskWords = (NumRegs + 31) / 32;
+  for (unsigned K = 0; K < MaskWords; ++K) {
+    const uint32_t Word = Mask[K];
+    for (unsigned Bit = 0; Bit < 32; ++Bit) {
+      const unsigned PhysReg = (K * 32) + Bit;
+      if (PhysReg == NumRegs)
+        break;
+
+      if (PhysReg && !((Word >> Bit) & 1)) {
+        for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
+          RUsFromRegsNotInMask.set(*RUI);
+      }
+    }
+  }
+
+  RUs |= RUsFromRegsNotInMask;
+}
+
 /// Examine the instruction for a potential LICM candidate. Also
 /// gather register def and frame object update information.
-void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                                BitVector &PhysRegClobbers,
+void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
+                                BitVector &RUClobbers,
                                 SmallSet<int, 32> &StoredFIs,
                                 SmallVectorImpl<CandidateInfo> &Candidates,
                                 MachineLoop *CurLoop) {
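For reference, the "right way" described in the FIXME would look roughly like the sketch below. This is illustrative only, not code from the patch (and, per the FIXME, it would mishandle the AArch64 Qn/Dn case); it reuses the same iterator APIs the patch already uses:

```cpp
// Hypothetical variant that prioritizes preserved registers, as described
// in the FIXME. Unsafe today because a regunit shared by a preserved and a
// non-preserved register (AArch64 Dn/Qn) would be treated as preserved.
static void applyRegMaskPreservingSharedRUs(const TargetRegisterInfo &TRI,
                                            BitVector &RUs,
                                            const uint32_t *Mask) {
  // Start from "everything is clobbered"...
  BitVector ClobberedRUs(TRI.getNumRegUnits(), /*t=*/true);

  // ...then clear the regunits of every preserved register. In a regmask,
  // a set bit means the register is preserved by the call.
  const unsigned NumRegs = TRI.getNumRegs();
  for (unsigned PhysReg = 1; PhysReg < NumRegs; ++PhysReg) {
    if ((Mask[PhysReg / 32] >> (PhysReg % 32)) & 1) {
      for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
        ClobberedRUs.reset(*RUI);
    }
  }

  // OR the surviving clobber bits into the output mask.
  RUs |= ClobberedRUs;
}
```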
@@ -452,7 +504,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // We can't hoist an instruction defining a physreg that is clobbered in
     // the loop.
     if (MO.isRegMask()) {
-      PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+      applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, MO.getRegMask());
       continue;
     }
@@ -464,16 +516,24 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     assert(Reg.isPhysical() && "Not expecting virtual register!");

     if (!MO.isDef()) {
-      if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
-        // If it's using a non-loop-invariant register, then it's obviously not
-        // safe to hoist.
-        HasNonInvariantUse = true;
+      if (!HasNonInvariantUse) {
+        for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+          // If it's using a non-loop-invariant register, then it's obviously
+          // not safe to hoist.
+          // Note this isn't a final check, as we haven't gathered all the loop
+          // register definitions yet.
+          if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
+            HasNonInvariantUse = true;
+            break;
+          }
+        }
+      }
       continue;
     }

     if (MO.isImplicit()) {
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        PhysRegClobbers.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        RUClobbers.set(*RUI);
       if (!MO.isDead())
         // Non-dead implicit def? This cannot be hoisted.
         RuledOut = true;
@@ -492,19 +552,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
       // If we have already seen another instruction that defines the same
       // register, then this is not safe. Two defs is indicated by setting a
       // PhysRegClobbers bit.
-      for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
-        if (PhysRegDefs.test(*AS))
-          PhysRegClobbers.set(*AS);
-      }
-      // Need a second loop because MCRegAliasIterator can visit the same
-      // register twice.
-      for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
-        PhysRegDefs.set(*AS);
-
-      if (PhysRegClobbers.test(Reg))
-        // MI defined register is seen defined by another instruction in
-        // the loop, it cannot be a LICM candidate.
-        RuledOut = true;
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+        if (RUDefs.test(*RUI)) {
+          RUClobbers.set(*RUI);
+          RuledOut = true;
+        } else if (RUClobbers.test(*RUI)) {
+          // MI defined register is seen defined by another instruction in
+          // the loop, it cannot be a LICM candidate.
+          RuledOut = true;
+        }
+
+        RUDefs.set(*RUI);
+      }
     }

     // Only consider reloads for now and remats which do not have register
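To make the regunit-based def tracking concrete, here is a commented sketch (not from the patch) of why one pass over regunits replaces the two MCRegAliasIterator loops. The W0/X0 walk-through assumes AArch64, where W0 is the low 32 bits of X0 and the two share a regunit:

```cpp
// Illustrative sketch: a single pass over regunits both detects a second
// def of any aliasing register and records the def. Each regunit is visited
// exactly once, unlike MCRegAliasIterator, which can visit the same
// register twice and so needed a second loop.
static void recordRegUnitDef(MCRegister Reg, const TargetRegisterInfo *TRI,
                             BitVector &RUDefs, BitVector &RUClobbers,
                             bool &RuledOut) {
  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
    if (RUDefs.test(*RUI)) {
      // E.g. a def of W0 followed by a def of X0: the shared regunit is
      // already in RUDefs, so it is promoted to a clobber.
      RUClobbers.set(*RUI);
      RuledOut = true;
    } else if (RUClobbers.test(*RUI)) {
      // Already seen two defs touching this unit; still not a candidate.
      RuledOut = true;
    }
    RUDefs.set(*RUI);
  }
}
```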
@@ -525,9 +584,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
   if (!Preheader)
     return;

-  unsigned NumRegs = TRI->getNumRegs();
-  BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
-  BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+  unsigned NumRegUnits = TRI->getNumRegUnits();
+  BitVector RUDefs(NumRegUnits);     // RUs defined once in the loop.
+  BitVector RUClobbers(NumRegUnits); // RUs defined more than once.

   SmallVector<CandidateInfo, 32> Candidates;
   SmallSet<int, 32> StoredFIs;
@@ -540,26 +599,31 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
     const MachineLoop *ML = MLI->getLoopFor(BB);
     if (ML && ML->getHeader()->isEHPad()) continue;

-    // Conservatively treat live-in's as an external def.
-    // FIXME: That means a reload that're reused in successor block(s) will not
-    // be LICM'ed.
-    for (const auto &LI : BB->liveins()) {
-      for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
-        PhysRegDefs.set(*AI);
-    }
-
     // Funclet entry blocks will clobber all registers
     if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
-      PhysRegClobbers.setBitsNotInMask(Mask);
+      applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);

     SpeculationState = SpeculateUnknown;
     for (MachineInstr &MI : *BB)
-      ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
-                CurLoop);
+      ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
   }

+  // Mark registers as clobbered if they are live-in and also defined in the
+  // loop.
+  for (const auto &LoopLI : CurLoop->getHeader()->liveins()) {
+    MCPhysReg LoopLiveInReg = LoopLI.PhysReg;
+    LaneBitmask LiveInMask = LoopLI.LaneMask;
+    for (MCRegUnitMaskIterator RUI(LoopLiveInReg, TRI); RUI.isValid(); ++RUI) {
+      auto LiveInUnit = (*RUI).first;
+      LaneBitmask UnitMask = (*RUI).second;
+      // Check if the live-in lanes overlap with the lanes touched by
+      // LiveInUnit.
+      if ((UnitMask & LiveInMask).any() && RUDefs.test(LiveInUnit)) {
+        RUClobbers.set(LiveInUnit);
+      }
+    }
+  }

Review comment: check: we filter off some cases where other aliasing reg units are live, but with disjoint lanes. Maybe a comment to clarify this could be helpful.

Reply: Exactly, we only account for the live reg units that are part of the lane mask.
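Expanding on that exchange, here is a commented sketch (the register names, lane masks, and regunits are assumed for illustration, not taken from the patch) of what the lane-mask filter does:

```cpp
// Illustrative sketch: suppose Q0 has subregisters D0 (lane mask 0x1,
// regunit U0) and D1 (lane mask 0x2, regunit U1), and the loop header's
// live-in entry is {Q0, LaneMask = 0x1}, i.e. only the D0 half is live
// around the loop.
static void markClobberedLiveInUnits(MCPhysReg LoopLiveInReg,
                                     LaneBitmask LiveInMask,
                                     const TargetRegisterInfo *TRI,
                                     const BitVector &RUDefs,
                                     BitVector &RUClobbers) {
  for (MCRegUnitMaskIterator RUI(LoopLiveInReg, TRI); RUI.isValid(); ++RUI) {
    unsigned LiveInUnit = (*RUI).first;   // U0, then U1
    LaneBitmask UnitMask = (*RUI).second; // 0x1 for U0, 0x2 for U1
    // U1's lanes are disjoint from the live-in mask, so a def of D1 inside
    // the loop does NOT force a clobber; a def of D0 does.
    if ((UnitMask & LiveInMask).any() && RUDefs.test(LiveInUnit))
      RUClobbers.set(LiveInUnit);
  }
}
```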
   // Gather the registers read / clobbered by the terminator.
-  BitVector TermRegs(NumRegs);
+  BitVector TermRUs(NumRegUnits);
   MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
   if (TI != Preheader->end()) {
     for (const MachineOperand &MO : TI->operands()) {
@@ -568,8 +632,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       Register Reg = MO.getReg();
       if (!Reg)
         continue;
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        TermRegs.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        TermRUs.set(*RUI);
     }
   }
@@ -587,24 +651,36 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       continue;

     unsigned Def = Candidate.Def;
-    if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
-      bool Safe = true;
-      MachineInstr *MI = Candidate.MI;
-      for (const MachineOperand &MO : MI->all_uses()) {
-        if (!MO.getReg())
-          continue;
-        Register Reg = MO.getReg();
-        if (PhysRegDefs.test(Reg) ||
-            PhysRegClobbers.test(Reg)) {
-          Safe = false;
-          break;
-        }
-      }
-      if (Safe)
-        HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
+    bool Safe = true;
+    for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
+      if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
+        Safe = false;
+        break;
+      }
     }
+
+    if (!Safe)
+      continue;
+
+    MachineInstr *MI = Candidate.MI;
+    for (const MachineOperand &MO : MI->all_uses()) {
+      if (!MO.getReg())
+        continue;
+      for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
+        if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
+          // If it's using a non-loop-invariant register, then it's obviously
+          // not safe to hoist.
+          Safe = false;
+          break;
+        }
+      }
+
+      if (!Safe)
+        break;
+    }
+
+    if (Safe)
+      HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
   }
 }
@@ -361,10 +361,12 @@ class EnforceCopyEdges : public ScheduleDAGMutation {

 class PropagateIncomingLatencies : public ScheduleDAGMutation {
   bool OnlyCopyLike;
+  bool OnlyLocalSources;

 public:
-  PropagateIncomingLatencies(bool OnlyCopyLike = true)
-      : OnlyCopyLike(OnlyCopyLike) {}
+  PropagateIncomingLatencies(bool OnlyCopyLike = true,
+                             bool OnlyLocalSources = true)
+      : OnlyCopyLike(OnlyCopyLike), OnlyLocalSources(OnlyLocalSources) {}
   void apply(ScheduleDAGInstrs *DAG) override {
     auto IsData = [](const SDep &D) { return D.getKind() == SDep::Data; };
     for (SUnit &SU : DAG->SUnits) {
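For context, a hypothetical registration site follows. The diff does not show where this backend instantiates the mutation, so the call below is an assumption based on the standard ScheduleDAGMI mutation mechanism:

```cpp
// Hypothetical: register the mutation with both flags spelled out. The
// second argument enables the new REG_SEQUENCE-aware behaviour.
DAG->addMutation(std::make_unique<PropagateIncomingLatencies>(
    /*OnlyCopyLike=*/true, /*OnlyLocalSources=*/true));
```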
@@ -381,25 +383,55 @@ class PropagateIncomingLatencies : public ScheduleDAGMutation {
         }))
       continue;

-    // Find the common latency for all predecessors that can be
-    // "moved" to successors.
-    SDep *MinLatencyDep = nullptr;
-    for (SDep &PredEdge : make_filter_range(SU.Preds, IsData)) {
-      if (!MinLatencyDep ||
-          PredEdge.getLatency() < MinLatencyDep->getLatency())
-        MinLatencyDep = &PredEdge;
+    // Avoid pushing a REG_SEQUENCE close to its sources if it is likely to
+    // generate a hoistable COPY after regalloc. Keeping that COPY close to
+    // its consumers instead will facilitate MachineLICM.
+    // Indeed, that typically means that only the lanes corresponding to
+    // internal sources will be loop-carried. The external lane will come
+    // directly from the pre-header, and the corresponding COPY can then be
+    // hoisted by MachineLICM.
+    const MachineBasicBlock &MBB = *MI.getParent();
+    const MachineRegisterInfo &MRI = DAG->MRI;
+    auto MayProduceHoistableCopy = [&MBB, &MRI](const MachineInstr &MI) {
+      if (!MI.isRegSequence() || !MRI.isSSA())
+        return false;
+      const auto NumExternal =
+          count_if(MI.uses(), [&MBB, &MRI](const MachineOperand &MO) {
+            return MO.isReg() && MO.getReg().isVirtual() &&
+                   MRI.getVRegDef(MO.getReg())->getParent() != &MBB;
+          });
+      const auto NumInternal = MI.getNumOperands() - 1 - (2 * NumExternal);
+      return NumExternal == 1 && NumInternal >= 1;
+    };

Review comment: Here, do we consider a subregister index of an internal value as accounting for …

Reply: I'm not completely clear on what the best heuristic would be, to be honest. As mentioned here, #220 (comment), I have experimented a bit, but it is challenging to find something that consistently yields good results. The current code works pretty well (see the results in the PR description), and I'm a bit afraid of specialising the heuristic too much for our current benchmarks if we keep tweaking it. I would suggest leaving that basic heuristic intact and tweaking it in follow-up work as we see fit. What do you think?

Review comment: As this is a heuristic, we can leave it as is, because it gives us a nice improvement. I also tried some experiments during the review, and I think it is good as is (I also know that it is hard to tune).
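As a worked example of the operand counting in MayProduceHoistableCopy (the virtual register names below are made up for illustration):

```cpp
// Hypothetical REG_SEQUENCE with one external source, the case the
// heuristic wants to detect:
//
//   %res:vec128 = REG_SEQUENCE %ext, %subreg.sub0, %loc, %subreg.sub1
//
// Operand count: 1 def (%res) + 4 uses (two source registers plus their
// two subregister indices) = 5. If %ext is defined outside the current
// block and %loc inside it:
//
//   NumExternal = 1                    // only %ext matches the predicate
//   NumInternal = 5 - 1 - (2 * 1) = 2  // %loc and its subregister index
//
// NumExternal == 1 && NumInternal >= 1 holds, so this REG_SEQUENCE is
// treated as likely to produce a hoistable COPY for the external lane.
```

Note that subregister index operands are immediates, so the `MO.isReg()` check keeps them out of NumExternal; an internal value's subregister index is simply folded into the NumInternal operand count, which is what the review question above is probing.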
+    // Whether to propagate latency from predecessors to successors (true),
+    // or from successors to predecessors (false).
+    const bool MoveLatToSuccessors =
+        !OnlyLocalSources || !MayProduceHoistableCopy(MI);
+
+    // Find the common latency for all predecessors (or successors) that
+    // can be "moved" to successors (or predecessors).
+    const SDep *MinLatencyDep = nullptr;
+    ArrayRef<SDep> SuccsOrPreds = MoveLatToSuccessors ? SU.Preds : SU.Succs;
+    for (const SDep &Edge : make_filter_range(SuccsOrPreds, IsData)) {
+      if (!MinLatencyDep || Edge.getLatency() < MinLatencyDep->getLatency())
+        MinLatencyDep = &Edge;
     }
     if (!MinLatencyDep)
       continue;

-    int PropagatableSrcLatency = MinLatencyDep->getLatency();
+    int AmountToShiftToSuccessors = MoveLatToSuccessors
+                                        ? int(MinLatencyDep->getLatency())
+                                        : -int(MinLatencyDep->getLatency());
     for (SDep &PredEdge : make_filter_range(SU.Preds, IsData)) {
       updatePredLatency(PredEdge, SU,
-                        PredEdge.getLatency() - PropagatableSrcLatency);
+                        int(PredEdge.getLatency()) - AmountToShiftToSuccessors);
     }
     for (SDep &SuccEdge : make_filter_range(SU.Succs, IsData)) {
       updateSuccLatency(SuccEdge, SU,
-                        SuccEdge.getLatency() + PropagatableSrcLatency);
+                        int(SuccEdge.getLatency()) + AmountToShiftToSuccessors);
     }
   }
 }
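A worked example of the latency shift (the latencies are made up):

```cpp
// Suppose SU has one data predecessor edge P->SU with latency 3, and two
// data successor edges SU->S1 and SU->S2.
//
// Case 1: MoveLatToSuccessors == true (the common case).
//   AmountToShiftToSuccessors = +3
//   P->SU  : 3 - 3 = 0        // SU can now be placed right after P
//   SU->S1 : lat(S1) + 3      // the 3 cycles are charged after SU instead
//   SU->S2 : lat(S2) + 3
//
// Case 2: MoveLatToSuccessors == false (a REG_SEQUENCE likely to produce a
// hoistable COPY). With the minimum successor latency at, say, 2:
//   AmountToShiftToSuccessors = -2
//   P->SU  : 3 - (-2) = 5     // latency is pulled onto the incoming edges,
//   SU->S1 : lat(S1) - 2      // keeping SU close to its consumers
//   SU->S2 : lat(S2) - 2
```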
Review comment: As we removed this FIXME (and extended the implementation accordingly), I think this can be well received by the community.