MachineLICM to hoist instructions with constant inputs #220

Merged: 12 commits, Oct 29, 2024
180 changes: 128 additions & 52 deletions llvm/lib/CodeGen/MachineLICM.cpp
@@ -226,8 +226,8 @@ namespace {
void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
MachineBasicBlock *CurPreheader);

void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
SmallSet<int, 32> &StoredFIs,
SmallVectorImpl<CandidateInfo> &Candidates,
MachineLoop *CurLoop);

@@ -356,7 +356,6 @@ bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
SchedModel.init(&ST);

PreRegAlloc = MRI->isSSA();
HasProfileData = MF.getFunction().hasProfileData();

if (PreRegAlloc)
@@ -427,10 +426,63 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
return false;
}

static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
BitVector &RUs,
const uint32_t *Mask) {
// FIXME: This intentionally works in reverse due to some issues with the
// Register Units infrastructure.
//
// This is used to apply callee-saved-register masks to the clobbered regunits
// mask.
//
// The right way to approach this is to start with a BitVector full of ones,
// then reset all the bits of the regunits of each register that is set in the
// mask (registers preserved), then OR the resulting bits with the Clobbers
// mask. This correctly prioritizes the saved registers, so if a RU is shared
// between a register that is preserved, and one that is NOT preserved, that
// RU will not be set in the output vector (the clobbers).
//
// What we have to do for now is the opposite: we have to assume that the
// regunits of all registers that are NOT preserved are clobbered, even if
// those regunits are preserved by another register. So if a RU is shared
// like described previously, that RU will be set.
//
// This is to work around an issue which appears in AArch64, but isn't
// exclusive to that target: AArch64's Qn registers (128 bits) have Dn
// registers as their lower 64 bits. A few Dn registers are preserved by some
// calling conventions, but Qn and Dn share exactly the same reg units.
//
// If we did this the right way, Qn would be marked as NOT clobbered even
// though its upper 64 bits are NOT preserved. The conservative approach
// handles this correctly, at the cost of some missed optimizations on other
// targets.
//
// This is caused by how RegUnits are handled within TableGen. Ideally, Qn
// should have an extra RegUnit to model the "unknown" bits not covered by the
// subregs.
BitVector RUsFromRegsNotInMask(TRI.getNumRegUnits());
const unsigned NumRegs = TRI.getNumRegs();
const unsigned MaskWords = (NumRegs + 31) / 32;
for (unsigned K = 0; K < MaskWords; ++K) {
const uint32_t Word = Mask[K];
for (unsigned Bit = 0; Bit < 32; ++Bit) {
const unsigned PhysReg = (K * 32) + Bit;
if (PhysReg == NumRegs)
break;

if (PhysReg && !((Word >> Bit) & 1)) {
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
RUsFromRegsNotInMask.set(*RUI);
}
}
}

RUs |= RUsFromRegsNotInMask;
}
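For comparison, here is a minimal sketch (not part of the patch, hypothetical helper name) of the "right way" described in the FIXME above: start from an all-ones regunit vector, clear the units of every register the mask preserves, and only then OR the result into the clobber set. A set bit in an LLVM regmask means the register's value is preserved across the call.

```cpp
// Sketch only: how the mask could be applied once the regunit modelling issue
// is fixed. Preserved registers win over clobbered aliases.
static void applyRegMaskToRegUnitsIdeal(const TargetRegisterInfo &TRI,
                                        BitVector &RUs, const uint32_t *Mask) {
  // Start by assuming every regunit is clobbered...
  BitVector ClobberedRUs(TRI.getNumRegUnits(), /*t=*/true);
  for (unsigned PhysReg = 1, E = TRI.getNumRegs(); PhysReg != E; ++PhysReg) {
    // ...then clear the regunits of each register the regmask preserves.
    if ((Mask[PhysReg / 32] >> (PhysReg % 32)) & 1)
      for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
        ClobberedRUs.reset(*RUI);
  }
  RUs |= ClobberedRUs;
}
```

With this direction, a regunit shared by a preserved Dn and its enclosing Qn would end up not clobbered even though Qn's upper half is not preserved, which is why the patch keeps the conservative reverse logic until Qn gets an extra regunit for its upper bits.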

/// Examine the instruction for a potential LICM candidate. Also
/// gather register def and frame object update information.
void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
BitVector &PhysRegClobbers,
void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
BitVector &RUClobbers,
SmallSet<int, 32> &StoredFIs,
SmallVectorImpl<CandidateInfo> &Candidates,
MachineLoop *CurLoop) {
@@ -452,7 +504,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
// We can't hoist an instruction defining a physreg that is clobbered in
// the loop.
if (MO.isRegMask()) {
PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, MO.getRegMask());
continue;
}

@@ -464,16 +516,24 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
assert(Reg.isPhysical() && "Not expecting virtual register!");

if (!MO.isDef()) {
if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
// If it's using a non-loop-invariant register, then it's obviously not
// safe to hoist.
HasNonInvariantUse = true;
if (!HasNonInvariantUse) {
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
// If it's using a non-loop-invariant register, then it's obviously
// not safe to hoist.
// Note this isn't a final check, as we haven't gathered all the loop
// register definitions yet.
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
HasNonInvariantUse = true;
break;
}
}
}
continue;
}

if (MO.isImplicit()) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
PhysRegClobbers.set(*AI);
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
RUClobbers.set(*RUI);
if (!MO.isDead())
// Non-dead implicit def? This cannot be hoisted.
RuledOut = true;
@@ -492,19 +552,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
// If we have already seen another instruction that defines the same
// register, then this is not safe. Two defs is indicated by setting a
// PhysRegClobbers bit.
for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
if (PhysRegDefs.test(*AS))
PhysRegClobbers.set(*AS);
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
if (RUDefs.test(*RUI)) {
RUClobbers.set(*RUI);
RuledOut = true;
} else if (RUClobbers.test(*RUI)) {
// The register defined by MI is also defined by another instruction in
// the loop, so MI cannot be a LICM candidate.
RuledOut = true;
}

RUDefs.set(*RUI);
}
// Need a second loop because MCRegAliasIterator can visit the same
// register twice.
for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
PhysRegDefs.set(*AS);

if (PhysRegClobbers.test(Reg))
// MI defined register is seen defined by another instruction in
// the loop, it cannot be a LICM candidate.
RuledOut = true;
}

// Only consider reloads for now and remats which do not have register
@@ -525,9 +584,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
if (!Preheader)
return;

unsigned NumRegs = TRI->getNumRegs();
BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
unsigned NumRegUnits = TRI->getNumRegUnits();
BitVector RUDefs(NumRegUnits); // RUs defined once in the loop.
BitVector RUClobbers(NumRegUnits); // RUs defined more than once.

SmallVector<CandidateInfo, 32> Candidates;
SmallSet<int, 32> StoredFIs;
@@ -540,26 +599,31 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
const MachineLoop *ML = MLI->getLoopFor(BB);
if (ML && ML->getHeader()->isEHPad()) continue;

// Conservatively treat live-ins as an external def.
// FIXME: That means a reload that is reused in successor block(s) will not
// be LICM'ed.
for (const auto &LI : BB->liveins()) {
for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
PhysRegDefs.set(*AI);
}

Collaborator: As we removed this FIXME (and extended the implementation accordingly), I think this can be well received by the community.

// Funclet entry blocks will clobber all registers
if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
PhysRegClobbers.setBitsNotInMask(Mask);
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);

SpeculationState = SpeculateUnknown;
for (MachineInstr &MI : *BB)
ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
CurLoop);
ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
}

// Mark registers as clobbered if they are livein and also defined in the loop
for (const auto &LoopLI : CurLoop->getHeader()->liveins()) {
MCPhysReg LoopLiveInReg = LoopLI.PhysReg;
LaneBitmask LiveInMask = LoopLI.LaneMask;
Collaborator: Check: we filter out some cases where other aliasing reg units are live, but with disjoint lanes. Maybe a comment to clarify this would be helpful.

Collaborator (Author): Exactly, we only account for the live reg units that are part of the lane mask.

for (MCRegUnitMaskIterator RUI(LoopLiveInReg, TRI); RUI.isValid(); ++RUI) {
auto LiveInUnit = (*RUI).first;
LaneBitmask UnitMask = (*RUI).second;
// Check if the livein lanes overlap with the lanes touched by LiveInUnit
if ((UnitMask & LiveInMask).any() && RUDefs.test(LiveInUnit)) {
RUClobbers.set(LiveInUnit);
}
}
}
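To make the exchange above concrete, here is a small, self-contained illustration (hypothetical lane values, not taken from the patch) of the overlap test: a live-in's regunit is only checked against RUDefs when the lanes it covers intersect the lanes that are actually live into the header.

```cpp
#include "llvm/MC/LaneBitmask.h"
#include <cassert>

using llvm::LaneBitmask;

// The per-regunit test performed by the loop above.
static bool unitIsRelevant(LaneBitmask UnitMask, LaneBitmask LiveInMask) {
  return (UnitMask & LiveInMask).any();
}

int main() {
  // Hypothetical register with two regunits: one covering its low half
  // (lane 0x1) and one covering its high half (lane 0x2).
  const LaneBitmask LowHalf(0x1), HighHalf(0x2);
  // Only the low half is live into the loop header.
  const LaneBitmask LiveIn = LowHalf;
  assert(unitIsRelevant(LowHalf, LiveIn));   // checked against RUDefs
  assert(!unitIsRelevant(HighHalf, LiveIn)); // skipped: disjoint lanes
  return 0;
}
```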

// Gather the registers read / clobbered by the terminator.
BitVector TermRegs(NumRegs);
BitVector TermRUs(NumRegUnits);
MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
if (TI != Preheader->end()) {
for (const MachineOperand &MO : TI->operands()) {
@@ -568,8 +632,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
Register Reg = MO.getReg();
if (!Reg)
continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
TermRegs.set(*AI);
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
TermRUs.set(*RUI);
}
}

@@ -587,24 +651,36 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
continue;

unsigned Def = Candidate.Def;
if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
bool Safe = true;
MachineInstr *MI = Candidate.MI;
for (const MachineOperand &MO : MI->all_uses()) {
if (!MO.getReg())
continue;
Register Reg = MO.getReg();
if (PhysRegDefs.test(Reg) ||
PhysRegClobbers.test(Reg)) {
bool Safe = true;
for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
Safe = false;
break;
}
}

if (!Safe)
continue;

MachineInstr *MI = Candidate.MI;
for (const MachineOperand &MO : MI->all_uses()) {
if (!MO.getReg())
continue;
for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
// If it's using a non-loop-invariant register, then it's obviously
// not safe to hoist.
Safe = false;
break;
}
}
if (Safe)
HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);

if (!Safe)
break;
}

if (Safe)
HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
}
}

56 changes: 44 additions & 12 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
@@ -361,10 +361,12 @@ class EnforceCopyEdges : public ScheduleDAGMutation {

class PropagateIncomingLatencies : public ScheduleDAGMutation {
bool OnlyCopyLike;
bool OnlyLocalSources;

public:
PropagateIncomingLatencies(bool OnlyCopyLike = true)
: OnlyCopyLike(OnlyCopyLike) {}
PropagateIncomingLatencies(bool OnlyCopyLike = true,
bool OnlyLocalSources = true)
: OnlyCopyLike(OnlyCopyLike), OnlyLocalSources(OnlyLocalSources) {}
void apply(ScheduleDAGInstrs *DAG) override {
auto IsData = [](const SDep &D) { return D.getKind() == SDep::Data; };
for (SUnit &SU : DAG->SUnits) {
@@ -381,25 +383,55 @@ class PropagateIncomingLatencies : public ScheduleDAGMutation {
}))
continue;

// Find the common latency for all predecessors that can be
// "moved" to successors.
SDep *MinLatencyDep = nullptr;
for (SDep &PredEdge : make_filter_range(SU.Preds, IsData)) {
if (!MinLatencyDep ||
PredEdge.getLatency() < MinLatencyDep->getLatency())
MinLatencyDep = &PredEdge;
// Avoid pushing a REG_SEQUENCE close to its sources if it is likely to
// generate a hoistable COPY after regalloc. Keeping that COPY close to
// its consumers instead will facilitate MachineLICM.
// Indeed, that typically means that only the lanes corresponding to
// internal sources will be loop-carried. The external lane will come
// directly from the pre-header, and the corresponding COPY can then be
// hoisted by MachineLICM.
const MachineBasicBlock &MBB = *MI.getParent();
const MachineRegisterInfo &MRI = DAG->MRI;
auto MayProduceHoistableCopy = [&MBB, &MRI](const MachineInstr &MI) {
if (!MI.isRegSequence() || !MRI.isSSA())
return false;
const auto NumExternal =
count_if(MI.uses(), [&MBB, &MRI](const MachineOperand &MO) {
return MO.isReg() && MO.getReg().isVirtual() &&
MRI.getVRegDef(MO.getReg())->getParent() != &MBB;
});
const auto NumInternal = MI.getNumOperands() - 1 - (2 * NumExternal);
return NumExternal == 1 && NumInternal >= 1;
Collaborator: Here, do we consider a subregister index of an internal value as accounting for NumInternal? Should we divide NumInternal by 2?

Collaborator (Author): I'm not completely clear on what the best heuristic would be, to be honest. As mentioned in #220 (comment), I have experimented a bit, but it is challenging to find something that consistently yields good results. The current code works pretty well (see results in the PR description), and I'm a bit afraid of specialising the heuristic too much for our current benchmarks if we keep tweaking it. I would suggest leaving this basic heuristic intact and tweaking it in follow-up work as we see fit. What do you think?

Collaborator: As this is a heuristic, we can leave it as is, since it gives us a nice improvement. I also tried some experiments during the review, and I think it is good as is (I also know it is hard to tune).

};
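As a worked illustration of the operand arithmetic questioned above (hypothetical REG_SEQUENCE shape, not taken from the patch): a REG_SEQUENCE has one def operand followed by (value, subreg-index) pairs, so the NumInternal expression counts both the internal value and its accompanying subreg index.

```cpp
#include <cassert>

// Sketch of the heuristic's arithmetic for a hypothetical
//   %dst = REG_SEQUENCE %ext, sub0, %int, sub1
// where only %ext is defined outside the current block.
int main() {
  const int NumOperands = 5; // 1 def + 2 (value, subreg-index) pairs
  const int NumExternal = 1; // %ext
  const int NumInternal = NumOperands - 1 - 2 * NumExternal;
  // NumInternal == 2: the internal value and its subreg index are both
  // counted, which is what the "divide by 2" question refers to.
  assert(NumInternal == 2);
  // The check only requires NumExternal == 1 && NumInternal >= 1, so the
  // over-count does not change the outcome for this shape.
  return 0;
}
```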

// Whether to propagate latency from predecessors to successors (true),
// or from successors to predecessors (false).
const bool MoveLatToSuccessors =
!OnlyLocalSources || !MayProduceHoistableCopy(MI);

// Find the common latency for all predecessors (or successors) that
// can be "moved" to successors (or predecessors).
const SDep *MinLatencyDep = nullptr;
ArrayRef<SDep> SuccsOrPreds = MoveLatToSuccessors ? SU.Preds : SU.Succs;
for (const SDep &Edge : make_filter_range(SuccsOrPreds, IsData)) {
if (!MinLatencyDep || Edge.getLatency() < MinLatencyDep->getLatency())
MinLatencyDep = &Edge;
}
if (!MinLatencyDep)
continue;

int PropagatableSrcLatency = MinLatencyDep->getLatency();
int AmountToShiftToSuccessors = MoveLatToSuccessors
? int(MinLatencyDep->getLatency())
: -int(MinLatencyDep->getLatency());
for (SDep &PredEdge : make_filter_range(SU.Preds, IsData)) {
updatePredLatency(PredEdge, SU,
PredEdge.getLatency() - PropagatableSrcLatency);
int(PredEdge.getLatency()) -
AmountToShiftToSuccessors);
}
for (SDep &SuccEdge : make_filter_range(SU.Succs, IsData)) {
updateSuccLatency(SuccEdge, SU,
SuccEdge.getLatency() + PropagatableSrcLatency);
int(SuccEdge.getLatency()) +
AmountToShiftToSuccessors);
}
}
}
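A small worked example (hypothetical latencies, not from the patch) of what the two loops above do when MoveLatToSuccessors is true: the smallest incoming data latency is subtracted from every predecessor edge and added to every successor edge, so the total latency along any source-to-consumer path through the node is unchanged.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> PredLat = {3, 5}; // incoming data-edge latencies
  std::vector<int> SuccLat = {1};    // outgoing data-edge latencies
  // Common latency that can be moved from predecessors to successors.
  const int Shift = *std::min_element(PredLat.begin(), PredLat.end()); // 3
  for (int &L : PredLat) L -= Shift; // {0, 2}
  for (int &L : SuccLat) L += Shift; // {4}
  // Path totals are preserved: 3+1 -> 0+4 and 5+1 -> 2+4.
  assert(PredLat[0] + SuccLat[0] == 4 && PredLat[1] + SuccLat[0] == 6);
  return 0;
}
```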