[AIE2] Enhance VCONV_FP32_BF16 instr itinerary

Xilinx · Oct 14, 2024 · 9ff7d8a · 9ff7d8a
1 parent 86a2d53
commit 9ff7d8a
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 31 deletions.
diff --git a/llvm/lib/Target/AIE/AIE2GenFixupInstrInfo.td b/llvm/lib/Target/AIE/AIE2GenFixupInstrInfo.td
@@ -958,7 +958,7 @@ let Defs = [srUPS_of] in {
 }
 
 // 7.5 vconv.fp32.bf16 – bfloat16 to float Conversion instructions
-let Itinerary = II_VCONVfp32bf16 in {
+let Itinerary = II_VCONVfp32bf16, ItineraryRegPairs = [ItinRegClassPair<II_VCONVfp32bf16_WL,[OperandRegClass<1, eWL>]>] in {
   def VCONV_FP32_BF16 :
       AIE2_mv_ups_bf_inst_mv <(outs mBMm:$dst), (ins OP_mWm_1:$src),
                             "vconv.fp32.bf16", "$dst, $src">;

diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp
@@ -1281,26 +1281,10 @@ unsigned AIE2InstrInfo::getNumBypassedCycles(const InstrItineraryData *ItinData,
                                              unsigned DefIdx,
                                              const MachineInstr &UseMI,
                                              unsigned UseIdx) const {
-  // TODO: This should be tablegen-erated. This way we also wouldn't need
-  // trickery to find the class of the MOV_Bypass
-  const unsigned MovSlotBypassClass =
-      ItinData->getForwardingClass(get(AIE2::VMOV_mv_x).getSchedClass(), 0);
-  assert(MovSlotBypassClass != 0);
-
   auto GetForwardingClass = [&](const MachineInstr &MI, unsigned OpIdx) {
-    Register Reg = MI.getOperand(OpIdx).getReg();
-    switch (MI.getOpcode()) {
-    case AIE2::VCONV_FP32_BF16:
-      assert(OpIdx < 2);
-      return Reg.isPhysical() && AIE2::eWLRegClass.contains(Reg)
-                 ? MovSlotBypassClass
-                 : 0U;
-    default: {
-      const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
-      return ItinData->getForwardingClass(
-          getSchedClass(MI.getDesc(), MI.operands(), MRI), OpIdx);
-    }
-    }
+    const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+    return ItinData->getForwardingClass(
+        getSchedClass(MI.getDesc(), MI.operands(), MRI), OpIdx);
   };
 
   // FIXME: This assumes one cycle benefit for every pipeline forwarding.

diff --git a/llvm/lib/Target/AIE/AIE2Schedule.td b/llvm/lib/Target/AIE/AIE2Schedule.td
@@ -83,13 +83,6 @@ def R_WA_PORT : FuncUnit;
 // Reading D or P
 def P_RM_PORT : FuncUnit;
 
-// Writing D or P. The ISA suggests every register file has its own writeport, which is selected
-// based on the destination register.
-// FIXME:
-// We represent the instruction as writing P_WM_PORT, but we could prepare
-// itineraries for each of the regfiles. Once we have a way of dynamically selecting an
-// itinerary we can pick the right one, reducing conflicts.
-// Alternatively, we can split the instructions, but this will bloat things tremendously.
 def P_WM_PORT : FuncUnit;
 def M_WM_PORT : FuncUnit;
 def DJ_WM_PORT : FuncUnit;
@@ -285,6 +278,7 @@ def II_VCLR : InstrItinClass;
 def II_VCLRf : InstrItinClass;
 def II_VCONV : InstrItinClass;
 def II_VCONVfp32bf16 : InstrItinClass;
+def II_VCONVfp32bf16_WL : InstrItinClass;
 def II_VEQZ : InstrItinClass;
 def II_VEXTBCST : InstrItinClass;
 def II_VEXTRACT : InstrItinClass;
@@ -969,9 +963,8 @@ InstrItinData<II_VMOV_W_WMH_WML, [EmptyCycles<1>, SimpleCycle<W_WM_PORT>],
               [2,1], [NoBypass, MOV_Bypass]>,
 InstrItinData<II_VMOV_W_WML_WML, [EmptyCycles<1>, SimpleCycle<W_WM_PORT>],
               [2,1], [MOV_Bypass, MOV_Bypass]>,
-// FIXME: Remove MOV_Bypass from II_VMOV_X once VCONV_FP32_BF16 starts using new reg base itineraries
 InstrItinData<II_VMOV_X, [SimpleCycle<CM_RM_PORT>, PrefixCycle<W_WM_PORT>, SimpleCycle<CM_WM_PORT>],
-              [2,1], [MOV_Bypass, MOV_Bypass]>,
+              [2,1], [NoBypass, NoBypass]>,
 InstrItinData<II_VMOV_X_BM_BM, [SimpleCycle<CM_RM_PORT>, SimpleCycle<CM_WM_PORT>],
               [2,1], [NoBypass, NoBypass]>,
 InstrItinData<II_VMOV_X_XM_BM, [SimpleCycle<CM_RM_PORT>, SimpleCycle<W_WM_PORT>],
@@ -1074,10 +1067,10 @@ InstrItinData<II_VFLOORs32bf16_AM, [SimpleCycle<CM_RM_PORT>, SimpleCycle<W_WA_PO
               [2,1,1,/*srF2IFlags*/2,/*crF2IMask*/1]>,
 InstrItinData<II_VFLOORs32bf16_W, [SimpleCycle<W_RS_PORT>, SimpleCycle<W_WA_PORT>],
               [2,1,1,/*srF2IFlags*/2,/*crF2IMask*/1]>,
-// Note: This is a conservative itinerary for pre-RA scheduling, as it does not
-// model the MOV slot bypass correctly. See II_VMOV_W for details.
 InstrItinData<II_VCONVfp32bf16, [EmptyCycles<1>, SimpleCycle<CM_WM_PORT>],
               [2,1]>,
+InstrItinData<II_VCONVfp32bf16_WL, [EmptyCycles<1>, SimpleCycle<CM_WM_PORT>],
+              [2,1], [NoBypass, MOV_Bypass]>, 
 InstrItinData<II_VSHUFFLE, [EmptyCycles<1>, InstrStage<1,[W_WM_PORT],0>, InstrStage<1,[CM_WM_PORT],0>],
               [2,1,1,1], [MOV_Bypass,MOV_Bypass,MOV_Bypass,NoBypass]>,
 InstrItinData<II_VSHIFT, [EmptyCycles<1>, InstrStage<1,  [W_WM_PORT]>],

diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/negative_latencies/bypass.mir b/llvm/test/CodeGen/AIE/aie2/schedule/negative_latencies/bypass.mir
@@ -97,6 +97,36 @@ body:             |
 # CHECK-SU:    SU(1): Anti Latency=0{{$}}
 # CHECK-SU: SU(1):   $x0 = VBAND $x2, $x4
 
+# There is an anti-dependency between VCONV_FP32_BF16 and VBAND.
+# VCONV_FP32_BF16 reads in E1 while VBAND writes in E2. We need to account for the
+# potential WL bypass, otherwise the edge latency would be -1 and the
+# instructions could be re-ordered. They cannot be re-ordered, otherwise
+# the bypass would be taken and VCONV_FP32_BF16 would wrongly take the new value for wl0.
+---
+name:            antidep_minus_1_plus_wl_bypass_VCONV_FP32_BF16
+alignment:       16
+body:             |
+  bb.0.entry:
+    liveins: $p0
+    ; CHECK-LABEL: name: antidep_minus_1_plus_wl_bypass_VCONV_FP32_BF16
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $bmh0 = VCONV_FP32_BF16 killed $wl0
+    ; CHECK-NEXT: $x0 = VBAND killed $x2, killed $x4
+    ; CHECK-NEXT: NOP
+    ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh0, killed $p0, 0
+    ; CHECK-NEXT: NOP
+    $bmh0 = VCONV_FP32_BF16 $wl0
+    $x0 = VBAND $x2, $x4
+    VST_dmw_sts_w_ag_idx_imm $wh0, $p0, 0
+...
+# CHECK-SU-LABEL: antidep_minus_1_plus_wl_bypass_VCONV_FP32_BF16:%bb.0 entry
+# CHECK-SU: SU(0):   $bmh0 = VCONV_FP32_BF16 $wl0
+# CHECK-SU:  Successors:
+# CHECK-SU:    SU(1): Anti Latency=0{{$}}
+# CHECK-SU: SU(1):   $x0 = VBAND $x2, $x4
+---
+
 # There is an anti-dependency between VMOV and VBAND.
 # VMOV reads in E1 while VBAND writes in E2. There is no potential bypass in use
 # because the MOV slot bypass does not work for wh registers.