Skip to content

Commit

Permalink
[AIE2] Enhance VCONV_FP32_BF16 instr itinerary
Browse files Browse the repository at this point in the history
  • Loading branch information
krishnamtibrewala committed Oct 14, 2024
1 parent 86a2d53 commit 9ff7d8a
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 31 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/AIE/AIE2GenFixupInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,7 @@ let Defs = [srUPS_of] in {
}

// 7.5 vconv.fp32.bf16 – bfloat16 to float Conversion instructions
let Itinerary = II_VCONVfp32bf16 in {
let Itinerary = II_VCONVfp32bf16, ItineraryRegPairs = [ItinRegClassPair<II_VCONVfp32bf16_WL,[OperandRegClass<1, eWL>]>] in {
def VCONV_FP32_BF16 :
AIE2_mv_ups_bf_inst_mv <(outs mBMm:$dst), (ins OP_mWm_1:$src),
"vconv.fp32.bf16", "$dst, $src">;
Expand Down
22 changes: 3 additions & 19 deletions llvm/lib/Target/AIE/AIE2InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1281,26 +1281,10 @@ unsigned AIE2InstrInfo::getNumBypassedCycles(const InstrItineraryData *ItinData,
unsigned DefIdx,
const MachineInstr &UseMI,
unsigned UseIdx) const {
// TODO: This should be tablegen-erated. This way we also wouldn't need
// trickery to find the class of the MOV_Bypass
const unsigned MovSlotBypassClass =
ItinData->getForwardingClass(get(AIE2::VMOV_mv_x).getSchedClass(), 0);
assert(MovSlotBypassClass != 0);

auto GetForwardingClass = [&](const MachineInstr &MI, unsigned OpIdx) {
Register Reg = MI.getOperand(OpIdx).getReg();
switch (MI.getOpcode()) {
case AIE2::VCONV_FP32_BF16:
assert(OpIdx < 2);
return Reg.isPhysical() && AIE2::eWLRegClass.contains(Reg)
? MovSlotBypassClass
: 0U;
default: {
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
return ItinData->getForwardingClass(
getSchedClass(MI.getDesc(), MI.operands(), MRI), OpIdx);
}
}
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
return ItinData->getForwardingClass(
getSchedClass(MI.getDesc(), MI.operands(), MRI), OpIdx);
};

// FIXME: This assumes one cycle benefit for every pipeline forwarding.
Expand Down
15 changes: 4 additions & 11 deletions llvm/lib/Target/AIE/AIE2Schedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,6 @@ def R_WA_PORT : FuncUnit;
// Reading D or P
def P_RM_PORT : FuncUnit;

// Writing D or P. The ISA suggests every register file has its own writeport, which is selected
// based on the destination register.
// FIXME:
// We represent the instruction as writing P_WM_PORT, but we could prepare
// itineraries for each of the regfiles. Once we have a way of dynamically selecting an
// itinerary we can pick the right one, reducing conflicts.
// Alternatively, we can split the instructions, but this will bloat things tremendously.
def P_WM_PORT : FuncUnit;
def M_WM_PORT : FuncUnit;
def DJ_WM_PORT : FuncUnit;
Expand Down Expand Up @@ -285,6 +278,7 @@ def II_VCLR : InstrItinClass;
def II_VCLRf : InstrItinClass;
def II_VCONV : InstrItinClass;
def II_VCONVfp32bf16 : InstrItinClass;
def II_VCONVfp32bf16_WL : InstrItinClass;
def II_VEQZ : InstrItinClass;
def II_VEXTBCST : InstrItinClass;
def II_VEXTRACT : InstrItinClass;
Expand Down Expand Up @@ -969,9 +963,8 @@ InstrItinData<II_VMOV_W_WMH_WML, [EmptyCycles<1>, SimpleCycle<W_WM_PORT>],
[2,1], [NoBypass, MOV_Bypass]>,
InstrItinData<II_VMOV_W_WML_WML, [EmptyCycles<1>, SimpleCycle<W_WM_PORT>],
[2,1], [MOV_Bypass, MOV_Bypass]>,
// FIXME: Remove MOV_Bypass from II_VMOV_X once VCONV_FP32_BF16 starts using new reg base itineraries
InstrItinData<II_VMOV_X, [SimpleCycle<CM_RM_PORT>, PrefixCycle<W_WM_PORT>, SimpleCycle<CM_WM_PORT>],
[2,1], [MOV_Bypass, MOV_Bypass]>,
[2,1], [NoBypass, NoBypass]>,
InstrItinData<II_VMOV_X_BM_BM, [SimpleCycle<CM_RM_PORT>, SimpleCycle<CM_WM_PORT>],
[2,1], [NoBypass, NoBypass]>,
InstrItinData<II_VMOV_X_XM_BM, [SimpleCycle<CM_RM_PORT>, SimpleCycle<W_WM_PORT>],
Expand Down Expand Up @@ -1074,10 +1067,10 @@ InstrItinData<II_VFLOORs32bf16_AM, [SimpleCycle<CM_RM_PORT>, SimpleCycle<W_WA_PO
[2,1,1,/*srF2IFlags*/2,/*crF2IMask*/1]>,
InstrItinData<II_VFLOORs32bf16_W, [SimpleCycle<W_RS_PORT>, SimpleCycle<W_WA_PORT>],
[2,1,1,/*srF2IFlags*/2,/*crF2IMask*/1]>,
// Note: This is a conservative itinerary for pre-RA scheduling, as it does not
// model the MOV slot bypass correctly. See II_VMOV_W for details.
InstrItinData<II_VCONVfp32bf16, [EmptyCycles<1>, SimpleCycle<CM_WM_PORT>],
[2,1]>,
InstrItinData<II_VCONVfp32bf16_WL, [EmptyCycles<1>, SimpleCycle<CM_WM_PORT>],
[2,1], [NoBypass, MOV_Bypass]>,
InstrItinData<II_VSHUFFLE, [EmptyCycles<1>, InstrStage<1,[W_WM_PORT],0>, InstrStage<1,[CM_WM_PORT],0>],
[2,1,1,1], [MOV_Bypass,MOV_Bypass,MOV_Bypass,NoBypass]>,
InstrItinData<II_VSHIFT, [EmptyCycles<1>, InstrStage<1, [W_WM_PORT]>],
Expand Down
30 changes: 30 additions & 0 deletions llvm/test/CodeGen/AIE/aie2/schedule/negative_latencies/bypass.mir
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,36 @@ body: |
# CHECK-SU: SU(1): Anti Latency=0{{$}}
# CHECK-SU: SU(1): $x0 = VBAND $x2, $x4

# There is an anti-dependency between VCONV_FP32_BF16 and VBAND.
# VCONV_FP32_BF16 reads in E1 while VBAND writes in E2. We need to account for the
# potential WL bypass, otherwise the edge latency would be -1 and the
# instructions could be re-ordered. They cannot be re-ordered, otherwise
# the bypass would be taken and VCONV_FP32_BF16 would wrongly take the new
# value for wl0.
---
name: antidep_minus_1_plus_wl_bypass_VCONV_FP32_BF16
alignment: 16
body: |
bb.0.entry:
liveins: $p0
; CHECK-LABEL: name: antidep_minus_1_plus_wl_bypass_VCONV_FP32_BF16
; CHECK: liveins: $p0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $bmh0 = VCONV_FP32_BF16 killed $wl0
; CHECK-NEXT: $x0 = VBAND killed $x2, killed $x4
; CHECK-NEXT: NOP
; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm killed $wh0, killed $p0, 0
; CHECK-NEXT: NOP
$bmh0 = VCONV_FP32_BF16 $wl0
$x0 = VBAND $x2, $x4
VST_dmw_sts_w_ag_idx_imm $wh0, $p0, 0
...
# CHECK-SU-LABEL: antidep_minus_1_plus_wl_bypass_VCONV_FP32_BF16:%bb.0 entry
# CHECK-SU: SU(0): $bmh0 = VCONV_FP32_BF16 $wl0
# CHECK-SU: Successors:
# CHECK-SU: SU(1): Anti Latency=0{{$}}
# CHECK-SU: SU(1): $x0 = VBAND $x2, $x4
---

# There is an anti-dependency between VMOV and VBAND.
# VMOV reads in E1 while VBAND writes in E2. There is no potential bypass in use
# because the MOV slot bypass does not work for wh registers.
Expand Down

0 comments on commit 9ff7d8a

Please sign in to comment.