Skip to content

Commit

Permalink
[AIEX] Propagate MMO for loads without this information
Browse files Browse the repository at this point in the history
This can prevent MachinePipeliner from considering some loads
as barriers (isDependenceBarrier).
  • Loading branch information
andcarminati committed Oct 21, 2024
1 parent 800ab8d commit 093ac91
Show file tree
Hide file tree
Showing 5 changed files with 334 additions and 0 deletions.
20 changes: 20 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1403,3 +1403,23 @@ AIE2InstrInfo::getVExtractOpInfo(const MachineInstr &MI) const {
return std::nullopt;
}
}

using VecOpAbstraction = AIEBaseInstrInfo::AbstractVecOp;
/// Map a handful of AIE2 vector opcodes onto their abstract counterparts.
/// Operand positions follow the instruction definitions: operand 0 is the
/// destination, operands 1.. are the sources. Returns std::nullopt for any
/// opcode that has no abstraction.
std::optional<const VecOpAbstraction>
AIE2InstrInfo::parseTargetVectorOp(const MachineInstr &MI) const {

  switch (MI.getOpcode()) {
  case AIE2::VADD_32:
    // Two vector sources, no scalar source.
    return VecOpAbstraction{VecOpAbstraction::AbstractOpcode::ADD,
                            MI.getOperand(1).getReg(),
                            MI.getOperand(2).getReg(), /*ScalarSrc=*/0};
  case AIE2::VSEL_32:
    // Two vector sources selected by a scalar mask.
    return VecOpAbstraction{VecOpAbstraction::AbstractOpcode::SELECT,
                            MI.getOperand(1).getReg(),
                            MI.getOperand(2).getReg(),
                            MI.getOperand(3).getReg()};
  case AIE2::VBCST_32:
    // One scalar source broadcast into a vector; no vector sources.
    return VecOpAbstraction{VecOpAbstraction::AbstractOpcode::BROADCAST,
                            /*VectorSrc1=*/0, /*VectorSrc2=*/0,
                            MI.getOperand(1).getReg()};
  }
  return std::nullopt;
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ class AIE2InstrInfo : public AIE2GenInstrInfo {
std::optional<const VExtractOpInfo>
getVExtractOpInfo(const MachineInstr &MI) const override;

/// See AIEBaseInstrInfo::parseTargetVectorOp.
std::optional<const AbstractVecOp>
parseTargetVectorOp(const MachineInstr &MI) const override;

protected:
SmallVector<AIEPseudoExpandInfo, 4>
getSpillPseudoExpandInfo(const MachineInstr &MI) const override;
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/AIE/AIEBaseInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,21 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
llvm_unreachable("Target didn't implement getVExtractOpInfo!");
}

/// Abstract vector operation to help the decoding of complex operations.
struct AbstractVecOp {
  /// Kind of operation represented by this abstraction.
  enum class AbstractOpcode : unsigned { ADD, SELECT, BROADCAST };
  AbstractOpcode Opcode;
  // Vector source registers; 0 when the operation has no such source
  // (e.g. BROADCAST has no vector sources).
  Register VectorSrc1;
  Register VectorSrc2;
  // Scalar source register (e.g. the broadcast value or the SELECT mask);
  // 0 when the operation has no scalar source.
  Register ScalarSrc;
};

/// Retrieve an abstract representation of \p MI, or std::nullopt when the
/// target does not recognize the instruction as one of the supported
/// vector operations. The default implementation recognizes nothing.
virtual std::optional<const AbstractVecOp>
parseTargetVectorOp(const MachineInstr &MI) const {
  return {};
}

protected:
/// Expand a spill pseudo-instruction into actual target instructions. This
/// will essentially split the register being handled into its sub-registers,
Expand Down
173 changes: 173 additions & 0 deletions llvm/lib/Target/AIE/AIEPostSelectOptimize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include <optional>

using namespace llvm;

Expand All @@ -60,6 +61,17 @@ static cl::opt<bool>
EnablePostSelectOptimize("aie-post-select-opt", cl::Hidden, cl::init(true),
cl::desc("Enable post select optimize."));

static cl::opt<bool>
LoadMMODeepSearch("aie-post-load-mmo-deep-search", cl::Hidden,
cl::init(false),
cl::desc("Enable deep search for load's missing MMOs"));

static cl::opt<unsigned>
LoadMMOSearchLimit("aie-post-load-mmo-search-limit", cl::Hidden,
cl::init(100),
cl::desc("Search limit for load's missing MMOs (number "
"of visited instructions)."));

namespace {

/// Information about a COPY that can be tracked by PhysRegCopyTracker.
Expand Down Expand Up @@ -438,6 +450,161 @@ bool duplicateAdressingRegs(MachineBasicBlock &MBB, MachineRegisterInfo &MRI) {
return tryToDuplicateLoadUse(LoadUses, NonLoadUses, MRI, TII);
}

/// This optimization tries to propagate GlobalValues as MMO of load operations
/// when they are missing.
bool findGlobalValues(const MachineInstr *MI,
SmallPtrSet<const Value *, 4> &GVSet,
SmallPtrSet<const MachineInstr *, 4> &VisitedInstrs,
MachineRegisterInfo &MRI, unsigned SearchLimit) {

// Exhausted the search.
if (VisitedInstrs.size() == SearchLimit)
return false;

// Skip copies.
while (MI->isCopy() && MI->getOperand(1).getReg().isVirtual())
MI = MRI.getVRegDef(MI->getOperand(1).getReg());

// Loaded values cannot be tracked as Globals, but they can
// be part of a final address calculation.
if (MI->mayLoad())
return true;

if (VisitedInstrs.find(MI) != VisitedInstrs.end())
return true;

VisitedInstrs.insert(MI);
bool Success = true;
for (auto MO : MI->uses()) {
if (MO.isGlobal()) {
GVSet.insert(MO.getGlobal());
} else if (MO.isReg() && MO.getReg().isVirtual()) {
if (MO.getReg().isVirtual()) {
const Register Reg = MO.getReg();
if (const MachineInstr *RegDef = MRI.getVRegDef(Reg))
Success &=
findGlobalValues(RegDef, GVSet, VisitedInstrs, MRI, SearchLimit);
} else {
// Physical registers can be anything (parameters).
return false;
}
}
}
return Success;
}

using Opcodes = AIEBaseInstrInfo::AbstractVecOp::AbstractOpcode;
using AbstractVecOp = AIEBaseInstrInfo::AbstractVecOp;
using OptionalVecOp = std::optional<AbstractVecOp>;

/// Pattern-match a specific vector address computation and collect the
/// GlobalValues feeding it, so they can later be attached as MMOs to a load.
/// Only the following pattern is recognized:
/// VADD (VSEL (VBCST(GlobalValue1), VBCST(GlobalValue2), Mask) Value)
/// Returns true only when both broadcasted scalars resolve to GlobalValues.
bool findBroadcastedGlobalValues(const MachineInstr *MI,
                                 SmallPtrSet<const Value *, 4> &GVSet,
                                 MachineRegisterInfo &MRI) {

  const TargetSubtargetInfo &ST = MI->getParent()->getParent()->getSubtarget();
  const AIEBaseInstrInfo *TII =
      static_cast<const AIEBaseInstrInfo *>(ST.getInstrInfo());

  // Skip subvector copy.
  if (MI->isCopy()) {
    Register Src = MI->getOperand(1).getReg();
    if (Src.isVirtual())
      MI = MRI.getVRegDef(Src);
    else
      // We cannot proceed with physical registers.
      return false;
  }

  // Parse the defining instruction of Reg, accepting it only when it
  // matches the expected abstract opcode.
  auto ParseByReg = [&](Register Reg, Opcodes Opcode) -> OptionalVecOp {
    OptionalVecOp VOp = TII->parseTargetVectorOp(*MRI.getVRegDef(Reg));
    if (VOp && VOp->Opcode == Opcode)
      return VOp;
    return {};
  };

  const OptionalVecOp VAddOp = TII->parseTargetVectorOp(*MI);

  // We start with an ADD.
  if (!VAddOp || VAddOp->Opcode != Opcodes::ADD)
    return false;

  // One of the ADD operands must be a SELECT.
  AbstractVecOp VSelect;
  if (OptionalVecOp Op = ParseByReg(VAddOp->VectorSrc1, Opcodes::SELECT))
    VSelect = *Op;
  else if (OptionalVecOp Op = ParseByReg(VAddOp->VectorSrc2, Opcodes::SELECT))
    VSelect = *Op;
  else
    return false;

  // Both SELECT operands must be broadcasts.
  const OptionalVecOp VBcast1 =
      ParseByReg(VSelect.VectorSrc1, Opcodes::BROADCAST);
  const OptionalVecOp VBcast2 =
      ParseByReg(VSelect.VectorSrc2, Opcodes::BROADCAST);

  if (!VBcast1 || !VBcast2)
    return false;

  // The search must end in exactly one step after each broadcast: the
  // broadcasted scalar is expected to be (a copy of) a GlobalValue address.
  // The search limits (1, then VisitedInstrs.size() + 1) allow one newly
  // visited instruction per call, since VisitedInstrs is shared.
  SmallPtrSet<const MachineInstr *, 4> VisitedInstrs;
  return findGlobalValues(MRI.getVRegDef(VBcast1->ScalarSrc), GVSet,
                          VisitedInstrs, MRI, 1) &&
         findGlobalValues(MRI.getVRegDef(VBcast2->ScalarSrc), GVSet,
                          VisitedInstrs, MRI, VisitedInstrs.size() + 1);
}

/// Attach MMOs to loads in \p MBB that carry none, by tracing their register
/// uses back to GlobalValues.
///
/// A load without any memory operand can be treated as a dependence barrier
/// (see isDependenceBarrier in MachinePipeliner); giving it a conservative
/// MMO (base pointer only, no size/alignment/offset) avoids that.
///
/// Returns true if any MMO was added.
bool fixLoadMemOpInfo(MachineFunction &MF, MachineBasicBlock &MBB,
                      MachineRegisterInfo &MRI) {

  bool Changed = false;

  for (MachineInstr &MI : MBB) {
    // Only loads that are missing memory operand information need fixing.
    if (!MI.mayLoad() || !MI.memoperands_empty())
      continue;

    bool Success = true;
    SmallPtrSet<const Value *, 4> GVSet;
    SmallPtrSet<const MachineInstr *, 4> VisitedInstrs;

    // As we don't know which operand is the pointer, we need
    // to iterate over all uses.
    for (const MachineOperand &MO : MI.uses()) {
      if (MO.isReg() && MO.getReg().isVirtual()) {
        if (const MachineInstr *RegDef = MRI.getVRegDef(MO.getReg())) {
          if (LoadMMODeepSearch)
            // Follow every def chain, bounded only by the search limit.
            Success &= findGlobalValues(RegDef, GVSet, VisitedInstrs, MRI,
                                        LoadMMOSearchLimit);
          else
            // Consider only broadcasted cases.
            Success &= findBroadcastedGlobalValues(RegDef, GVSet, MRI);
        }
      }
    }

    // Only annotate when the address computation was fully understood; a
    // partial set of globals would hand AA wrong aliasing information.
    if (Success) {
      // Add gathered GlobalValues as MMOs.
      for (const Value *GV : GVSet) {
        // As we only know the base pointer and nothing about the
        // result of the address calculation, we simply don't assume
        // size/alignment/offset, so we prevent AA from inferring wrong
        // information.
        MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
            MachinePointerInfo(GV), MachineMemOperand::MOLoad, LLT(), Align());
        MI.addMemOperand(MF, PtrLoadMMO);
        Changed = true;
      }
    }
  }
  return Changed;
}

bool AIEPostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "\n******* POST I-SEL OPTIMIZATION PASS *******\n"
<< "********** Function: " << MF.getName() << '\n');
Expand Down Expand Up @@ -472,6 +639,12 @@ bool AIEPostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
}
}

// 4. Fix MMOs for load instructions that don't take their address from
// pointer registers (they use vector registers instead, for example).
for (MachineBasicBlock &MBB : MF) {
Changed |= fixLoadMemOpInfo(MF, MBB, MF.getRegInfo());
}

return Changed;
}

Expand Down
123 changes: 123 additions & 0 deletions llvm/test/CodeGen/AIE/aie2/GlobalISel/propagate-mmo-noptr.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

# RUN: llc -mtriple aie2 -run-pass=aie-post-select-optimize %s -o - | FileCheck %s
# RUN: llc -mtriple aie2 -run-pass=aie-post-select-optimize \
# RUN: -aie-post-load-mmo-deep-search=true %s -o - | FileCheck %s

--- |
target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
target triple = "aie2"

@softmax_ilut_ab = dso_local global [1 x i16] [i16 16256], align 32
@softmax_ilut_cd = dso_local global [1 x i16] [i16 16256], align 32
@softmax_flut_ab = dso_local global [1 x i16] [i16 16256], align 32
@softmax_flut_cd = dso_local global [1 x i16] [i16 16256], align 32

define void @test_4x16_load() { ret void }
...
---
name: test_4x16_load
alignment: 16
legalized: true
regBankSelected: true
selected: true
tracksRegLiveness: true
body: |
bb.0:
liveins: $p0, $wl0
; CHECK-LABEL: name: test_4x16_load
; CHECK: liveins: $p0, $wl0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec256 = COPY $wl0
; CHECK-NEXT: [[MOVXM_lng_cg:%[0-9]+]]:ep_as_32bit = MOVXM_lng_cg target-flags(aie2-global) @softmax_ilut_ab
; CHECK-NEXT: [[COPY2:%[0-9]+]]:er = COPY [[MOVXM_lng_cg]]
; CHECK-NEXT: [[MOVXM_lng_cg1:%[0-9]+]]:ep_as_32bit = MOVXM_lng_cg target-flags(aie2-global) @softmax_ilut_cd
; CHECK-NEXT: [[COPY3:%[0-9]+]]:er = COPY [[MOVXM_lng_cg1]]
; CHECK-NEXT: [[MOVXM_lng_cg2:%[0-9]+]]:ep_as_32bit = MOVXM_lng_cg target-flags(aie2-global) @softmax_flut_ab
; CHECK-NEXT: [[COPY4:%[0-9]+]]:er = COPY [[MOVXM_lng_cg2]]
; CHECK-NEXT: [[MOVXM_lng_cg3:%[0-9]+]]:ep_as_32bit = MOVXM_lng_cg target-flags(aie2-global) @softmax_flut_cd
; CHECK-NEXT: [[COPY5:%[0-9]+]]:er = COPY [[MOVXM_lng_cg3]]
; CHECK-NEXT: [[COPY6:%[0-9]+]]:mdm = COPY [[COPY2]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:er = COPY [[COPY6]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:mdm = COPY [[COPY3]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:er = COPY [[COPY8]]
; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:ers4 = MOV_RLC_imm10_pseudo 0
; CHECK-NEXT: [[MOV_RLC_imm10_pseudo1:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 2
; CHECK-NEXT: [[MOV_RLC_imm10_pseudo2:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 6
; CHECK-NEXT: [[COPY10:%[0-9]+]]:mss = COPY [[COPY]]
; CHECK-NEXT: [[VFLOOR_S32_BF16_mFl2FxSrc_W:%[0-9]+]]:vec512 = VFLOOR_S32_BF16_mFl2FxSrc_W [[COPY1]], [[COPY10]], implicit-def dead $srf2iflags, implicit $crf2imask
; CHECK-NEXT: [[VSHUFFLE:%[0-9]+]]:vec512 = VSHUFFLE [[VFLOOR_S32_BF16_mFl2FxSrc_W]], [[VFLOOR_S32_BF16_mFl2FxSrc_W]], [[MOV_RLC_imm10_pseudo1]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:ewl = COPY [[VSHUFFLE]].sub_256_lo
; CHECK-NEXT: [[COPY12:%[0-9]+]]:vec256 = COPY [[COPY11]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:mss = COPY [[MOV_RLC_imm10_pseudo]]
; CHECK-NEXT: [[VUPS_S64_D16_mv_ups_w2c:%[0-9]+]]:acc1024 = VUPS_S64_D16_mv_ups_w2c [[COPY12]], [[COPY13]], implicit-def dead $srups_of, implicit $crsat, implicit $crupssign
; CHECK-NEXT: [[COPY14:%[0-9]+]]:mss = COPY [[MOV_RLC_imm10_pseudo2]]
; CHECK-NEXT: [[VSRS_S32_S64_mv_x_srs:%[0-9]+]]:vec512 = VSRS_S32_S64_mv_x_srs [[VUPS_S64_D16_mv_ups_w2c]], [[COPY14]], implicit-def dead $srsrs_of, implicit $crsat, implicit $crrnd
; CHECK-NEXT: [[VBCST_32_:%[0-9]+]]:vec512 = VBCST_32 [[COPY7]]
; CHECK-NEXT: [[VBCST_32_1:%[0-9]+]]:vec512 = VBCST_32 [[COPY9]]
; CHECK-NEXT: [[MOVXM_lng_cg4:%[0-9]+]]:ers8 = MOVXM_lng_cg 52428
; CHECK-NEXT: [[VSEL_32_:%[0-9]+]]:vec512 = VSEL_32 [[VBCST_32_]], [[VBCST_32_1]], [[MOVXM_lng_cg4]]
; CHECK-NEXT: [[VADD_32_:%[0-9]+]]:vec512 = VADD_32 [[VSEL_32_]], [[VSRS_S32_S64_mv_x_srs]]
; CHECK-NEXT: [[COPY15:%[0-9]+]]:ewl = COPY [[VADD_32_]].sub_256_lo
; CHECK-NEXT: [[COPY16:%[0-9]+]]:ewh = COPY [[VADD_32_]].sub_256_hi
; CHECK-NEXT: [[VLDB_4x16_LO:%[0-9]+]]:vec256 = VLDB_4x16_LO [[COPY15]] :: (load unknown-size from @softmax_ilut_ab, align 1), (load unknown-size from @softmax_ilut_cd, align 1)
; CHECK-NEXT: [[VLDB_4x16_HI:%[0-9]+]]:vec256 = VLDB_4x16_HI [[COPY15]] :: (load unknown-size from @softmax_ilut_ab, align 1), (load unknown-size from @softmax_ilut_cd, align 1)
; CHECK-NEXT: [[VLDB_4x16_LO1:%[0-9]+]]:vec256 = VLDB_4x16_LO [[COPY16]] :: (load unknown-size from @softmax_ilut_ab, align 1), (load unknown-size from @softmax_ilut_cd, align 1)
; CHECK-NEXT: [[VLDB_4x16_HI1:%[0-9]+]]:vec256 = VLDB_4x16_HI [[COPY16]] :: (load unknown-size from @softmax_ilut_ab, align 1), (load unknown-size from @softmax_ilut_cd, align 1)
; CHECK-NEXT: $wl0 = COPY [[VLDB_4x16_LO]]
; CHECK-NEXT: $wl1 = COPY [[VLDB_4x16_HI]]
; CHECK-NEXT: $wl2 = COPY [[VLDB_4x16_LO1]]
; CHECK-NEXT: $wl3 = COPY [[VLDB_4x16_HI1]]
; CHECK-NEXT: PseudoRET implicit $lr
%0:ep = COPY $p0
%121:vec256 = COPY $wl0
%97:ep_as_32bit = MOVXM_lng_cg target-flags(aie2-global) @softmax_ilut_ab
%96:er = COPY %97
%100:ep_as_32bit = MOVXM_lng_cg target-flags(aie2-global) @softmax_ilut_cd
%99:er = COPY %100
%103:ep_as_32bit = MOVXM_lng_cg target-flags(aie2-global) @softmax_flut_ab
%102:er = COPY %103
%106:ep_as_32bit = MOVXM_lng_cg target-flags(aie2-global) @softmax_flut_cd
%105:er = COPY %106
%284:mdm = COPY %96
%98:er = COPY %284
%285:mdm = COPY %99
%101:er = COPY %285
%95:ers4 = MOV_RLC_imm10_pseudo 0
%127:er = MOV_RLC_imm10_pseudo 2
%134:er = MOV_RLC_imm10_pseudo 6
%297:mss = COPY %0
%122:vec512 = VFLOOR_S32_BF16_mFl2FxSrc_W %121, %297, implicit-def dead $srf2iflags, implicit $crf2imask
%126:vec512 = VSHUFFLE %122, %122, %127
%128:ewl = COPY %126.sub_256_lo
%129:vec256 = COPY %128
%296:mss = COPY %95
%132:acc1024 = VUPS_S64_D16_mv_ups_w2c %129, %296, implicit-def dead $srups_of, implicit $crsat, implicit $crupssign
%295:mss = COPY %134
%133:vec512 = VSRS_S32_S64_mv_x_srs %132, %295, implicit-def dead $srsrs_of, implicit $crsat, implicit $crrnd
%136:vec512 = VBCST_32 %98
%137:vec512 = VBCST_32 %101
%139:ers8 = MOVXM_lng_cg 52428
%138:vec512 = VSEL_32 %136, %137, %139
%140:vec512 = VADD_32 %138, %133
%141:ewl = COPY %140.sub_256_lo
%147:ewh = COPY %140.sub_256_hi
%142:vec256 = VLDB_4x16_LO %141
%145:vec256 = VLDB_4x16_HI %141
%148:vec256 = VLDB_4x16_LO %147
%150:vec256 = VLDB_4x16_HI %147
$wl0 = COPY %142
$wl1 = COPY %145
$wl2 = COPY %148
$wl3 = COPY %150
PseudoRET implicit $lr
...

0 comments on commit 093ac91

Please sign in to comment.