[AIE2] Add more memory/ptr combiners #214

Merged: 1 commit, Oct 22, 2024
39 changes: 39 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrInfo.cpp
@@ -1406,3 +1406,42 @@ AIE2InstrInfo::getVExtractOpInfo(const MachineInstr &MI) const {
return std::nullopt;
}
}

unsigned AIE2InstrInfo::getMaxLoadStoreSize() const { return 256; }

bool AIE2InstrInfo::canCombineWithLoadStore(const MachineInstr &MI) const {

Collaborator: Having some look-ahead in early combiners seems like a good idea indeed!

if (!isa<GIntrinsic>(MI))
return false;

const unsigned ID = cast<GIntrinsic>(MI).getIntrinsicID();

switch (ID) {
case Intrinsic::aie2_I256_v16_acc32_srs:
case Intrinsic::aie2_I256_v16_acc64_srs:
case Intrinsic::aie2_I256_v32_acc32_srs:
case Intrinsic::aie2_I256_v8_acc64_srs:
case Intrinsic::aie2_I512_v16_acc64_srs:
case Intrinsic::aie2_I512_v32_acc32_srs:

case Intrinsic::aie2_acc32_v16_I256_ups:
case Intrinsic::aie2_acc32_v32_I256_ups:
case Intrinsic::aie2_acc32_v32_I512_ups:
case Intrinsic::aie2_acc64_v16_I256_ups:
case Intrinsic::aie2_acc64_v16_I512_ups:
case Intrinsic::aie2_acc64_v8_I256_ups:
return true;
}
return false;
}

bool AIE2InstrInfo::isProfitableToSplitType(const LLT Ty) const {
const LLT V16S32 = LLT::fixed_vector(16, 32);
const LLT V32S16 = LLT::fixed_vector(32, 16);
const LLT V64S8 = LLT::fixed_vector(64, 8);

if (Ty == V16S32 || Ty == V32S16 || Ty == V64S8)
return true;

Collaborator: Nit: Could this just be return Ty.isVector() && Ty.getSize() == 512;?

Collaborator Author: The reason is that it is not a good idea to break accumulators, because we will not combine them afterwards. My first attempt went in this direction and I saw regressions. (A sketch of the suggested alternative follows after this function.)

return false;
}
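
For reference, a minimal sketch of the reviewer's suggested size-only check, not what was merged; the helper name is hypothetical and the nit's getSize() is written here as getSizeInBits(), which is how LLT spells the query. Per the author's reply above, this direction was tried first but broke accumulator values and caused regressions.

// Sketch only: the size-only profitability check suggested in the nit above.
static bool isProfitableToSplitTypeSizeOnly(const LLT Ty) {
  return Ty.isVector() && Ty.getSizeInBits() == 512;
}
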
6 changes: 6 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrInfo.h
@@ -179,6 +179,12 @@ class AIE2InstrInfo : public AIE2GenInstrInfo {
std::optional<const VExtractOpInfo>
getVExtractOpInfo(const MachineInstr &MI) const override;

unsigned getMaxLoadStoreSize() const override;

bool canCombineWithLoadStore(const MachineInstr &MI) const override;

bool isProfitableToSplitType(const LLT Ty) const override;

protected:
SmallVector<AIEPseudoExpandInfo, 4>
getSpillPseudoExpandInfo(const MachineInstr &MI) const override;
17 changes: 17 additions & 0 deletions llvm/lib/Target/AIE/AIEBaseInstrInfo.h
@@ -424,6 +424,23 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
llvm_unreachable("Target didn't implement getVExtractOpInfo!");
}

/// Return the maximum size (in bits) of memory operations on this target.
virtual unsigned getMaxLoadStoreSize() const {
llvm_unreachable("Target didn't implement getMaxLoadStoreSize!");
}

/// Return true if this instruction can be combined with a memory operation.
virtual bool canCombineWithLoadStore(const MachineInstr &MI) const {
llvm_unreachable("Target didn't implement canCombineWithLoadStore!");
}

/// Return true if the type can be split to fit the target's restrictions.
/// For example, by splitting such types in advance, it is possible to
/// reach more combiners during selection.
virtual bool isProfitableToSplitType(const LLT Ty) const {
llvm_unreachable("Target didn't implement isProfitableToSplitType!");
}

protected:
/// Expand a spill pseudo-instruction into actual target instructions. This
/// will essentially split the register being handled into its sub-registers,
30 changes: 29 additions & 1 deletion llvm/lib/Target/AIE/AIECombine.td
@@ -109,8 +109,36 @@ def combine_add_vector_elt_undef : GICombineRule <
(apply [{ applyAddVecEltUndef(*${root}, MRI, B); }] )
>;

def combine_load_store_split_matchdata: GIDefMatchData<"unsigned">;
def combine_load_store_split : GICombineRule<
(defs root:$root, combine_load_store_split_matchdata:$matchinfo),
(match (wip_match_opcode G_LOAD, G_STORE): $root,
[{ return matchLoadStoreSplit(cast<GLoadStore>(*${root}), MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyLoadStoreSplit(cast<GLoadStore>(*${root}), MRI, B, ${matchinfo}); }])
>;

def combine_offset_load_store_ptradd_matchdata: GIDefMatchData<"std::pair<Register, int64_t>">;
def combine_offset_load_store_ptradd : GICombineRule<
(defs root:$root, combine_offset_load_store_ptradd_matchdata:$matchinfo),
(match (wip_match_opcode G_AIE_OFFSET_LOAD, G_AIE_OFFSET_STORE): $root,
[{ return matchOffsetLoadStorePtrAdd(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyOffsetLoadStorePtrAdd(*${root}, MRI, B, ${matchinfo}); }])
>;

def combine_offset_load_store_share_ptradd_matchdata: GIDefMatchData<"Register">;
def combine_offset_load_store_share_ptradd : GICombineRule<
(defs root:$root, combine_offset_load_store_share_ptradd_matchdata:$matchinfo),
(match (wip_match_opcode G_AIE_OFFSET_LOAD, G_AIE_OFFSET_STORE): $root,
[{ return matchOffsetLoadStoreSharePtrAdd(*${root}, MRI, Helper, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyOffsetLoadStoreSharePtrAdd(*${root}, MRI, B, ${matchinfo}); }])
>;

def AIE2PostLegalizerCustomCombiner
: GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_increment,
: GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_split,
ptr_add_immed_chain,
combine_load_store_increment,
combine_offset_load_store_ptradd,
combine_offset_load_store_share_ptradd,
combine_add_vector_elt_undef,
combine_extract_concat,
combine_unmerge_concat,
192 changes: 192 additions & 0 deletions llvm/lib/Target/AIE/AIECombinerHelper.cpp
@@ -1358,3 +1358,195 @@ void llvm::applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI,

MI.eraseFromParent();
}

bool llvm::matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
unsigned &MaxMemSize) {

const Register ValReg = MI.getReg(0);
const LLT ValTy = MRI.getType(ValReg);
const bool IsLoad = isa<GLoad>(MI);
MaxMemSize = TII.getMaxLoadStoreSize();

Collaborator: Is there a specific reason we are passing and setting MaxMemSize in match and using it in apply? We could have used TII.getMaxLoadStoreSize() directly in apply?

Collaborator Author: It could be. What I had in mind was a target-specific match and a completely target-independent apply. It gives the freedom to apply different splitting strategies for future targets, considering different selection-combine opportunities. (A sketch of such an override follows after this function.)


if (!TII.isProfitableToSplitType(ValTy))
return false;

/// Avoid splitting operations that can be combined `as is`.
if (IsLoad) {
for (MachineInstr &ConvInstr : MRI.use_instructions(ValReg)) {
if (TII.canCombineWithLoadStore(ConvInstr))
return false;
}
} else {
MachineInstr &ConvInstr = *getDefIgnoringCopiesAndBitcasts(ValReg, MRI);
if (TII.canCombineWithLoadStore(ConvInstr))
return false;
}

return true;
}
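
As mentioned in the review thread above, a minimal sketch of how a future target could reuse the target-independent apply below and only retune the match side through its hooks; the class name AIEFutureInstrInfo and the 512-bit width are purely illustrative assumptions, not part of this patch.

// Hypothetical future target: the generic split machinery is reused as-is;
// only the match-side answers change.
unsigned AIEFutureInstrInfo::getMaxLoadStoreSize() const { return 512; }

bool AIEFutureInstrInfo::canCombineWithLoadStore(const MachineInstr &MI) const {
  // A target without load/store-combinable intrinsics can simply opt out,
  // so every profitable type gets split.
  return false;
}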

void llvm::applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, const unsigned MaxMemSize) {

assert(MaxMemSize && "MaxMemSize should be specified!");
B.setInstrAndDebugLoc(MI);
MachineFunction &MF = B.getMF();
const bool IsLoad = isa<GLoad>(MI);
const Register ValReg = MI.getReg(0);
const Register AddrReg = MI.getPointerReg();
const LLT ValTy = MRI.getType(ValReg);
const LLT PtrTy = MRI.getType(AddrReg);
const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
const unsigned NumParts = ValTy.getSizeInBits() / MaxMemSize;
const LLT NarrowTy = ValTy.divide(NumParts);
const MachineMemOperand MMO = MI.getMMO();

SmallVector<Register, 8> NarrowRegs;
if (!IsLoad)
extractParts(ValReg, NarrowTy, NumParts, NarrowRegs, B, MRI);

for (int I = NumParts - 1; I >= 0; I--) {
const unsigned ByteOffset = I * NarrowTy.getSizeInBytes();
Register NewAddrReg;
B.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
MachineMemOperand *NewMMO =
MF.getMachineMemOperand(&MMO, ByteOffset, NarrowTy);

if (IsLoad) {
Register Dst = MRI.createGenericVirtualRegister(NarrowTy);
NarrowRegs.push_back(Dst);
B.buildLoad(Dst, NewAddrReg, *NewMMO);
} else {
B.buildStore(NarrowRegs[I], NewAddrReg, *NewMMO);
}
}

if (IsLoad) {
std::reverse(NarrowRegs.begin(), NarrowRegs.end());
B.buildConcatVectors(ValReg, NarrowRegs);
}

MI.eraseFromParent();
}
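
For concreteness, a short worked sketch of the arithmetic above using the AIE2 value from this patch (getMaxLoadStoreSize() == 256); it simply restates what the loop computes for a 512-bit load.

// Worked example, assuming MaxMemSize == 256 as returned by the AIE2 hook.
const LLT ValTy = LLT::fixed_vector(16, 32);                  // <16 x s32>, 512 bits
const unsigned MaxMemSize = 256;
const unsigned NumParts = ValTy.getSizeInBits() / MaxMemSize; // 2
const LLT NarrowTy = ValTy.divide(NumParts);                  // <8 x s32>, 256 bits
// The loop walks from the highest part down: byte offset
// 1 * NarrowTy.getSizeInBytes() == 32 first, then 0, the same 0/+32 byte
// split visible in the MMOs of the test at the end of this PR.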

/// Match something like this:
/// %293:_(s20) = G_CONSTANT i20 32
/// %67:_(s20) = G_CONSTANT i20 64
/// %68:_(p0) = nuw G_PTR_ADD %61, %67(s20)
/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20)

/// To convert to:
/// %298:_(s20) = G_CONSTANT i20 96
/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %61(p0), %298(s20)
bool llvm::matchOffsetLoadStorePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
std::pair<Register, int64_t> &RegOffset) {

const Register AddrReg = MI.getOperand(1).getReg();

const auto CstOffsetLoadStore =
getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);

if (!CstOffsetLoadStore)
return false;

MachineInstr *DefAddrRegInstr = MRI.getVRegDef(AddrReg);

if (DefAddrRegInstr->getOpcode() != TargetOpcode::G_PTR_ADD)
return false;

const auto CstDefAddrRegInstr = getIConstantVRegValWithLookThrough(
DefAddrRegInstr->getOperand(2).getReg(), MRI);

if (!CstDefAddrRegInstr)
return false;

RegOffset.first = DefAddrRegInstr->getOperand(1).getReg();
RegOffset.second = CstDefAddrRegInstr->Value.getSExtValue() +
CstOffsetLoadStore->Value.getSExtValue();

return true;
}

void llvm::applyOffsetLoadStorePtrAdd(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
const std::pair<Register, int64_t> &RegOffset) {
B.setInstrAndDebugLoc(MI);

Register NewOffsetReg =
B.buildConstant(LLT::scalar(20), RegOffset.second).getReg(0);

MI.getOperand(1).setReg(RegOffset.first);
MI.getOperand(2).setReg(NewOffsetReg);
}

/// Match something like this:
/// %0:_(s20) = COPY $m0
/// %1:_(p0) = COPY $p0
/// %2:_(<16 x s32>) = COPY $x0
/// %6:_(p0) = G_PTR_ADD %1, %0(s20)
/// %18:_(s20) = G_CONSTANT i20 32
/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20)
/// G_AIE_OFFSET_STORE %14(<8 x s32>), %1(p0), %0(s20)

/// To convert to (pointer reuse/CSE):
/// %0:_(s20) = COPY $m0
/// %1:_(p0) = COPY $p0
/// %2:_(<16 x s32>) = COPY $x0
/// %6:_(p0) = G_PTR_ADD %1, %0(s20)
/// %18:_(s20) = G_CONSTANT i20 32
/// %19:_(s20) = G_CONSTANT i20 0
/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20)
/// G_AIE_OFFSET_STORE %14(<8 x s32>), %6(p0), %19(s20)
bool llvm::matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
CombinerHelper &Helper,
const AIEBaseInstrInfo &TII,
Register &PtrAddReg) {
const Register PtrReg = MI.getOperand(1).getReg();
const Register OffsetReg = MI.getOperand(2).getReg();

const auto OffsetCst = getIConstantVRegValWithLookThrough(OffsetReg, MRI);

// If the offset is already a constant, don't touch it: it is better to
// keep it folded. Otherwise the previous combiner would just fold it
// back again.
if (OffsetCst)
return false;

for (auto &Use : MRI.use_nodbg_instructions(PtrReg)) {
if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
continue;
if (Use.getOperand(2).getReg() != OffsetReg)
continue;
if (Use.getParent() != MI.getParent())
continue;
if (!Helper.dominates(Use, MI))
continue;

Register PaddDestReg = Use.getOperand(0).getReg();

// Dead instruction? Don't use it!
// Only reuse it if at least one other instruction is using it.
if (hasNItemsOrMore(MRI.use_instr_nodbg_begin(PaddDestReg),
MRI.use_instr_nodbg_end(), 1)) {
PtrAddReg = PaddDestReg;
return true;
}
}

return false;
}

void llvm::applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
Register &PtrAddReg) {

Register NewOffsetReg = B.buildConstant(LLT::scalar(20), 0).getReg(0);

MI.getOperand(1).setReg(PtrAddReg);
MI.getOperand(2).setReg(NewOffsetReg);
}
21 changes: 21 additions & 0 deletions llvm/lib/Target/AIE/AIECombinerHelper.h
@@ -162,6 +162,27 @@ void applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B,
std::map<unsigned, Register> &IndexRegMap);

bool matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII, unsigned &MaxMemSize);
void applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, const unsigned MaxMemSize);

bool matchOffsetLoadStorePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
std::pair<Register, int64_t> &RegOffset);

void applyOffsetLoadStorePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B,
const std::pair<Register, int64_t> &RegOffset);

bool matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
CombinerHelper &Helper,
const AIEBaseInstrInfo &TII,
Register &PtrAddReg);

void applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, Register &PtrAddReg);

} // namespace llvm

#endif
29 changes: 29 additions & 0 deletions llvm/test/CodeGen/AIE/GlobalISel/addrspace-before-selection.ll
@@ -0,0 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
; See https://llvm.org/LICENSE.txt for license information.
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
;
; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
; RUN: llc -mtriple=aie2 -stop-before=instruction-select %s -o - 2>&1 | FileCheck %s

; Test that addrspace is correctly propagated after transformations, like memory op split.

define dso_local noundef <16 x i32> @addrspace_propagation(ptr addrspace(6) nocapture readonly %ptr) local_unnamed_addr #0 {
; CHECK-LABEL: name: addrspace_propagation
; CHECK: bb.1.entry:
; CHECK-NEXT: liveins: $p0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:ptrregbank(p0) = COPY $p0
; CHECK-NEXT: [[C:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 128
; CHECK-NEXT: [[C1:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 160
; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1 + 32, addrspace 6)
; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1, addrspace 6)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vregbank(<16 x s32>) = G_CONCAT_VECTORS [[AIE_OFFSET_LOAD1]](<8 x s32>), [[AIE_OFFSET_LOAD]](<8 x s32>)
; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0
entry:
%arrayidx.1 = getelementptr inbounds [16 x <16 x i32>], ptr addrspace(6) %ptr, i32 0, i32 2
%0 = load <16 x i32>, ptr addrspace(6) %arrayidx.1, align 32
ret <16 x i32> %0
}