-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AIE2] Add more memory/ptr combiners #214
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1406,3 +1406,42 @@ AIE2InstrInfo::getVExtractOpInfo(const MachineInstr &MI) const { | |
return std::nullopt; | ||
} | ||
} | ||
|
||
/// Maximum width, in bits, of a single load/store the AIE2 target should
/// issue. Wider memory accesses get split into chunks of at most this size.
unsigned AIE2InstrInfo::getMaxLoadStoreSize() const {
  // AIE2 vector loads/stores are at most 256 bits wide.
  return 256;
}
|
||
bool AIE2InstrInfo::canCombineWithLoadStore(const MachineInstr &MI) const { | ||
|
||
if (!isa<GIntrinsic>(MI)) | ||
return false; | ||
|
||
const unsigned ID = cast<GIntrinsic>(MI).getIntrinsicID(); | ||
|
||
switch (ID) { | ||
case Intrinsic::aie2_I256_v16_acc32_srs: | ||
case Intrinsic::aie2_I256_v16_acc64_srs: | ||
case Intrinsic::aie2_I256_v32_acc32_srs: | ||
case Intrinsic::aie2_I256_v8_acc64_srs: | ||
case Intrinsic::aie2_I512_v16_acc64_srs: | ||
case Intrinsic::aie2_I512_v32_acc32_srs: | ||
|
||
case Intrinsic::aie2_acc32_v16_I256_ups: | ||
case Intrinsic::aie2_acc32_v32_I256_ups: | ||
case Intrinsic::aie2_acc32_v32_I512_ups: | ||
case Intrinsic::aie2_acc64_v16_I256_ups: | ||
case Intrinsic::aie2_acc64_v16_I512_ups: | ||
case Intrinsic::aie2_acc64_v8_I256_ups: | ||
gbossu marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
bool AIE2InstrInfo::isProfitableToSplitType(const LLT Ty) const { | ||
const LLT V16S32 = LLT::fixed_vector(16, 32); | ||
const LLT V32S16 = LLT::fixed_vector(32, 16); | ||
const LLT V64S8 = LLT::fixed_vector(64, 8); | ||
|
||
if (Ty == V16S32 || Ty == V32S16 || Ty == V64S8) | ||
return true; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Could this just be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason is that it is not a good idea to break accumulators, because we will not combine them. My first trial was in this direction and I saw regressions. |
||
|
||
return false; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1358,3 +1358,195 @@ void llvm::applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI, | |
|
||
MI.eraseFromParent(); | ||
} | ||
|
||
/// Decide whether a wide G_LOAD/G_STORE should be split into narrower parts.
/// \p MaxMemSize is set to the target's maximum load/store width in bits, so
/// the (target-independent) apply step knows how to split. Splitting is
/// skipped when the loaded/stored value is produced or consumed by an
/// instruction that can be combined with the memory operation as-is.
bool llvm::matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
                               const AIEBaseInstrInfo &TII,
                               unsigned &MaxMemSize) {
  MaxMemSize = TII.getMaxLoadStoreSize();

  const Register ValReg = MI.getReg(0);
  if (!TII.isProfitableToSplitType(MRI.getType(ValReg)))
    return false;

  // Never split a memory access whose value feeds (load) or comes from
  // (store) an instruction that selection can fold into the load/store.
  if (isa<GLoad>(MI)) {
    for (MachineInstr &UserMI : MRI.use_instructions(ValReg))
      if (TII.canCombineWithLoadStore(UserMI))
        return false;
    return true;
  }

  MachineInstr &DefMI = *getDefIgnoringCopiesAndBitcasts(ValReg, MRI);
  return !TII.canCombineWithLoadStore(DefMI);
}
|
||
/// Split a wide G_LOAD/G_STORE into NumParts narrower accesses of at most
/// \p MaxMemSize bits each and, for loads, reassemble the original value with
/// G_CONCAT_VECTORS. The original instruction is erased.
/// \pre \p MaxMemSize is non-zero and divides the value type's bit size
/// (guaranteed by the matcher via isProfitableToSplitType and
/// getMaxLoadStoreSize).
void llvm::applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B, const unsigned MaxMemSize) {
  assert(MaxMemSize && "MaxMemSize should be specified!");
  B.setInstrAndDebugLoc(MI);
  MachineFunction &MF = B.getMF();
  const bool IsLoad = isa<GLoad>(MI);
  const Register ValReg = MI.getReg(0);
  const Register AddrReg = MI.getPointerReg();
  const LLT ValTy = MRI.getType(ValReg);
  const LLT PtrTy = MRI.getType(AddrReg);
  const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  const unsigned NumParts = ValTy.getSizeInBits() / MaxMemSize;
  const LLT NarrowTy = ValTy.divide(NumParts);
  // Bind a reference instead of copying the memory operand: it is only read
  // below, strictly before MI is erased.
  const MachineMemOperand &MMO = MI.getMMO();

  SmallVector<Register, 8> NarrowRegs;
  if (!IsLoad)
    extractParts(ValReg, NarrowTy, NumParts, NarrowRegs, B, MRI);

  // Emit the narrow accesses from the highest byte offset down.
  for (int I = NumParts - 1; I >= 0; I--) {
    const unsigned ByteOffset = I * NarrowTy.getSizeInBytes();
    Register NewAddrReg;
    B.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
    // Derive a narrowed MMO from the original so attributes such as the
    // address space are propagated to the split accesses.
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, ByteOffset, NarrowTy);

    if (IsLoad) {
      Register Dst = MRI.createGenericVirtualRegister(NarrowTy);
      NarrowRegs.push_back(Dst);
      B.buildLoad(Dst, NewAddrReg, *NewMMO);
    } else {
      B.buildStore(NarrowRegs[I], NewAddrReg, *NewMMO);
    }
  }

  if (IsLoad) {
    // Loads were created high-to-low; restore ascending order before
    // concatenating back into the original wide value.
    std::reverse(NarrowRegs.begin(), NarrowRegs.end());
    B.buildConcatVectors(ValReg, NarrowRegs);
  }

  MI.eraseFromParent();
}
|
||
/// Match something like this: | ||
/// %293:_(s20) = G_CONSTANT i20 32 | ||
/// %67:_(s20) = G_CONSTANT i20 64 | ||
/// %68:_(p0) = nuw G_PTR_ADD %61, %67(s20) | ||
/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20) | ||
|
||
/// To convert to: | ||
/// %298:_(s20) = G_CONSTANT i20 96 | ||
/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %61(p0), %298(s20) | ||
bool llvm::matchOffsetLoadStorePtrAdd(MachineInstr &MI, | ||
MachineRegisterInfo &MRI, | ||
const AIEBaseInstrInfo &TII, | ||
std::pair<Register, int64_t> &RegOffset) { | ||
|
||
const Register AddrReg = MI.getOperand(1).getReg(); | ||
|
||
const auto CstOffsetLoadStore = | ||
getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); | ||
|
||
if (!CstOffsetLoadStore) | ||
return false; | ||
|
||
MachineInstr *DefAddrRegInstr = MRI.getVRegDef(AddrReg); | ||
|
||
if (DefAddrRegInstr->getOpcode() != TargetOpcode::G_PTR_ADD) | ||
return false; | ||
|
||
const auto CstDefAddrRegInstr = getIConstantVRegValWithLookThrough( | ||
DefAddrRegInstr->getOperand(2).getReg(), MRI); | ||
|
||
if (!CstDefAddrRegInstr) | ||
return false; | ||
|
||
RegOffset.first = DefAddrRegInstr->getOperand(1).getReg(); | ||
RegOffset.second = CstDefAddrRegInstr->Value.getSExtValue() + | ||
CstOffsetLoadStore->Value.getSExtValue(); | ||
|
||
return true; | ||
} | ||
|
||
/// Rewrite the matched G_AIE_OFFSET_LOAD/STORE to address from the G_PTR_ADD
/// base pointer with the folded constant offset computed by the matcher.
void llvm::applyOffsetLoadStorePtrAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    const std::pair<Register, int64_t> &RegOffset) {
  B.setInstrAndDebugLoc(MI);

  const auto &[BaseReg, FoldedOffset] = RegOffset;
  // Offsets are s20 on AIE2.
  auto NewOffset = B.buildConstant(LLT::scalar(20), FoldedOffset);

  MI.getOperand(1).setReg(BaseReg);
  MI.getOperand(2).setReg(NewOffset.getReg(0));
}
|
||
/// Match something like this: | ||
/// %0:_(s20) = COPY $m0 | ||
/// %1:_(p0) = COPY $p0 | ||
/// %2:_(<16 x s32>) = COPY $x0 | ||
/// %6:_(p0) = G_PTR_ADD %1, %0(s20) | ||
/// %18:_(s20) = G_CONSTANT i20 32 | ||
/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20) | ||
/// G_AIE_OFFSET_STORE %14(<8 x s32>), %1(p0), %0(s20) | ||
|
||
/// To convert to (pointer reuse/CSE): | ||
/// %0:_(s20) = COPY $m0 | ||
/// %1:_(p0) = COPY $p0 | ||
/// %2:_(<16 x s32>) = COPY $x0 | ||
/// %6:_(p0) = G_PTR_ADD %1, %0(s20) | ||
/// %18:_(s20) = G_CONSTANT i20 32 | ||
/// %19:_(s20) = G_CONSTANT i20 0 | ||
/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20) | ||
/// G_AIE_OFFSET_STORE %14(<8 x s32>), %6(p0), %19(s20) | ||
gbossu marked this conversation as resolved.
Show resolved
Hide resolved
|
||
bool llvm::matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI, | ||
MachineRegisterInfo &MRI, | ||
CombinerHelper &Helper, | ||
const AIEBaseInstrInfo &TII, | ||
Register &PtrAddReg) { | ||
const Register PtrReg = MI.getOperand(1).getReg(); | ||
const Register OffsetReg = MI.getOperand(2).getReg(); | ||
|
||
const auto OffsetCst = getIConstantVRegValWithLookThrough(OffsetReg, MRI); | ||
|
||
// If we have a constant here, don't touch because it is better | ||
// to stay folded. Otherwise we will fold again in the previous | ||
// combiner. | ||
if (OffsetCst) | ||
return false; | ||
|
||
for (auto &Use : MRI.use_nodbg_instructions(PtrReg)) { | ||
if (Use.getOpcode() != TargetOpcode::G_PTR_ADD) | ||
continue; | ||
if (Use.getOperand(2).getReg() != OffsetReg) | ||
continue; | ||
if (Use.getParent() != MI.getParent()) | ||
continue; | ||
if (!Helper.dominates(Use, MI)) | ||
continue; | ||
|
||
Register PaddDestReg = Use.getOperand(0).getReg(); | ||
|
||
// Dead instruction? Don't use it! | ||
// Ony use if at least another instruction is using it. | ||
if (hasNItemsOrMore(MRI.use_instr_nodbg_begin(PaddDestReg), | ||
MRI.use_instr_nodbg_end(), 1)) { | ||
PtrAddReg = PaddDestReg; | ||
return true; | ||
} | ||
} | ||
|
||
return false; | ||
} | ||
|
||
/// Rewrite the matched G_AIE_OFFSET_LOAD/STORE to use the dominating
/// G_PTR_ADD result (\p PtrAddReg) as its pointer, with a zero offset.
void llvm::applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B,
                                           Register &PtrAddReg) {
  // Anchor the builder at MI so the new constant is inserted at a defined
  // point and inherits MI's debug location, consistent with
  // applyOffsetLoadStorePtrAdd.
  B.setInstrAndDebugLoc(MI);

  Register NewOffsetReg = B.buildConstant(LLT::scalar(20), 0).getReg(0);

  MI.getOperand(1).setReg(PtrAddReg);
  MI.getOperand(2).setReg(NewOffsetReg);
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
; See https://llvm.org/LICENSE.txt for license information.
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
;
; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
; RUN: llc -mtriple=aie2 -stop-before=instruction-select %s -o - 2>&1 | FileCheck %s

; Test if addrspace is correctly propagated after transformations, like memory op.
; split. The <16 x i32> load is split into two <8 x s32> loads; both resulting
; MMOs must carry "addrspace 6" from the original access.

define dso_local noundef <16 x i32> @addrspace_propagation(ptr addrspace(6) nocapture readonly %ptr) local_unnamed_addr #0 {
; CHECK-LABEL: name: addrspace_propagation
; CHECK: bb.1.entry:
; CHECK-NEXT: liveins: $p0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:ptrregbank(p0) = COPY $p0
; CHECK-NEXT: [[C:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 128
; CHECK-NEXT: [[C1:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 160
; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1 + 32, addrspace 6)
; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1, addrspace 6)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vregbank(<16 x s32>) = G_CONCAT_VECTORS [[AIE_OFFSET_LOAD1]](<8 x s32>), [[AIE_OFFSET_LOAD]](<8 x s32>)
; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0
entry:
  %arrayidx.1 = getelementptr inbounds [16 x <16 x i32>], ptr addrspace(6) %ptr, i32 0, i32 2
  %0 = load <16 x i32>, ptr addrspace(6) %arrayidx.1, align 32
  ret <16 x i32> %0
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Having some look-ahead in early combiners seems like a good idea indeed!