-
Notifications
You must be signed in to change notification settings - Fork 12.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Set inst_pref_size to maximum #126981
base: main
Are you sure you want to change the base?
[AMDGPU] Set inst_pref_size to maximum #126981
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) ChangesOn gfx11 and gfx12 set initial instruction prefetch size to a Fixes: SWDEV-513122 Full diff: https://github.com/llvm/llvm-project/pull/126981.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 031d8f0560ff2..d1d5b9a79ec5a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -622,12 +622,13 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
int64_t PGRM_Rsrc3 = 1;
bool EvaluatableRsrc3 =
- CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
+ CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
(void)PGRM_Rsrc3;
(void)EvaluatableRsrc3;
- assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
+ assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
+ STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
- KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
+ KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -748,7 +749,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
OutContext, IsLocal)
->getVariableValue(),
- getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
return false;
}
@@ -757,7 +758,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
CurrentProgramInfo.NumArchVGPR,
STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
- CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.ScratchSize,
+ CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -821,22 +823,22 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
[[maybe_unused]] int64_t PGMRSrc3;
- assert(STM.hasGFX90AInsts() ||
- (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
- PGMRSrc3) &&
+ assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
+ STM.hasGFX90AInsts() ||
+ (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
getMCExprStr(MCKernelDescriptor::bits_get(
- CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
getMCExprStr(MCKernelDescriptor::bits_get(
- CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
false);
@@ -893,27 +895,6 @@ void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
}
}
-uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = STM.getInstrInfo();
-
- uint64_t CodeSize = 0;
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- // TODO: CodeSize should account for multiple functions.
-
- // TODO: Should we count size of debug info?
- if (MI.isDebugInstr())
- continue;
-
- CodeSize += TII->getInstSizeInBytes(MI);
- }
- }
-
- return CodeSize;
-}
-
// AccumOffset computed for the MCExpr equivalent of:
// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
@@ -1249,24 +1230,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
ProgInfo.EXCPEnable = 0;
+ // return ((Dst & ~Mask) | (Value << Shift))
+ auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+ uint32_t Shift) {
+ const auto *Shft = MCConstantExpr::create(Shift, Ctx);
+ const auto *Msk = MCConstantExpr::create(Mask, Ctx);
+ Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+ Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
+ Ctx);
+ return Dst;
+ };
+
if (STM.hasGFX90AInsts()) {
- // return ((Dst & ~Mask) | (Value << Shift))
- auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
- uint32_t Shift) {
- const auto *Shft = MCConstantExpr::create(Shift, Ctx);
- const auto *Msk = MCConstantExpr::create(Mask, Ctx);
- Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
- Dst = MCBinaryExpr::createOr(
- Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
- return Dst;
- };
-
- ProgInfo.ComputePGMRSrc3GFX90A =
- SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
+ ProgInfo.ComputePGMRSrc3 =
+ SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
- ProgInfo.ComputePGMRSrc3GFX90A =
- SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
+ ProgInfo.ComputePGMRSrc3 =
+ SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
@@ -1287,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}
+
+ if (isGFX11Plus(STM)) {
+ uint32_t CodeSizeInBytes =
+ (uint32_t)std::min(ProgInfo.getFunctionCodeSize(MF),
+ (uint64_t)std::numeric_limits<uint32_t>::max());
+ uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
+ uint32_t Field, Shift, Width;
+ if (isGFX11(STM)) {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
+ } else {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
+ }
+ uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
+ ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
+ CreateExpr(InstPrefSize), Field, Shift);
+ }
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index cc8c4411805e2..2c959d7dbbd07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -50,8 +50,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
- uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
-
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
const SIProgramInfo &KernelInfo,
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 212edff097837..85ea3a9e17e09 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -27,6 +27,8 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+ CodeSizeInBytes.reset();
+
VGPRBlocks = ZeroExpr;
SGPRBlocks = ZeroExpr;
Priority = 0;
@@ -55,7 +57,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
LdsSize = 0;
EXCPEnable = 0;
- ComputePGMRSrc3GFX90A = ZeroExpr;
+ ComputePGMRSrc3 = ZeroExpr;
NumVGPR = ZeroExpr;
NumArchVGPR = ZeroExpr;
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
return MCConstantExpr::create(0, Ctx);
}
+
+uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
+ if (!CodeSizeInBytes.has_value()) {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = STM.getInstrInfo();
+
+ uint64_t CodeSize = 0;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: CodeSize should account for multiple functions.
+
+ // TODO: Should we count size of debug info?
+ if (MI.isDebugInstr())
+ continue;
+
+ CodeSize += TII->getInstSizeInBytes(MI);
+ }
+ }
+
+ CodeSizeInBytes = CodeSize;
+ }
+
+ return *CodeSizeInBytes;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index c358a2d9db10b..2836af033d4a4 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -19,6 +19,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/Compiler.h"
#include <cstdint>
+#include <optional>
namespace llvm {
@@ -29,6 +30,8 @@ class MachineFunction;
/// Track resource usage for kernels / entry functions.
struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
+ std::optional<uint64_t> CodeSizeInBytes;
+
// Fields set in PGM_RSRC1 pm4 packet.
const MCExpr *VGPRBlocks = nullptr;
const MCExpr *SGPRBlocks = nullptr;
@@ -60,7 +63,7 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
uint32_t LdsSize = 0;
uint32_t EXCPEnable = 0;
- const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
+ const MCExpr *ComputePGMRSrc3 = nullptr;
const MCExpr *NumVGPR = nullptr;
const MCExpr *NumArchVGPR = nullptr;
@@ -97,6 +100,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
// non-MCExpr members.
void reset(const MachineFunction &MF);
+ // Get function code size and cache the value.
+ uint64_t getFunctionCodeSize(const MachineFunction &MF);
+
/// Compute the value of the ComputePGMRsrc1 register.
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
MCContext &Ctx) const;
diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
new file mode 100644
index 0000000000000..671352446ca31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
+
+; GCN-LABEL: .amdhsa_kernel large
+; GFX11: .amdhsa_inst_pref_size 3
+; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
+; GFX12: .amdhsa_inst_pref_size 4
+; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
+define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+bb:
+ call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
+ ret void
+}
+
+; GCN-LABEL: .amdhsa_kernel small
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
+define amdgpu_kernel void @small() {
+bb:
+ ret void
+}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
ddd7760
to
6efc6a3
Compare
6efc6a3
to
f55b179
Compare
On gfx11 and gfx12 set initial instruction prefetch size to a minimum of kernel size and maximum allowed value. Fixes: SWDEV-513122
f55b179
to
912528b
Compare
On gfx11 and gfx12 set initial instruction prefetch size to a
minimum of kernel size and maximum allowed value.
Fixes: SWDEV-513122