Skip to content

Commit

Permalink
[AMDGPU] Set inst_pref_size to maximum
Browse files Browse the repository at this point in the history
On gfx11 and gfx12 set initial instruction prefetch size to a
minimum of kernel size and maximum allowed value.

Fixes: SWDEV-513122
  • Loading branch information
rampitec committed Feb 13, 2025
1 parent eb3ece0 commit f55b179
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 25 deletions.
67 changes: 44 additions & 23 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -622,12 +622,13 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,

int64_t PGRM_Rsrc3 = 1;
bool EvaluatableRsrc3 =
CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
(void)PGRM_Rsrc3;
(void)EvaluatableRsrc3;
assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;

KernelDescriptor.kernarg_preload = MCConstantExpr::create(
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
Expand Down Expand Up @@ -822,22 +823,22 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);

[[maybe_unused]] int64_t PGMRSrc3;
assert(STM.hasGFX90AInsts() ||
(CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
PGMRSrc3) &&
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
STM.hasGFX90AInsts() ||
(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
getMCExprStr(MCKernelDescriptor::bits_get(
CurrentProgramInfo.ComputePGMRSrc3GFX90A,
CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
getMCExprStr(MCKernelDescriptor::bits_get(
CurrentProgramInfo.ComputePGMRSrc3GFX90A,
CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
false);
Expand Down Expand Up @@ -1229,24 +1230,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
ProgInfo.EXCPEnable = 0;

// return ((Dst & ~Mask) | (Value << Shift))
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
uint32_t Shift) {
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
Ctx);
return Dst;
};

if (STM.hasGFX90AInsts()) {
// return ((Dst & ~Mask) | (Value << Shift))
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
uint32_t Shift) {
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
Dst = MCBinaryExpr::createOr(
Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
return Dst;
};

ProgInfo.ComputePGMRSrc3GFX90A =
SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
ProgInfo.ComputePGMRSrc3 =
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
ProgInfo.ComputePGMRSrc3GFX90A =
SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
ProgInfo.ComputePGMRSrc3 =
SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
Expand All @@ -1267,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}

if (isGFX11Plus(STM)) {
uint32_t CodeSizeInBytes =
(uint32_t)std::min(ProgInfo.getFunctionCodeSize(MF),
(uint64_t)std::numeric_limits<uint32_t>::max());
uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
uint32_t Field, Shift, Width;
if (isGFX11(STM)) {
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
} else {
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
}
uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
CreateExpr(InstPrefSize), Field, Shift);
}
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
LdsSize = 0;
EXCPEnable = 0;

ComputePGMRSrc3GFX90A = ZeroExpr;
ComputePGMRSrc3 = ZeroExpr;

NumVGPR = ZeroExpr;
NumArchVGPR = ZeroExpr;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIProgramInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
uint32_t LdsSize = 0;
uint32_t EXCPEnable = 0;

const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
const MCExpr *ComputePGMRSrc3 = nullptr;

const MCExpr *NumVGPR = nullptr;
const MCExpr *NumArchVGPR = nullptr;
Expand Down
21 changes: 21 additions & 0 deletions llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s

; GCN-LABEL: .amdhsa_kernel large
; GFX11: .amdhsa_inst_pref_size 3
; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
; GFX12: .amdhsa_inst_pref_size 4
; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
bb:
call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
ret void
}

; GCN-LABEL: .amdhsa_kernel small
; GCN: .amdhsa_inst_pref_size 1
; GCN: codeLenInByte = {{[0-9]$}}
define amdgpu_kernel void @small() {
bb:
ret void
}

0 comments on commit f55b179

Please sign in to comment.