Skip to content

Commit

Permalink
Change GPU visibility to 2 stage. Remove task shaders
Browse files Browse the repository at this point in the history
  • Loading branch information
godlikepanos committed Aug 3, 2024
1 parent f8bc6f2 commit e71a8b1
Show file tree
Hide file tree
Showing 35 changed files with 1,420 additions and 1,503 deletions.
2 changes: 1 addition & 1 deletion AnKi/Core/CVarSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ void CVarSet::registerCVar(CVar* cvar)

Error CVarSet::setMultiple(ConstWeakArray<const Char*> arr)
{
for(U i = 0; i < arr.getSize(); ++i)
for(U32 i = 0; i < arr.getSize(); ++i)
{
ANKI_ASSERT(arr[i]);
const CString varName = arr[i];
Expand Down
2 changes: 1 addition & 1 deletion AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMe
alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);

BufferUsageBit buffUsage = BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw
| BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kTransferDestination;
| BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kAllTransfer;
if(GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled)
{
buffUsage |= (BufferUsageBit::kAccelerationStructureBuildScratch | BufferUsageBit::kAccelerationStructureBuild);
Expand Down
35 changes: 35 additions & 0 deletions AnKi/Core/StatsSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,41 @@ class StatCounter
#endif
}

/// Update an integer stat so that it holds the maximum of its current value and @a value.
/// Must not be called on a stat flagged with StatFlag::kFloat (asserted).
/// @param value The candidate value. It is converted to U64 before being compared/stored.
/// @return The value read from the stat before this update (for the atomic path this is whatever m_atomic.max() returns,
///         presumably the previous value - confirm against Atomic). Returns 0 when ANKI_STATS_ENABLED is off.
template<std::integral T>
U64 max(T value)
{
#if ANKI_STATS_ENABLED
	ANKI_ASSERT(!(m_flags & StatFlag::kFloat));
	checkThread();
	U64 orig;
	if(!!(m_flags & StatFlag::kMainThreadUpdates))
	{
		// Non-atomic path: do the max by hand
		orig = m_u;
		const U64 candidate = U64(value);
		if(candidate > m_u)
		{
			// Only grow the stat. Storing unconditionally would turn this into a plain "set" and disagree with the atomic path
			m_u = candidate;
		}
	}
	else
	{
		orig = m_atomic.max(value);
	}
	return orig;
#else
	(void)value;
	return 0;
#endif
}

/// Floating point overload of max(). The operation is not supported for floats: with stats enabled it trips an assertion.
/// @param value Ignored.
/// @return Always 0.0.
template<std::floating_point T>
F64 max([[maybe_unused]] T value)
{
#if ANKI_STATS_ENABLED
	// A bare string literal is always "true" so it would never fire. AND it with false so that the assertion actually triggers
	ANKI_ASSERT(false && "Not supported for floats");
#endif
	return 0.0;
}

template<std::integral T>
U64 getValue() const
{
Expand Down
13 changes: 9 additions & 4 deletions AnKi/Gr/Vulkan/VkDescriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,17 @@ void DescriptorAllocator::createNewBlock()
inf.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
inf.flags = 0;
inf.maxSets = g_dsAllocatorConsts.m_maxSets * powu(kDescriptorSetGrowScale, m_blocks.getSize());
static_assert(DescriptorType::kAccelerationStructure == DescriptorType::kCount - 1, "Needs to be the last for the bellow to work");
inf.poolSizeCount = rtEnabled ? U32(DescriptorType::kCount) : U32(DescriptorType::kCount) - 1;
ANKI_ASSERT(g_dsAllocatorConsts.m_descriptorCount.getBack().first == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR
&& "Needs to be the last for the bellow to work");
inf.poolSizeCount = rtEnabled ? g_dsAllocatorConsts.m_descriptorCount.getSize() : g_dsAllocatorConsts.m_descriptorCount.getSize() - 1;
inf.pPoolSizes = poolSizes.getBegin();

VkDescriptorPool handle;
ANKI_VK_CHECKF(vkCreateDescriptorPool(getVkDevice(), &inf, nullptr, &handle));

Block& block = *m_blocks.emplaceBack();
block.m_pool = handle;
block.m_maxDsets = inf.maxSets;

g_descriptorSetsAllocatedStatVar.increment(1);
}
Expand All @@ -101,7 +103,7 @@ void DescriptorAllocator::allocate(VkDescriptorSetLayout layout, VkDescriptorSet
do
{
VkResult res;
if(m_blocks[m_activeBlock].m_dsetsAllocatedCount > g_dsAllocatorConsts.m_maxSets * powu(kDescriptorSetGrowScale, m_activeBlock) * 2)
if(m_blocks[m_activeBlock].m_dsetsAllocatedCount > m_blocks[m_activeBlock].m_maxDsets * 2)
{
// The driver doesn't respect VkDescriptorPoolCreateInfo::maxSets. It should have thrown OoM already. To avoid growing the same DS forever
// force OoM
Expand Down Expand Up @@ -163,7 +165,10 @@ void DescriptorAllocator::reset()
// Reset the remaining pools
for(Block& b : m_blocks)
{
vkResetDescriptorPool(getVkDevice(), b.m_pool, 0);
if(b.m_dsetsAllocatedCount > 0)
{
vkResetDescriptorPool(getVkDevice(), b.m_pool, 0);
}
b.m_dsetsAllocatedCount = 0;
}

Expand Down
1 change: 1 addition & 0 deletions AnKi/Gr/Vulkan/VkDescriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class DescriptorAllocator
public:
VkDescriptorPool m_pool = VK_NULL_HANDLE;
U32 m_dsetsAllocatedCount = 0;
U32 m_maxDsets = 0;
};

static constexpr U32 kDescriptorSetGrowScale = 2;
Expand Down
2 changes: 1 addition & 1 deletion AnKi/Gr/Vulkan/VkGraphicsState.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ Error PipelineCache::init(CString cacheDir)
ANKI_VK_CHECK(vkCreatePipelineCache(getVkDevice(), &ci, nullptr, &m_cacheHandle));

#if ANKI_PLATFORM_MOBILE
ANKI_ASSERT(GrManager::getSingleton().getDeviceCapabilities() != GpuVendor::kNone);
ANKI_ASSERT(GrManager::getSingleton().getDeviceCapabilities().m_gpuVendor != GpuVendor::kUnknown);
if(GrManager::getSingleton().getDeviceCapabilities().m_gpuVendor == GpuVendor::kQualcomm)
{
// Calling vkCreateGraphicsPipeline from multiple threads crashes qualcomm's compiler
Expand Down
23 changes: 1 addition & 22 deletions AnKi/Renderer/ForwardShading.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,6 @@ void ForwardShading::populateRenderGraph(RenderingContext& ctx)
visIn.m_viewportSize = getRenderer().getInternalResolution();

getRenderer().getGpuVisibility().populateRenderGraph(visIn, m_runCtx.m_visOut);

if(getRenderer().runSoftwareMeshletRendering())
{
GpuMeshletVisibilityInput meshIn;
meshIn.m_passesName = "FW shading";
meshIn.m_technique = RenderingTechnique::kForward;
meshIn.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
meshIn.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
meshIn.m_viewportSize = getRenderer().getInternalResolution();
meshIn.m_rgraph = &rgraph;
meshIn.m_hzbRt = getRenderer().getGBuffer().getHzbRt();
meshIn.fillBuffers(m_runCtx.m_visOut);

getRenderer().getGpuVisibility().populateRenderGraph(meshIn, m_runCtx.m_meshVisOut);
}
}

void ForwardShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgraphCtx)
Expand Down Expand Up @@ -101,11 +86,6 @@ void ForwardShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgr
args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
args.fill(m_runCtx.m_visOut);

if(m_runCtx.m_meshVisOut.isFilled())
{
args.fill(m_runCtx.m_meshVisOut);
}

getRenderer().getRenderableDrawer().drawMdi(args, cmdb);

// Restore state
Expand All @@ -130,8 +110,7 @@ void ForwardShading::setDependencies(GraphicsRenderPass& pass)

if(m_runCtx.m_visOut.containsDrawcalls())
{
pass.newBufferDependency((m_runCtx.m_meshVisOut.isFilled()) ? m_runCtx.m_meshVisOut.m_dependency : m_runCtx.m_visOut.m_dependency,
BufferUsageBit::kIndirectDraw);
pass.newBufferDependency(m_runCtx.m_visOut.m_dependency, BufferUsageBit::kIndirectDraw);
}
}

Expand Down
1 change: 0 additions & 1 deletion AnKi/Renderer/ForwardShading.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ class ForwardShading : public RendererObject
{
public:
GpuVisibilityOutput m_visOut;
GpuMeshletVisibilityOutput m_meshVisOut;
} m_runCtx;
};
/// @}
Expand Down
28 changes: 2 additions & 26 deletions AnKi/Renderer/GBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)

// Visibility
GpuVisibilityOutput visOut;
GpuMeshletVisibilityOutput meshletVisOut;
{
const CommonMatrices& matrices = (getRenderer().getFrameCount() <= 1) ? ctx.m_matrices : ctx.m_prevMatrices;
const Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
Expand All @@ -127,21 +126,6 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)

m_runCtx.m_visibleAaabbIndicesBuffer = visOut.m_visibleAaabbIndicesBuffer;
m_runCtx.m_visibleAaabbIndicesBufferDepedency = visOut.m_dependency;

if(getRenderer().runSoftwareMeshletRendering())
{
GpuMeshletVisibilityInput meshIn;
meshIn.m_passesName = "GBuffer";
meshIn.m_technique = RenderingTechnique::kGBuffer;
meshIn.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
meshIn.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
meshIn.m_viewportSize = getRenderer().getInternalResolution();
meshIn.m_rgraph = &rgraph;
meshIn.m_hzbRt = getRenderer().getGBuffer().getHzbRt();
meshIn.fillBuffers(visOut);

getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
}
}

const Bool enableVrs = GrManager::getSingleton().getDeviceCapabilities().m_vrs && g_vrsCVar.get() && g_gbufferVrsCVar.get();
Expand Down Expand Up @@ -181,7 +165,7 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
pass.setRenderpassInfo(WeakArray{colorRti}, &depthRti, 0, 0, kMaxU32, kMaxU32, (enableVrs) ? &sriRt : nullptr,
(enableVrs) ? getRenderer().getVrsSriGeneration().getSriTexelDimension() : 0,
(enableVrs) ? getRenderer().getVrsSriGeneration().getSriTexelDimension() : 0);
pass.setWork([this, &ctx, visOut, meshletVisOut](RenderPassWorkContext& rgraphCtx) {
pass.setWork([this, &ctx, visOut](RenderPassWorkContext& rgraphCtx) {
ANKI_TRACE_SCOPED_EVENT(GBuffer);

CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
Expand Down Expand Up @@ -214,10 +198,6 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
}

args.fill(visOut);
if(meshletVisOut.isFilled())
{
args.fill(meshletVisOut);
}

cmdb.setDepthCompareOperation(CompareOperation::kLessEqual);
getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
Expand All @@ -243,11 +223,7 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageGeometryRead | BufferUsageBit::kStorageFragmentRead);

// Only add one dependency to the GPU visibility. No need to track all buffers
if(meshletVisOut.isFilled())
{
pass.newBufferDependency(meshletVisOut.m_dependency, BufferUsageBit::kIndirectDraw);
}
else if(visOut.containsDrawcalls())
if(visOut.containsDrawcalls())
{
pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kIndirectDraw);
}
Expand Down
3 changes: 0 additions & 3 deletions AnKi/Renderer/GBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@

namespace anki {

// Forward
class GpuVisibilityOutput;

/// @addtogroup renderer
/// @{

Expand Down
52 changes: 6 additions & 46 deletions AnKi/Renderer/IndirectDiffuseProbes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,6 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
{
// GBuffer visibility
GpuVisibilityOutput visOut;
GpuMeshletVisibilityOutput meshletVisOut;
Frustum frustum;
{
frustum.setPerspective(kClusterObjectFrustumNearPlane, probeToRefresh->getRenderRadius(), kPi / 2.0f, kPi / 2.0f);
Expand All @@ -215,22 +214,9 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
visIn.m_lodDistances = lodDistances;
visIn.m_rgraph = &rgraph;
visIn.m_viewportSize = UVec2(m_tileSize);
visIn.m_limitMemory = true;

getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);

if(getRenderer().runSoftwareMeshletRendering())
{
GpuMeshletVisibilityInput meshIn;
meshIn.m_passesName = visIn.m_passesName;
meshIn.m_technique = RenderingTechnique::kGBuffer;
meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
meshIn.m_viewportSize = UVec2(m_tileSize);
meshIn.m_rgraph = &rgraph;
meshIn.fillBuffers(visOut);

getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
}
}

// GBuffer
Expand Down Expand Up @@ -258,10 +244,9 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
pass.newTextureDependency(gbufferDepthRt, TextureUsageBit::kAllFramebuffer,
TextureSubresourceDesc::firstSurface(DepthStencilAspectBit::kDepth));

pass.newBufferDependency((meshletVisOut.isFilled()) ? meshletVisOut.m_dependency : visOut.m_dependency,
BufferUsageBit::kIndirectDraw);
pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kIndirectDraw);

pass.setWork([this, visOut, meshletVisOut, viewProjMat = frustum.getViewProjectionMatrix(),
pass.setWork([this, visOut, viewProjMat = frustum.getViewProjectionMatrix(),
viewMat = frustum.getViewMatrix()](RenderPassWorkContext& rgraphCtx) {
ANKI_TRACE_SCOPED_EVENT(RIndirectDiffuse);
CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
Expand All @@ -278,11 +263,6 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
args.m_viewport = UVec4(0, 0, m_tileSize, m_tileSize);
args.fill(visOut);

if(meshletVisOut.isFilled())
{
args.fill(meshletVisOut);
}

getRenderer().getRenderableDrawer().drawMdi(args, cmdb);

// It's secondary, no need to restore any state
Expand All @@ -291,7 +271,6 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)

// Shadow visibility. Optional
GpuVisibilityOutput shadowVisOut;
GpuMeshletVisibilityOutput shadowMeshletVisOut;
Mat4 cascadeProjMat;
Mat3x4 cascadeViewMat;
Mat4 cascadeViewProjMat;
Expand All @@ -313,22 +292,9 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
visIn.m_lodDistances = lodDistances;
visIn.m_rgraph = &rgraph;
visIn.m_viewportSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
visIn.m_limitMemory = true;

getRenderer().getGpuVisibility().populateRenderGraph(visIn, shadowVisOut);

if(getRenderer().runSoftwareMeshletRendering())
{
GpuMeshletVisibilityInput meshIn;
meshIn.m_passesName = visIn.m_passesName;
meshIn.m_technique = RenderingTechnique::kDepth;
meshIn.m_viewProjectionMatrix = cascadeViewProjMat;
meshIn.m_cameraTransform = cascadeViewMat.getInverseTransformation();
meshIn.m_viewportSize = visIn.m_viewportSize;
meshIn.m_rgraph = &rgraph;
meshIn.fillBuffers(shadowVisOut);

getRenderer().getGpuVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
}
}

// Shadow pass. Optional
Expand All @@ -345,10 +311,9 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)

pass.newTextureDependency(shadowsRt, TextureUsageBit::kAllFramebuffer,
TextureSubresourceDesc::firstSurface(DepthStencilAspectBit::kDepth));
pass.newBufferDependency((shadowMeshletVisOut.isFilled()) ? shadowMeshletVisOut.m_dependency : shadowVisOut.m_dependency,
BufferUsageBit::kIndirectDraw);
pass.newBufferDependency(shadowVisOut.m_dependency, BufferUsageBit::kIndirectDraw);

pass.setWork([this, shadowVisOut, shadowMeshletVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
pass.setWork([this, shadowVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
ANKI_TRACE_SCOPED_EVENT(RIndirectDiffuse);
CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;

Expand All @@ -367,11 +332,6 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
args.m_viewport = UVec4(0, 0, rez, rez);
args.fill(shadowVisOut);

if(shadowMeshletVisOut.isFilled())
{
args.fill(shadowMeshletVisOut);
}

getRenderer().getRenderableDrawer().drawMdi(args, cmdb);

// It's secondary, no need to restore the state
Expand Down
Loading

0 comments on commit e71a8b1

Please sign in to comment.