Skip to content

Commit

Permalink
Fix some D3D issues with structured buffers
Browse files Browse the repository at this point in the history
  • Loading branch information
godlikepanos committed Aug 11, 2024
1 parent 2b05fae commit fd51a66
Show file tree
Hide file tree
Showing 21 changed files with 201 additions and 169 deletions.
77 changes: 26 additions & 51 deletions AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,88 +15,63 @@ namespace anki {
/// @addtogroup core
/// @{

/// @memberof GpuVisibleTransientMemoryPool
class GpuVisibleTransientMemoryAllocation
/// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMemoryPool>
{
friend class GpuVisibleTransientMemoryPool;
template<typename>
friend class MakeSingleton;

public:
Buffer& getBuffer() const
{
ANKI_ASSERT(isValid());
return *m_buffer;
}

PtrSize getOffset() const
{
ANKI_ASSERT(isValid());
return m_offset;
}

PtrSize getRange() const
BufferView allocate(PtrSize size, PtrSize alignment = 0)
{
ANKI_ASSERT(isValid());
return m_size;
alignment = (alignment == 0) ? m_alignment : alignment;
PtrSize offset;
Buffer* buffer;
m_pool.allocate(size, alignment, offset, buffer);
return BufferView(buffer, offset, size);
}

Bool isValid() const
template<typename T>
BufferView allocateStructuredBuffer(U32 count)
{
return m_buffer != nullptr;
return allocateStructuredBuffer(count, sizeof(T));
}

operator BufferView() const;

private:
Buffer* m_buffer = nullptr;
PtrSize m_offset = kMaxPtrSize;
PtrSize m_size = 0;
};

/// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMemoryPool>
{
template<typename>
friend class MakeSingleton;

public:
GpuVisibleTransientMemoryAllocation allocate(PtrSize size)
BufferView allocateStructuredBuffer(U32 count, U32 structureSize)
{
GpuVisibleTransientMemoryAllocation out;
m_pool.allocate(size, out.m_offset, out.m_buffer);
out.m_size = size;
return out;
// Widen to PtrSize *before* multiplying: both operands are U32, so the product would otherwise be computed (and could wrap) in 32 bits.
// When m_structuredBufferAlignment is kMaxU32 the structure size itself is used as the alignment.
return allocate(PtrSize(structureSize) * PtrSize(count), (m_structuredBufferAlignment == kMaxU32) ? structureSize : m_structuredBufferAlignment);
}

void endFrame();

private:
StackGpuMemoryPool m_pool;
U32 m_alignment = 0;
U32 m_frame = 0;
U32 m_structuredBufferAlignment = 0;

GpuVisibleTransientMemoryPool()
{
U32 alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
m_structuredBufferAlignment = (GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
? kMaxU32
: GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;

m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);

BufferUsageBit buffUsage = BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw
| BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kAllTransfer;
if(GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled)
{
buffUsage |= (BufferUsageBit::kAccelerationStructureBuildScratch | BufferUsageBit::kAccelerationStructureBuild);
}
m_pool.init(10_MB, 2.0, 0, alignment, buffUsage, BufferMapAccessBit::kNone, true, "GpuVisibleTransientMemoryPool");
m_pool.init(10_MB, 2.0, 0, buffUsage, BufferMapAccessBit::kNone, true, "GpuVisibleTransientMemoryPool");
}

~GpuVisibleTransientMemoryPool() = default;
};

inline GpuVisibleTransientMemoryAllocation::operator BufferView() const
{
ANKI_ASSERT(isValid());
return {m_buffer, m_offset, m_size};
}
/// @}

} // end namespace anki
3 changes: 3 additions & 0 deletions AnKi/Gr/Common.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ class GpuDeviceCapabilities
/// API version.
U8 m_majorApiVersion = 0;

/// Align structured buffers using the structure's size and not the m_storageBufferBindOffsetAlignment.
Bool m_structuredBufferNaturalAlignment = false;

/// RT.
Bool m_rayTracingEnabled = false;

Expand Down
52 changes: 48 additions & 4 deletions AnKi/Gr/D3D/D3DDescriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -575,9 +575,10 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)
getDevice().CopyDescriptorsSimple(1, samplerHeapOffset.getCpuOffset(), outDescriptor.m_heapOffset,
D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER);
}
else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
// Writable raw (byte-address) storage buffers only. The kByteAddressBuffer flag has to be tested with '&': OR-ing it into m_flags always gives a
// non-zero value, which would send every writable storage buffer down this path and make the RWStructuredBuffer branch further down unreachable.
else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite)
&& !!(inDescriptor.m_flags & DescriptorFlag::kByteAddressBuffer))
{
// RWStructuredBuffer or RWByteAddressBuffer
// RWByteAddressBuffer

ANKI_ASSERT(!outDescriptor.m_isHandle);

Expand All @@ -596,9 +597,31 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)

getDevice().CreateUnorderedAccessView(view.m_resource, nullptr, &uavDesc, cbvSrvUavHeapOffset.getCpuOffset());
}
else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite))
else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
{
	// RWStructuredBuffer: writable storage buffer that was not picked up by an earlier branch of this chain.
	// Described as a UAV whose element size is the structure stride.
	ANKI_ASSERT(!outDescriptor.m_isHandle);

	const BufferView& boundView = outDescriptor.m_bufferView;
	const auto elementStride = inDescriptor.m_structuredBufferStride;

	// The view is expressed in whole elements, so offset and range must both be multiples of the stride
	ANKI_ASSERT((boundView.m_offset % elementStride) == 0);
	ANKI_ASSERT((boundView.m_range % elementStride) == 0);

	D3D12_UNORDERED_ACCESS_VIEW_DESC desc = {};
	desc.Format = DXGI_FORMAT_UNKNOWN;
	desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
	desc.Buffer.FirstElement = boundView.m_offset / elementStride;
	desc.Buffer.NumElements = U32(boundView.m_range / elementStride);
	desc.Buffer.StructureByteStride = elementStride;

	getDevice().CreateUnorderedAccessView(boundView.m_resource, nullptr, &desc, cbvSrvUavHeapOffset.getCpuOffset());
}
else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite)
&& !!(inDescriptor.m_flags & DescriptorFlag::kByteAddressBuffer))
{
// StructuredBuffer or ByteAddressBuffer
// ByteAddressBuffer

ANKI_ASSERT(!outDescriptor.m_isHandle);
const BufferView& view = outDescriptor.m_bufferView;
Expand All @@ -617,6 +640,27 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)

getDevice().CreateShaderResourceView(view.m_resource, &srvDesc, cbvSrvUavHeapOffset.getCpuOffset());
}
else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite))
{
	// StructuredBuffer: read-only storage buffer that was not picked up by an earlier branch of this chain.
	// Described as an SRV whose element size is the structure stride.
	ANKI_ASSERT(!outDescriptor.m_isHandle);

	const BufferView& boundView = outDescriptor.m_bufferView;
	const auto elementStride = inDescriptor.m_structuredBufferStride;

	// The view is expressed in whole elements, so offset and range must both be multiples of the stride
	ANKI_ASSERT((boundView.m_offset % elementStride) == 0);
	ANKI_ASSERT((boundView.m_range % elementStride) == 0);

	D3D12_SHADER_RESOURCE_VIEW_DESC desc = {};
	desc.Format = DXGI_FORMAT_UNKNOWN;
	desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
	desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
	desc.Buffer.FirstElement = boundView.m_offset / elementStride;
	desc.Buffer.NumElements = U32(boundView.m_range / elementStride);
	desc.Buffer.StructureByteStride = elementStride;

	getDevice().CreateShaderResourceView(boundView.m_resource, &desc, cbvSrvUavHeapOffset.getCpuOffset());
}
else if(inDescriptor.m_type == DescriptorType::kTexelBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
{
// RWBuffer
Expand Down
1 change: 1 addition & 0 deletions AnKi/Gr/D3D/D3DGrManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,7 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
m_capabilities.m_uniformBufferBindOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
m_capabilities.m_uniformBufferMaxRange = D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * D3D12_STANDARD_VECTOR_SIZE * sizeof(F32);
m_capabilities.m_storageBufferBindOffsetAlignment = D3D12_RAW_UAV_SRV_BYTE_ALIGNMENT;
m_capabilities.m_structuredBufferNaturalAlignment = true;
m_capabilities.m_storageBufferMaxRange = 1 << D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
m_capabilities.m_texelBufferBindOffsetAlignment = 32;
m_capabilities.m_textureBufferMaxRange = kMaxU32; // ?
Expand Down
15 changes: 3 additions & 12 deletions AnKi/Gr/Utils/StackGpuMemoryPool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,11 @@ class StackGpuMemoryPool::BuilderInterface
PtrSize m_bias = 0;
PtrSize m_allocatedMemory = 0;
GrString m_bufferName;
U32 m_alignment = 0;
BufferUsageBit m_bufferUsage = BufferUsageBit::kNone;
BufferMapAccessBit m_bufferMap = BufferMapAccessBit::kNone;
U8 m_chunkCount = 0;
Bool m_allowToGrow = false;

// Builder interface stuff:
U32 getMaxAlignment() const
{
return m_alignment;
}

PtrSize getInitialChunkSize() const
{
return m_initialSize;
Expand Down Expand Up @@ -123,11 +116,10 @@ StackGpuMemoryPool::~StackGpuMemoryPool()
}
}

void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, U32 alignment, BufferUsageBit bufferUsage,
void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, BufferUsageBit bufferUsage,
BufferMapAccessBit bufferMapping, Bool allowToGrow, CString bufferName)
{
ANKI_ASSERT(m_builder == nullptr);
ANKI_ASSERT(initialSize > 0 && alignment > 0);
ANKI_ASSERT(nextChunkGrowScale >= 1.0);

m_builder = newInstance<Builder>(GrMemoryPool::getSingleton());
Expand All @@ -136,7 +128,6 @@ void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSi
inter.m_scale = nextChunkGrowScale;
inter.m_bias = nextChunkGrowBias;
inter.m_bufferName = bufferName;
inter.m_alignment = alignment;
inter.m_bufferUsage = bufferUsage;
inter.m_bufferMap = bufferMapping;
inter.m_allowToGrow = allowToGrow;
Expand All @@ -147,11 +138,11 @@ void StackGpuMemoryPool::reset()
m_builder->reset();
}

void StackGpuMemoryPool::allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory)
void StackGpuMemoryPool::allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory)
{
Chunk* chunk;
PtrSize offset;
const Error err = m_builder->allocate(size, 1, chunk, offset);
const Error err = m_builder->allocate(size, alignment, chunk, offset);
if(err)
{
ANKI_GR_LOGF("Allocation failed");
Expand Down
10 changes: 5 additions & 5 deletions AnKi/Gr/Utils/StackGpuMemoryPool.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,18 @@ class StackGpuMemoryPool

StackGpuMemoryPool& operator=(const StackGpuMemoryPool&) = delete; // Non-copyable

void init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, U32 alignment, BufferUsageBit bufferUsage,
BufferMapAccessBit bufferMapping, Bool allowToGrow, CString bufferName);
void init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, BufferUsageBit bufferUsage, BufferMapAccessBit bufferMapping,
Bool allowToGrow, CString bufferName);

/// @note It's thread-safe against other allocate()
void allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer)
void allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer)
{
void* dummyMapped = nullptr;
allocate(size, outOffset, buffer, dummyMapped);
allocate(size, alignment, outOffset, buffer, dummyMapped);
}

/// @note It's thread-safe against other allocate()
void allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory);
void allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory);

void reset();

Expand Down
1 change: 1 addition & 0 deletions AnKi/Gr/Vulkan/VkGrManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,7 @@ Error GrManagerImpl::initInstance()
m_capabilities.m_uniformBufferMaxRange = m_devProps.properties.limits.maxUniformBufferRange;
m_capabilities.m_storageBufferBindOffsetAlignment =
max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minStorageBufferOffsetAlignment));
m_capabilities.m_structuredBufferNaturalAlignment = false;
m_capabilities.m_storageBufferMaxRange = m_devProps.properties.limits.maxStorageBufferRange;
m_capabilities.m_texelBufferBindOffsetAlignment = max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minTexelBufferOffsetAlignment));
m_capabilities.m_textureBufferMaxRange = kMaxU32;
Expand Down
8 changes: 4 additions & 4 deletions AnKi/Renderer/ClusterBinning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
// Allocate the clusters buffer
{
const U32 clusterCount = getRenderer().getTileCounts().x() * getRenderer().getTileCounts().y() + getRenderer().getZSplitCount();
m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(Cluster) * clusterCount);
m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<Cluster>(clusterCount);
m_runCtx.m_clustersHandle = rgraph.importBuffer(m_runCtx.m_clustersBuffer, BufferUsageBit::kNone);
}

Expand All @@ -62,7 +62,7 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
{
// Allocate memory for the indirect args
constexpr U32 dispatchCount = U32(GpuSceneNonRenderableObjectType::kCount) * 2;
indirectArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * dispatchCount);
indirectArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(dispatchCount);
indirectArgsHandle = rgraph.importBuffer(indirectArgsBuff, BufferUsageBit::kNone);

// Create the pass
Expand Down Expand Up @@ -208,8 +208,8 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
// Allocations
for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
{
m_runCtx.m_packedObjectsBuffers[type] =
GpuVisibleTransientMemoryPool::getSingleton().allocate(kClusteredObjectSizes[type] * kMaxVisibleClusteredObjects[type]);
m_runCtx.m_packedObjectsBuffers[type] = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer(
kMaxVisibleClusteredObjects[type], kClusteredObjectSizes[type]);
m_runCtx.m_packedObjectsHandles[type] = rgraph.importBuffer(m_runCtx.m_packedObjectsBuffers[type], BufferUsageBit::kNone);
}

Expand Down
2 changes: 1 addition & 1 deletion AnKi/Renderer/LensFlare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void LensFlare::populateRenderGraph(RenderingContext& ctx)
RenderGraphBuilder& rgraph = ctx.m_renderGraphDescr;

// Create indirect buffer
m_runCtx.m_indirectBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs) * flareCount);
m_runCtx.m_indirectBuff = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(flareCount);
m_runCtx.m_indirectBuffHandle = rgraph.importBuffer(m_runCtx.m_indirectBuff, BufferUsageBit::kNone);

// Create the pass
Expand Down
2 changes: 1 addition & 1 deletion AnKi/Renderer/RtShadows.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
BufferHandle sbtBuildIndirectArgsHandle;
BufferView sbtBuildIndirectArgsBuffer;
{
sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs));
sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(1);
sbtBuildIndirectArgsHandle = rgraph.importBuffer(sbtBuildIndirectArgsBuffer, BufferUsageBit::kStorageComputeWrite);

NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtShadows setup build SBT");
Expand Down
2 changes: 1 addition & 1 deletion AnKi/Renderer/ShadowMapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ BufferView ShadowMapping::createVetVisibilityPass(CString passName, const LightC
{
BufferView clearTileIndirectArgs;

clearTileIndirectArgs = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs));
clearTileIndirectArgs = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(1);

NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(passName);

Expand Down
Loading

0 comments on commit fd51a66

Please sign in to comment.