diff --git a/AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h b/AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h
index 23e7e2a8f..089bf1b43 100644
--- a/AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h
+++ b/AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h
@@ -15,70 +15,51 @@ namespace anki {
 /// @addtogroup core
 /// @{
 
-/// @memberof GpuVisibleTransientMemoryPool
-class GpuVisibleTransientMemoryAllocation
+/// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
+class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMemoryPool>
 {
-	friend class GpuVisibleTransientMemoryPool;
+	template<typename>
+	friend class MakeSingleton;
 
 public:
-	Buffer& getBuffer() const
-	{
-		ANKI_ASSERT(isValid());
-		return *m_buffer;
-	}
-
-	PtrSize getOffset() const
-	{
-		ANKI_ASSERT(isValid());
-		return m_offset;
-	}
-
-	PtrSize getRange() const
+	BufferView allocate(PtrSize size, PtrSize alignment = 0)
 	{
-		ANKI_ASSERT(isValid());
-		return m_size;
+		alignment = (alignment == 0) ? m_alignment : alignment;
+		PtrSize offset;
+		Buffer* buffer;
+		m_pool.allocate(size, alignment, offset, buffer);
+		return BufferView(buffer, offset, size);
 	}
 
-	Bool isValid() const
+	template<typename T>
+	BufferView allocateStructuredBuffer(U32 count)
 	{
-		return m_buffer != nullptr;
+		return allocateStructuredBuffer(count, sizeof(T));
 	}
 
-	operator BufferView() const;
-
-private:
-	Buffer* m_buffer = nullptr;
-	PtrSize m_offset = kMaxPtrSize;
-	PtrSize m_size = 0;
-};
-
-/// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
-class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMemoryPool>
-{
-	template<typename>
-	friend class MakeSingleton;
-
-public:
-	GpuVisibleTransientMemoryAllocation allocate(PtrSize size)
+	BufferView allocateStructuredBuffer(U32 count, U32 structureSize)
 	{
-		GpuVisibleTransientMemoryAllocation out;
-		m_pool.allocate(size, out.m_offset, out.m_buffer);
-		out.m_size = size;
-		return out;
+		return allocate(PtrSize(structureSize * count), (m_structuredBufferAlignment == kMaxU32) ? structureSize : m_structuredBufferAlignment);
 	}
 
 	void endFrame();
 
private:
 	StackGpuMemoryPool m_pool;
+	U32 m_alignment = 0;
 	U32 m_frame = 0;
+	U32 m_structuredBufferAlignment = 0;
 
 	GpuVisibleTransientMemoryPool()
 	{
-		U32 alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
+		m_structuredBufferAlignment = (GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+										  ? kMaxU32
+										  : GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
+
+		m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
 
 		BufferUsageBit buffUsage = BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw
 								   | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kAllTransfer;
@@ -86,17 +67,11 @@ class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientM
-		m_pool.init(..., alignment, buffUsage, ...);
+		m_pool.init(..., buffUsage, ...);
 	}
 };
 
-inline GpuVisibleTransientMemoryAllocation::operator BufferView() const
-{
-	return BufferView(m_buffer, m_offset, m_size);
-}
-
 /// @}
 
 } // end namespace anki
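Note: a minimal usage sketch of the reworked API (hypothetical call site; Cluster is a real renderer type, kClusterCount is a stand-in). allocate() now returns a BufferView directly, and allocateStructuredBuffer<T>() derives both the size and a legal alignment from the element type: sizeof(T) when the backend wants natural alignment (m_structuredBufferAlignment == kMaxU32), otherwise the storage-buffer bind-offset alignment.

    // Element-counted allocation; the alignment is picked per the rules above.
    BufferView clusters = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<Cluster>(kClusterCount);

    // Raw allocation; alignment 0 falls back to the pool-wide m_alignment, the max
    // of all the bind-offset alignments gathered in the constructor.
    BufferView scratch = GpuVisibleTransientMemoryPool::getSingleton().allocate(1024);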
diff --git a/AnKi/Gr/Utils/StackGpuMemoryPool.cpp b/AnKi/Gr/Utils/StackGpuMemoryPool.cpp
--- a/AnKi/Gr/Utils/StackGpuMemoryPool.cpp
+++ b/AnKi/Gr/Utils/StackGpuMemoryPool.cpp
@@ ... @@
-	ANKI_ASSERT(initialSize > 0 && alignment > 0);
+	ANKI_ASSERT(initialSize > 0);
 	ANKI_ASSERT(nextChunkGrowScale >= 1.0);
 
 	m_builder = newInstance<Builder>(GrMemoryPool::getSingleton());
@@ -136,7 +128,6 @@ void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSi
 	inter.m_scale = nextChunkGrowScale;
 	inter.m_bias = nextChunkGrowBias;
 	inter.m_bufferName = bufferName;
-	inter.m_alignment = alignment;
 	inter.m_bufferUsage = bufferUsage;
 	inter.m_bufferMap = bufferMapping;
 	inter.m_allowToGrow = allowToGrow;
@@ -147,11 +138,11 @@ void StackGpuMemoryPool::reset()
 	m_builder->reset();
 }
 
-void StackGpuMemoryPool::allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory)
+void StackGpuMemoryPool::allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory)
 {
 	Chunk* chunk;
 	PtrSize offset;
-	const Error err = m_builder->allocate(size, 1, chunk, offset);
+	const Error err = m_builder->allocate(size, alignment, chunk, offset);
 	if(err)
 	{
 		ANKI_GR_LOGF("Allocation failed");
diff --git a/AnKi/Gr/Utils/StackGpuMemoryPool.h b/AnKi/Gr/Utils/StackGpuMemoryPool.h
index 39f3271e3..9c302df7d 100644
--- a/AnKi/Gr/Utils/StackGpuMemoryPool.h
+++ b/AnKi/Gr/Utils/StackGpuMemoryPool.h
@@ -25,18 +25,18 @@ class StackGpuMemoryPool
 	StackGpuMemoryPool& operator=(const StackGpuMemoryPool&) = delete; // Non-copyable
 
-	void init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, U32 alignment, BufferUsageBit bufferUsage,
-			  BufferMapAccessBit bufferMapping, Bool allowToGrow, CString bufferName);
+	void init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, BufferUsageBit bufferUsage, BufferMapAccessBit bufferMapping,
+			  Bool allowToGrow, CString bufferName);
 
 	/// @note It's thread-safe against other allocate()
-	void allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer)
+	void allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer)
 	{
 		void* dummyMapped = nullptr;
-		allocate(size, outOffset, buffer, dummyMapped);
+		allocate(size, alignment, outOffset, buffer, dummyMapped);
 	}
 
 	/// @note It's thread-safe against other allocate()
-	void allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory);
+	void allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory);
 
 	void reset();
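Note: moving the alignment from init() to allocate() means each allocation pays only for what it needs, instead of every allocation being rounded up to a worst-case alignment fixed at init time. A sketch of a caller (pool, recordSize and recordCount are stand-ins):

    StackGpuMemoryPool pool; // Stand-in; assumed already init()-ed
    const PtrSize sbtRecordAlignment = GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment;
    const PtrSize recordSize = 64, recordCount = 16; // Stand-ins

    PtrSize offset;
    Buffer* buffer;
    pool.allocate(recordCount * recordSize, sbtRecordAlignment, offset, buffer);
    ANKI_ASSERT((offset % sbtRecordAlignment) == 0); // Guaranteed by the builder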
diff --git a/AnKi/Gr/Vulkan/VkGrManager.cpp b/AnKi/Gr/Vulkan/VkGrManager.cpp
index ad7b7233f..6d67e3177 100644
--- a/AnKi/Gr/Vulkan/VkGrManager.cpp
+++ b/AnKi/Gr/Vulkan/VkGrManager.cpp
@@ -809,6 +809,7 @@ Error GrManagerImpl::initInstance()
 	m_capabilities.m_uniformBufferMaxRange = m_devProps.properties.limits.maxUniformBufferRange;
 	m_capabilities.m_storageBufferBindOffsetAlignment = max(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minStorageBufferOffsetAlignment));
+	m_capabilities.m_structuredBufferNaturalAlignment = false;
 	m_capabilities.m_storageBufferMaxRange = m_devProps.properties.limits.maxStorageBufferRange;
 	m_capabilities.m_texelBufferBindOffsetAlignment = max(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minTexelBufferOffsetAlignment));
 	m_capabilities.m_textureBufferMaxRange = kMaxU32;
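Note: Vulkan reports m_structuredBufferNaturalAlignment = false because it binds storage buffers by byte offset, constrained only by minStorageBufferOffsetAlignment. The flag is presumably there for backends where structured-buffer views address elements rather than bytes (D3D12-style FirstElement), which forces offsets to be multiples of the stride; allocateStructuredBuffer() then uses sizeof(T) as the alignment. A sketch of why natural alignment matters, under that assumption (hypothetical helper, not part of the patch):

    // Convert a byte offset to a first-element index for an element-addressed view.
    // Only exact when the offset is a multiple of the stride, which is what the
    // natural-alignment path guarantees.
    template<typename T>
    PtrSize firstElementOf(const BufferView& view)
    {
        ANKI_ASSERT((view.getOffset() % sizeof(T)) == 0);
        return view.getOffset() / sizeof(T);
    }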
diff --git a/AnKi/Renderer/ClusterBinning.cpp b/AnKi/Renderer/ClusterBinning.cpp
index 67ffc4117..45fa76dfc 100644
--- a/AnKi/Renderer/ClusterBinning.cpp
+++ b/AnKi/Renderer/ClusterBinning.cpp
@@ -52,7 +52,7 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 	// Allocate the clusters buffer
 	{
 		const U32 clusterCount = getRenderer().getTileCounts().x() * getRenderer().getTileCounts().y() + getRenderer().getZSplitCount();
-		m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(Cluster) * clusterCount);
+		m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<Cluster>(clusterCount);
 		m_runCtx.m_clustersHandle = rgraph.importBuffer(m_runCtx.m_clustersBuffer, BufferUsageBit::kNone);
 	}
@@ -62,7 +62,7 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 	{
 		// Allocate memory for the indirect args
 		constexpr U32 dispatchCount = U32(GpuSceneNonRenderableObjectType::kCount) * 2;
-		indirectArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * dispatchCount);
+		indirectArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(dispatchCount);
 		indirectArgsHandle = rgraph.importBuffer(indirectArgsBuff, BufferUsageBit::kNone);
 
 		// Create the pass
@@ -208,8 +208,8 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 	// Allocations
 	for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
 	{
-		m_runCtx.m_packedObjectsBuffers[type] =
-			GpuVisibleTransientMemoryPool::getSingleton().allocate(kClusteredObjectSizes[type] * kMaxVisibleClusteredObjects[type]);
+		m_runCtx.m_packedObjectsBuffers[type] = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer(
+			kMaxVisibleClusteredObjects[type], kClusteredObjectSizes[type]);
 		m_runCtx.m_packedObjectsHandles[type] = rgraph.importBuffer(m_runCtx.m_packedObjectsBuffers[type], BufferUsageBit::kNone);
 	}
diff --git a/AnKi/Renderer/LensFlare.cpp b/AnKi/Renderer/LensFlare.cpp
index bec689a51..02485b2e4 100644
--- a/AnKi/Renderer/LensFlare.cpp
+++ b/AnKi/Renderer/LensFlare.cpp
@@ -54,7 +54,7 @@ void LensFlare::populateRenderGraph(RenderingContext& ctx)
 	RenderGraphBuilder& rgraph = ctx.m_renderGraphDescr;
 
 	// Create indirect buffer
-	m_runCtx.m_indirectBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs) * flareCount);
+	m_runCtx.m_indirectBuff = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(flareCount);
 	m_runCtx.m_indirectBuffHandle = rgraph.importBuffer(m_runCtx.m_indirectBuff, BufferUsageBit::kNone);
 
 	// Create the pass
diff --git a/AnKi/Renderer/RtShadows.cpp b/AnKi/Renderer/RtShadows.cpp
index e541daa22..d05c4afdf 100644
--- a/AnKi/Renderer/RtShadows.cpp
+++ b/AnKi/Renderer/RtShadows.cpp
@@ -208,7 +208,7 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 	BufferHandle sbtBuildIndirectArgsHandle;
 	BufferView sbtBuildIndirectArgsBuffer;
 	{
-		sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs));
+		sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(1);
 		sbtBuildIndirectArgsHandle = rgraph.importBuffer(sbtBuildIndirectArgsBuffer, BufferUsageBit::kStorageComputeWrite);
 
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtShadows setup build SBT");
diff --git a/AnKi/Renderer/ShadowMapping.cpp b/AnKi/Renderer/ShadowMapping.cpp
index 90c3c9c53..835202255 100644
--- a/AnKi/Renderer/ShadowMapping.cpp
+++ b/AnKi/Renderer/ShadowMapping.cpp
@@ -539,7 +539,7 @@ BufferView ShadowMapping::createVetVisibilityPass(CString passName, const LightC
 {
 	BufferView clearTileIndirectArgs;
 
-	clearTileIndirectArgs = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs));
+	clearTileIndirectArgs = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(1);
 
 	NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(passName);
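Note: the call-site migration above is mechanical and always has the same shape (DrawIndirectArgs shown; count is a stand-in):

    const U32 count = 4; // Stand-in

    // Before: byte-sized request, aligned to the pool-wide worst case.
    BufferView before = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs) * count);

    // After: element-counted request; the pool also picks an alignment that is
    // legal for a structured buffer with this stride.
    BufferView after = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(count);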
diff --git a/AnKi/Renderer/Utils/GpuVisibility.cpp b/AnKi/Renderer/Utils/GpuVisibility.cpp
index b56911768..f403d211c 100644
--- a/AnKi/Renderer/Utils/GpuVisibility.cpp
+++ b/AnKi/Renderer/Utils/GpuVisibility.cpp
@@ -79,16 +79,17 @@ class GpuVisMemoryStats : public RendererObject, public MakeSingletonSimple<Gpu
-static BufferView allocateTransientGpuMem(PtrSize size)
+template<typename T>
+static BufferView allocateStructuredBuffer(U32 count)
 {
 	BufferView out = {};
 
-	if(size)
+	if(count > 0)
 	{
-		g_gpuVisMemoryAllocatedStatVar.increment(size);
-		out = GpuVisibleTransientMemoryPool::getSingleton().allocate(size);
+		g_gpuVisMemoryAllocatedStatVar.increment(sizeof(T) * count);
+		out = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count);
 
-		GpuVisMemoryStats::getSingleton().informAboutAllocation(size);
+		GpuVisMemoryStats::getSingleton().informAboutAllocation(sizeof(T) * count);
 	}
 
 	return out;
@@ -296,22 +297,19 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 
 	m_persistentMemory.m_stage1.m_visibleRenderables =
-		allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * maxLimits.m_maxVisibleLegacyRenderables);
-	m_persistentMemory.m_stage1.m_visibleMeshlets =
-		allocateTransientGpuMem(sizeof(GpuVisibilityVisibleMeshletDesc) * maxLimits.m_maxVisibleMeshlets);
+		allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleLegacyRenderables);
+	m_persistentMemory.m_stage1.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(maxLimits.m_maxVisibleMeshlets);
 
-	m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables =
-		allocateTransientGpuMem(sizeof(UVec4) * maxLimits.m_maxVisibleLegacyRenderables);
+	m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(maxLimits.m_maxVisibleLegacyRenderables);
 	m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs =
-		allocateTransientGpuMem(sizeof(DrawIndexedIndirectArgs) * maxLimits.m_maxVisibleLegacyRenderables);
+		allocateStructuredBuffer<DrawIndexedIndirectArgs>(maxLimits.m_maxVisibleLegacyRenderables);
 
-	m_persistentMemory.m_stage2Meshlet.m_meshletInstances =
-		allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
+	m_persistentMemory.m_stage2Meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
 	m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb =
-		allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * maxLimits.m_maxVisibleMeshlets);
+		allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleMeshlets);
 
-	m_persistentMemory.m_stage3.m_meshletInstances = allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
+	m_persistentMemory.m_stage3.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
 
 	m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
 																	   : m_persistentMemory.m_stage1.m_visibleRenderables,
@@ -361,7 +359,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		BufferView m_hash;
 	} stage1Mem;
 
-	stage1Mem.m_counters = allocateTransientGpuMem(sizeof(U32) * U32(GpuVisibilityCounter::kCount));
+	stage1Mem.m_counters = allocateStructuredBuffer<U32>(U32(GpuVisibilityCounter::kCount));
 	if(in.m_limitMemory)
 	{
 		PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
@@ -380,21 +378,21 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 	else
 	{
-		stage1Mem.m_visibleRenderables = allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables);
-		stage1Mem.m_visibleMeshlets = allocateTransientGpuMem(sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets);
+		stage1Mem.m_visibleRenderables = allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(limits.m_maxVisibleLegacyRenderables);
+		stage1Mem.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(limits.m_maxVisibleMeshlets);
 	}
 
-	stage1Mem.m_renderablePrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
-	stage1Mem.m_meshletPrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
-	stage1Mem.m_gpuVisIndirectDispatchArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::kCount));
+	stage1Mem.m_renderablePrefixSums = allocateStructuredBuffer<U32>(bucketCount);
+	stage1Mem.m_meshletPrefixSums = allocateStructuredBuffer<U32>(bucketCount);
+	stage1Mem.m_gpuVisIndirectDispatchArgs = allocateStructuredBuffer<DispatchIndirectArgs>(U32(GpuVisibilityIndirectDispatches::kCount));
 
 	if(in.m_gatherAabbIndices)
 	{
-		stage1Mem.m_visibleAabbIndices = allocateTransientGpuMem(sizeof(U32) * buckets.getBucketsActiveUserCount(in.m_technique));
+		stage1Mem.m_visibleAabbIndices = allocateStructuredBuffer<U32>(buckets.getBucketsActiveUserCount(in.m_technique));
 	}
 
 	if(in.m_hashVisibles)
 	{
-		stage1Mem.m_hash = allocateTransientGpuMem(sizeof(GpuVisibilityHash));
+		stage1Mem.m_hash = allocateStructuredBuffer<GpuVisibilityHash>(1);
 	}
 
 	// Allocate memory for stage 2
@@ -436,47 +434,48 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		}
 		else
 		{
-			stage2Mem.m_legacy.m_instanceRateRenderables = allocateTransientGpuMem(sizeof(UVec4) * limits.m_maxVisibleLegacyRenderables);
-			stage2Mem.m_legacy.m_drawIndexedIndirectArgs =
-				allocateTransientGpuMem(sizeof(DrawIndexedIndirectArgs) * limits.m_maxVisibleLegacyRenderables);
+			stage2Mem.m_legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(limits.m_maxVisibleLegacyRenderables);
+			stage2Mem.m_legacy.m_drawIndexedIndirectArgs = allocateStructuredBuffer<DrawIndexedIndirectArgs>(limits.m_maxVisibleLegacyRenderables);
 		}
 
-		stage2Mem.m_legacy.m_mdiDrawCounts = allocateTransientGpuMem(sizeof(U32) * bucketCount);
+		stage2Mem.m_legacy.m_mdiDrawCounts = allocateStructuredBuffer<U32>(bucketCount);
 	}
 
 	if(bMeshletRendering)
 	{
 		if(bHwMeshletRendering)
 		{
-			stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * bucketCount);
+			stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
 		}
 		else
 		{
-			stage2Mem.m_meshlet.m_indirectDrawArgs = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+			stage2Mem.m_meshlet.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
 		}
 
-		const PtrSize newRange = sizeof(GpuSceneMeshletInstance) * limits.m_maxVisibleMeshlets;
+		const U32 newCount = limits.m_maxVisibleMeshlets;
 		if(in.m_limitMemory)
 		{
-			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
-			stage2Mem.m_meshlet.m_meshletInstances = BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newRange);
+			ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
+			stage2Mem.m_meshlet.m_meshletInstances =
+				BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
 		}
 		else
 		{
-			stage2Mem.m_meshlet.m_meshletInstances = allocateTransientGpuMem(newRange);
+			stage2Mem.m_meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
 		}
 
 		if(bStoreMeshletsFailedHzb)
 		{
-			const PtrSize newRange = sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets;
+			const U32 newCount = limits.m_maxVisibleMeshlets;
 			if(in.m_limitMemory)
 			{
-				ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
-				stage2Mem.m_meshlet.m_meshletsFailedHzb = BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newRange);
+				ANKI_ASSERT(newCount * sizeof(GpuVisibilityVisibleMeshletDesc) <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
+				stage2Mem.m_meshlet.m_meshletsFailedHzb =
+					BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newCount * sizeof(GpuVisibilityVisibleMeshletDesc));
 			}
 			else
 			{
-				stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateTransientGpuMem(newRange);
+				stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(newCount);
 			}
 		}
 	}
@@ -495,22 +494,23 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	{
 		if(bHwMeshletRendering)
 		{
-			stage3Mem.m_dispatchMeshIndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * bucketCount);
+			stage3Mem.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
 		}
 		else
 		{
-			stage3Mem.m_indirectDrawArgs = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+			stage3Mem.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
 		}
 
-		const PtrSize newRange = sizeof(GpuSceneMeshletInstance) * limits.m_maxVisibleMeshlets;
+		const U32 newCount = limits.m_maxVisibleMeshlets;
 		if(in.m_limitMemory)
 		{
-			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
-			stage3Mem.m_meshletInstances = BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newRange);
+			ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
+			stage3Mem.m_meshletInstances =
+				BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
 		}
 		else
 		{
-			stage3Mem.m_meshletInstances = allocateTransientGpuMem(newRange);
+			stage3Mem.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
 		}
 	}
@@ -991,7 +991,7 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 	}
 
 	// Allocate memory for the result
-	out.m_visiblesBuffer = allocateTransientGpuMem((objCount + 1) * sizeof(U32));
+	out.m_visiblesBuffer = allocateStructuredBuffer<U32>(objCount + 1);
 	out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
 
 	// Create the renderpass
@@ -1093,12 +1093,12 @@ void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccel
 	// Allocate the transient buffers
 	const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
 
-	out.m_instancesBuffer = allocateTransientGpuMem(aabbCount * sizeof(AccelerationStructureInstance));
+	out.m_instancesBuffer = allocateStructuredBuffer<AccelerationStructureInstance>(aabbCount);
 	out.m_someBufferHandle = rgraph.importBuffer(out.m_instancesBuffer, BufferUsageBit::kStorageComputeWrite);
 
-	out.m_renderableIndicesBuffer = allocateTransientGpuMem((aabbCount + 1) * sizeof(U32));
+	out.m_renderableIndicesBuffer = allocateStructuredBuffer<U32>(aabbCount + 1);
 
-	const BufferView zeroInstancesDispatchArgsBuff = allocateTransientGpuMem(sizeof(DispatchIndirectArgs));
+	const BufferView zeroInstancesDispatchArgsBuff = allocateStructuredBuffer<DispatchIndirectArgs>(1);
 
 	// Create vis pass
 	{
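Note: in the m_limitMemory paths above, the byte range is now recomputed from the element count at each point of use. The recurring BufferView(...).setRange(...) pattern could be wrapped in a small helper; a sketch (hypothetical, not in the patch):

    // Sub-view the first count elements of a persistent allocation.
    template<typename T>
    BufferView subViewStructured(const BufferView& persistent, U32 count)
    {
        const PtrSize range = PtrSize(count) * sizeof(T);
        ANKI_ASSERT(range <= persistent.getRange());
        return BufferView(persistent).setRange(range);
    }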
diff --git a/AnKi/Resource/TransferGpuAllocator.h b/AnKi/Resource/TransferGpuAllocator.h
index 9f58f1c28..1fb6128b3 100644
--- a/AnKi/Resource/TransferGpuAllocator.h
+++ b/AnKi/Resource/TransferGpuAllocator.h
@@ -154,11 +154,6 @@ class TransferGpuAllocator
 
 	// The rest of the functions implement the StackAllocatorBuilder TInterface.
 
-	constexpr PtrSize getMaxAlignment()
-	{
-		return kGpuBufferAlignment;
-	}
-
 	constexpr PtrSize getInitialChunkSize() const
 	{
 		return kChunkInitialSize;
diff --git a/AnKi/Scene/SceneGraph.cpp b/AnKi/Scene/SceneGraph.cpp
index 48156317a..0c29f9a29 100644
--- a/AnKi/Scene/SceneGraph.cpp
+++ b/AnKi/Scene/SceneGraph.cpp
@@ -104,7 +104,7 @@ Error SceneGraph::init(AllocAlignedCallback allocCallback, void* allocCallbackDa
 {
 	SceneMemoryPool::allocateSingleton(allocCallback, allocCallbackData);
 
-	m_framePool.init(allocCallback, allocCallbackData, 1_MB, 2.0, 0, true, ANKI_SAFE_ALIGNMENT, "SceneGraphFramePool");
+	m_framePool.init(allocCallback, allocCallbackData, 1_MB, 2.0, 0, true, "SceneGraphFramePool");
 
 	// Init the default main camera
 	ANKI_CHECK(newSceneNode<PerspectiveCameraNode>("mainCamera", m_defaultMainCam));
diff --git a/AnKi/Util/MemoryPool.cpp b/AnKi/Util/MemoryPool.cpp
index b0e147c7a..1f7c03ebb 100644
--- a/AnKi/Util/MemoryPool.cpp
+++ b/AnKi/Util/MemoryPool.cpp
@@ -276,15 +276,13 @@ void StackMemoryPool::StackAllocatorBuilderInterface::recycleChunk([[maybe_unuse
 }
 
 void StackMemoryPool::init(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale, PtrSize nextChunkBias,
-						   Bool ignoreDeallocationErrors, U32 alignmentBytes, const Char* name)
+						   Bool ignoreDeallocationErrors, const Char* name)
 {
 	ANKI_ASSERT(initialChunkSize > 0);
 	ANKI_ASSERT(nextChunkScale >= 1.0);
-	ANKI_ASSERT(alignmentBytes > 0 && alignmentBytes <= kMaxAlignment);
 
 	BaseMemoryPool::init(allocCb, allocCbUserData, name);
 
 	m_builder.getInterface().m_parent = this;
-	m_builder.getInterface().m_alignmentBytes = alignmentBytes;
 	m_builder.getInterface().m_ignoreDeallocationErrors = ignoreDeallocationErrors;
 	m_builder.getInterface().m_initialChunkSize = initialChunkSize;
 	m_builder.getInterface().m_nextChunkScale = nextChunkScale;
@@ -301,6 +299,7 @@ void StackMemoryPool::destroy()
 void* StackMemoryPool::allocate(PtrSize size, PtrSize alignment)
 {
 	ANKI_ASSERT(size > 0);
+	ANKI_ASSERT(alignment > 0 && alignment <= kMaxAlignment);
 
 	Chunk* chunk;
 	PtrSize offset;
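Note: the CPU-side StackMemoryPool gets the same treatment as the GPU pools: alignment becomes a per-allocate() argument, validated against kMaxAlignment at the call, instead of a pool-wide maximum baked into init(). A usage sketch (allocAligned is AnKi's stock allocation callback; the values are arbitrary):

    StackMemoryPool pool(allocAligned, nullptr, 1_MB, 2.0, 0, true, "ExamplePool");

    // Each allocation states the alignment it actually needs.
    void* a = pool.allocate(64, 16);
    void* b = pool.allocate(sizeof(Mat4), alignof(Mat4));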
diff --git a/AnKi/Util/MemoryPool.h b/AnKi/Util/MemoryPool.h
index 2d60344bf..1631b1957 100644
--- a/AnKi/Util/MemoryPool.h
+++ b/AnKi/Util/MemoryPool.h
@@ -195,11 +195,10 @@ class StackMemoryPool : public BaseMemoryPool
 
 	/// @see init
 	StackMemoryPool(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale = 2.0,
-					PtrSize nextChunkBias = 0, Bool ignoreDeallocationErrors = true, U32 alignmentBytes = ANKI_SAFE_ALIGNMENT,
-					const Char* name = nullptr)
+					PtrSize nextChunkBias = 0, Bool ignoreDeallocationErrors = true, const Char* name = nullptr)
 		: StackMemoryPool()
 	{
-		init(allocCb, allocCbUserData, initialChunkSize, nextChunkScale, nextChunkBias, ignoreDeallocationErrors, alignmentBytes, name);
+		init(allocCb, allocCbUserData, initialChunkSize, nextChunkScale, nextChunkBias, ignoreDeallocationErrors, name);
 	}
 
 	/// Destroy
@@ -214,12 +213,10 @@ class StackMemoryPool : public BaseMemoryPool
 	/// @param initialChunkSize The size of the first chunk.
 	/// @param nextChunkScale Value that controls the next chunk.
 	/// @param nextChunkBias Value that controls the next chunk.
-	/// @param ignoreDeallocationErrors Method free() may fail if the ptr is not in the top of the stack. Set that to
-	///                                 true to suppress such errors.
-	/// @param alignmentBytes The maximum supported alignment for returned memory.
+	/// @param ignoreDeallocationErrors Method free() may fail if the ptr is not in the top of the stack. Set that to true to suppress such errors.
 	/// @param name An optional name.
 	void init(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale = 2.0, PtrSize nextChunkBias = 0,
-			  Bool ignoreDeallocationErrors = true, U32 alignmentBytes = ANKI_SAFE_ALIGNMENT, const Char* name = nullptr);
+			  Bool ignoreDeallocationErrors = true, const Char* name = nullptr);
 
 	/// Manual destroy. The destructor calls that as well.
 	void destroy();
@@ -274,8 +271,6 @@ class StackMemoryPool : public BaseMemoryPool
 	public:
 		StackMemoryPool* m_parent = nullptr;
 
-		PtrSize m_alignmentBytes = 0;
-
 		Bool m_ignoreDeallocationErrors = false;
 
 		PtrSize m_initialChunkSize = 0;
@@ -286,12 +281,6 @@ class StackMemoryPool : public BaseMemoryPool
 
 		// The rest of the functions implement the StackAllocatorBuilder TInterface.
 
-		PtrSize getMaxAlignment() const
-		{
-			ANKI_ASSERT(m_alignmentBytes > 0);
-			return m_alignmentBytes;
-		}
-
 		PtrSize getInitialChunkSize() const
 		{
 			ANKI_ASSERT(m_initialChunkSize > 0);
diff --git a/AnKi/Util/StackAllocatorBuilder.h b/AnKi/Util/StackAllocatorBuilder.h
index d7f1cf51d..933c9f3bd 100644
--- a/AnKi/Util/StackAllocatorBuilder.h
+++ b/AnKi/Util/StackAllocatorBuilder.h
@@ -22,7 +22,6 @@ namespace anki {
 /// @endcode
 /// @tparam TInterface This is the type of the interface that contains various info. Should have the following members:
 /// @code
-/// U32 getMaxAlignment();
 /// PtrSize getInitialChunkSize();
 /// F64 getNextChunkGrowScale();
 /// PtrSize getNextChunkGrowBias();
@@ -52,7 +51,7 @@ class StackAllocatorBuilder
 	/// Allocate memory.
 	/// @param size The size to allocate.
-	/// @param alignment The alignment of the returned address.
+	/// @param alignment The alignment of the returned address. Can be anything, not only a power of two.
 	/// @param[out] chunk The chunk that the memory belongs to.
 	/// @param[out] offset The offset inside the chunk.
 	/// @note This is thread safe with itself.
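Note: with getMaxAlignment() gone, a conforming StackAllocatorBuilder TInterface shrinks to the members still listed in the @code block above. A sketch of the remaining required surface (return values are arbitrary; the chunk-management callbacks documented in the header are elided):

    class ExampleInterface
    {
    public:
        PtrSize getInitialChunkSize() const { return 1_MB; }
        F64 getNextChunkGrowScale() const { return 2.0; }
        PtrSize getNextChunkGrowBias() const { return 0; }
        // ...plus the chunk allocate/recycle callbacks per the header documentation.
    };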
diff --git a/AnKi/Util/StackAllocatorBuilder.inl.h b/AnKi/Util/StackAllocatorBuilder.inl.h
index 54e2a6ad0..ae5660333 100644
--- a/AnKi/Util/StackAllocatorBuilder.inl.h
+++ b/AnKi/Util/StackAllocatorBuilder.inl.h
@@ -38,12 +38,10 @@ void StackAllocatorBuilder<TChunk, TInterface, TLock>::destroy()
 }
 
 template<typename TChunk, typename TInterface, typename TLock>
-Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [[maybe_unused]] PtrSize alignment, TChunk*& chunk, PtrSize& offset)
+Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, PtrSize alignment, TChunk*& chunk, PtrSize& offset)
 {
-	ANKI_ASSERT(alignment <= m_interface.getMaxAlignment());
-
-	size = getAlignedRoundUp(m_interface.getMaxAlignment(), size);
 	ANKI_ASSERT(size > 0);
+	size += alignment;
 
 	chunk = nullptr;
 	offset = kMaxPtrSize;
@@ -73,7 +71,7 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 		}
 		else
 		{
-			// Need new chunk
+			// Need new chunk, create it and loop back
 
 			LockGuard<TLock> lock(m_lock);
 
@@ -98,7 +96,6 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 			}
 
 			nextChunkSize = max(size, nextChunkSize); // Can't have the allocation fail
-			alignRoundUp(m_interface.getMaxAlignment(), nextChunkSize); // Align again
 
 			TChunk* nextChunk;
 			if(crntChunk)
@@ -167,7 +164,10 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 		}
 	}
 
+	alignRoundUp(alignment, offset);
+
 	ANKI_ASSERT(chunk && offset != kMaxPtrSize);
+	ANKI_ASSERT(offset + size <= chunk->m_chunkSize);
 	return Error::kNone;
 }
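Note: the builder now supports arbitrary per-call alignment with an over-allocate-then-align scheme: the request grows by alignment bytes up front, the block is carved out as before, and the returned offset is rounded up afterwards. Rounding up consumes at most alignment - 1 bytes of the padding, so the new trailing assert always holds, at the cost of up to alignment wasted bytes per allocation. A self-contained sketch of the same arithmetic:

    #include <cassert>
    #include <cstddef>

    // head is the chunk's bump pointer. Works for any alignment > 0, not only powers
    // of two, hence the modulo form instead of bit masking.
    size_t allocateAligned(size_t& head, size_t chunkSize, size_t size, size_t alignment)
    {
        const size_t padded = size + alignment; // What the builder actually reserves
        assert(head + padded <= chunkSize);
        size_t offset = head;
        head += padded;
        offset += (alignment - offset % alignment) % alignment; // alignRoundUp
        assert(offset + size <= chunkSize); // Same guarantee as the patch's new assert
        return offset;
    }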
"rwbuff"); Array rwtex; @@ -310,8 +342,8 @@ void main() signalFence->clientWait(kMaxSecond); // Check - validateBuffer(rwstructured, kMagicVec + kMagicVec); - validateBuffer(rwbuff, kMagicVec * 2.0f); + validateBuffer(rwstructured, ConstWeakArray(Array{kMagicVec + kMagicVec * 2.0f, (kMagicVec + kMagicVec * 2.0f) * 2.0f})); + validateBuffer(rwbuff, ConstWeakArray(Array{kMagicVec * 2.0f})); } commonDestroy(); diff --git a/Tests/Gr/GrCommon.h b/Tests/Gr/GrCommon.h index 8e1ba9ac2..f2a4054fb 100644 --- a/Tests/Gr/GrCommon.h +++ b/Tests/Gr/GrCommon.h @@ -184,7 +184,8 @@ inline void readBuffer(BufferPtr buff, DynamicArray& out) buffInit.m_usage = BufferUsageBit::kTransferDestination; tmpBuff = GrManager::getSingleton().newBuffer(buffInit); - CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(CommandBufferInitInfo(CommandBufferFlag::kSmallBatch)); + CommandBufferPtr cmdb = + GrManager::getSingleton().newCommandBuffer(CommandBufferInitInfo(CommandBufferFlag::kGeneralWork | CommandBufferFlag::kSmallBatch)); cmdb->copyBufferToBuffer(BufferView(buff.get()), BufferView(tmpBuff.get())); cmdb->endRecording(); @@ -202,14 +203,16 @@ inline void readBuffer(BufferPtr buff, DynamicArray& out) } template -inline void validateBuffer(BufferPtr buff, T value) +inline void validateBuffer(BufferPtr buff, ConstWeakArray values) { DynamicArray cpuBuff; readBuffer(buff, cpuBuff); - for(const T& x : cpuBuff) + ANKI_ASSERT(values.getSize() == cpuBuff.getSize()); + + for(U32 i = 0; i < values.getSize(); ++i) { - ANKI_TEST_EXPECT_EQ(x, value); + ANKI_TEST_EXPECT_EQ(cpuBuff[i], values[i]); } } diff --git a/Tests/Gr/GrWorkGraphs.cpp b/Tests/Gr/GrWorkGraphs.cpp index d4ae606cc..022d6ef8d 100644 --- a/Tests/Gr/GrWorkGraphs.cpp +++ b/Tests/Gr/GrWorkGraphs.cpp @@ -145,7 +145,7 @@ void thirdNode([MaxRecords(32)] GroupNodeInputRecords inp, uint GrManager::getSingleton().submit(cmdb.get(), {}, &fence); fence->clientWait(kMaxSecond); - validateBuffer(counterBuff, 122880); + validateBuffer(counterBuff, ConstWeakArray(Array{122880})); } commonDestroy();