From 8e0a94bf51998b910be6158829f4df6fe0d24658 Mon Sep 17 00:00:00 2001 From: shg8 Date: Fri, 23 Feb 2024 02:26:09 -0600 Subject: [PATCH] Automatically increase sort buffer size --- 3dgs/Renderer.cpp | 64 +++++++++++++++++++++--------- 3dgs/Renderer.h | 4 +- vulkan/Buffer.cpp | 86 ++++++++++++++++++++++++++++++---------- vulkan/Buffer.h | 15 +++++++ vulkan/DescriptorSet.cpp | 2 + vulkan/DescriptorSet.h | 9 +++-- 6 files changed, 137 insertions(+), 43 deletions(-) diff --git a/3dgs/Renderer.cpp b/3dgs/Renderer.cpp index 91215f9..969e5f2 100644 --- a/3dgs/Renderer.cpp +++ b/3dgs/Renderer.cpp @@ -15,8 +15,6 @@ #include -#define SORT_ALLOCATE_MULTIPLIER 10 - void Renderer::initialize() { initializeVulkan(); createGui(); @@ -189,17 +187,17 @@ void Renderer::createPrefixSumPipeline() { void Renderer::createRadixSortPipeline() { spdlog::debug("Creating radix sort pipeline"); - sortKBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * SORT_ALLOCATE_MULTIPLIER, + sortKBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier, false); - sortKBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * SORT_ALLOCATE_MULTIPLIER, + sortKBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier, false); - sortVBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * SORT_ALLOCATE_MULTIPLIER, + sortVBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier, false); - sortVBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * SORT_ALLOCATE_MULTIPLIER, + sortVBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier, false); - uint32_t globalInvocationSize = scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER / numRadixSortBlocksPerWorkgroup; - uint32_t remainder = 
scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER % numRadixSortBlocksPerWorkgroup; + uint32_t globalInvocationSize = scene->getNumVertices() * sortBufferSizeMultiplier / numRadixSortBlocksPerWorkgroup; + uint32_t remainder = scene->getNumVertices() * sortBufferSizeMultiplier % numRadixSortBlocksPerWorkgroup; globalInvocationSize += remainder > 0 ? 1 : 0; auto numWorkgroups = (globalInvocationSize + 256 - 1) / 256; @@ -338,6 +336,7 @@ void Renderer::run() { throw std::runtime_error("Failed to acquire swapchain image"); } + startOfRenderLoop: handleInput(); updateUniforms(); @@ -351,7 +350,9 @@ void Renderer::run() { } context->device->resetFences(inflightFences[0].get()); - recordRenderCommandBuffer(0); + if (!recordRenderCommandBuffer(0)) { + goto startOfRenderLoop; + } vk::PipelineStageFlags waitStage = vk::PipelineStageFlagBits::eComputeShader; submitInfo = vk::SubmitInfo {}.setWaitSemaphores(swapchain->imageAvailableSemaphores[0].get()) .setCommandBuffers(renderCommandBuffer.get()) @@ -421,9 +422,12 @@ void Renderer::createCommandPool() { void Renderer::recordPreprocessCommandBuffer() { spdlog::debug("Recording preprocess command buffer"); - vk::CommandBufferAllocateInfo allocateInfo = {commandPool.get(), vk::CommandBufferLevel::ePrimary, 1}; - auto buffers = context->device->allocateCommandBuffersUnique(allocateInfo); - preprocessCommandBuffer = std::move(buffers[0]); + if (!preprocessCommandBuffer) { + vk::CommandBufferAllocateInfo allocateInfo = {commandPool.get(), vk::CommandBufferLevel::ePrimary, 1}; + auto buffers = context->device->allocateCommandBuffersUnique(allocateInfo); + preprocessCommandBuffer = std::move(buffers[0]); + } + preprocessCommandBuffer->reset(); auto numGroups = (scene->getNumVertices() + 255) / 256; @@ -492,20 +496,42 @@ void Renderer::recordPreprocessCommandBuffer() { } -void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) { +bool Renderer::recordRenderCommandBuffer(uint32_t currentFrame) { if (!renderCommandBuffer) { 
renderCommandBuffer = std::move(context->device->allocateCommandBuffersUnique( vk::CommandBufferAllocateInfo(commandPool.get(), vk::CommandBufferLevel::ePrimary, 1))[0]); } + + uint32_t numInstances = totalSumBufferHost->readOne(); + if (numInstances > scene->getNumVertices() * sortBufferSizeMultiplier) { + auto old = sortBufferSizeMultiplier; + while (numInstances > scene->getNumVertices() * sortBufferSizeMultiplier) { + sortBufferSizeMultiplier++; + } + spdlog::info("Reallocating sort buffers. {} -> {}", old, sortBufferSizeMultiplier); + sortKBufferEven->realloc(scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier); + sortKBufferOdd->realloc(scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier); + sortVBufferEven->realloc(scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier); + sortVBufferOdd->realloc(scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier); + + uint32_t globalInvocationSize = scene->getNumVertices() * sortBufferSizeMultiplier / numRadixSortBlocksPerWorkgroup; + uint32_t remainder = scene->getNumVertices() * sortBufferSizeMultiplier % numRadixSortBlocksPerWorkgroup; + globalInvocationSize += remainder > 0 ? 
1 : 0; + + auto numWorkgroups = (globalInvocationSize + 256 - 1) / 256; + + sortHistBuffer->realloc(numWorkgroups * 256 * sizeof(uint32_t)); + + recordPreprocessCommandBuffer(); + return false; + } + renderCommandBuffer->reset({}); renderCommandBuffer->begin(vk::CommandBufferBeginInfo{}); - uint32_t numInstances = totalSumBufferHost->readOne(); // std::cout << "Num instances: " << numInstances << std::endl; - if (numInstances > scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER) { - throw std::runtime_error("Gaussian instantiation out of memory"); - } - assert(numInstances <= scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER); + + assert(numInstances <= scene->getNumVertices() * sortBufferSizeMultiplier); for (auto i = 0; i < 8; i++) { sortHistPipeline->bind(renderCommandBuffer, 0, i % 2 == 0 ? 0 : 1); if (i == 0) { @@ -625,6 +651,8 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) { vk::DependencyFlagBits::eByRegion, nullptr, nullptr, imageMemoryBarrier); } renderCommandBuffer->end(); + + return true; } void Renderer::updateUniforms() { diff --git a/3dgs/Renderer.h b/3dgs/Renderer.h index 0359d0e..1bf6c18 100644 --- a/3dgs/Renderer.h +++ b/3dgs/Renderer.h @@ -137,6 +137,8 @@ class Renderer { int fpsCounter = 0; std::chrono::high_resolution_clock::time_point lastFpsTime = std::chrono::high_resolution_clock::now(); + unsigned int sortBufferSizeMultiplier = 3; + void initializeVulkan(); void loadSceneToGPU(); @@ -155,7 +157,7 @@ class Renderer { void recordPreprocessCommandBuffer(); - void recordRenderCommandBuffer(uint32_t currentFrame); + bool recordRenderCommandBuffer(uint32_t currentFrame); void createCommandPool(); diff --git a/vulkan/Buffer.cpp b/vulkan/Buffer.cpp index 13bbf1a..c81896e 100644 --- a/vulkan/Buffer.cpp +++ b/vulkan/Buffer.cpp @@ -3,15 +3,9 @@ #include "Buffer.h" #include "Utils.h" +#include "spdlog/spdlog.h" -Buffer::Buffer(const std::shared_ptr& _context, uint32_t size, vk::BufferUsageFlags usage, - VmaMemoryUsage vmaUsage, 
VmaAllocationCreateFlags flags, bool shared, vk::DeviceSize alignment) - : context(_context), - size(size), - usage(usage), - vmaUsage(vmaUsage), - flags(flags), - allocation(nullptr) { +void Buffer::alloc() { auto bufferInfo = vk::BufferCreateInfo() .setSize(size) .setUsage(usage) @@ -33,7 +27,8 @@ Buffer::Buffer(const std::shared_ptr& _context, uint32_t size, vk VkResult res; if (alignment != 0) { - res = vmaCreateBufferWithAlignment(context->allocator, &vkBufferInfo, &allocInfo, alignment, &vkBuffer, &allocation, &allocation_info); + res = vmaCreateBufferWithAlignment(context->allocator, &vkBufferInfo, &allocInfo, alignment, &vkBuffer, + &allocation, &allocation_info); } else { res = vmaCreateBuffer(context->allocator, &vkBufferInfo, &allocInfo, &vkBuffer, &allocation, &allocation_info); } @@ -43,20 +38,33 @@ Buffer::Buffer(const std::shared_ptr& _context, uint32_t size, vk buffer = static_cast(vkBuffer); } +Buffer::Buffer(const std::shared_ptr& _context, uint32_t size, vk::BufferUsageFlags usage, + VmaMemoryUsage vmaUsage, VmaAllocationCreateFlags flags, bool shared, vk::DeviceSize alignment) + : context(_context), + size(size), + alignment(alignment), + shared(shared), + usage(usage), + vmaUsage(vmaUsage), + flags(flags), + allocation(nullptr) { + alloc(); +} + Buffer Buffer::createStagingBuffer(uint32_t size) { return Buffer(context, size, vk::BufferUsageFlagBits::eTransferSrc, VMA_MEMORY_USAGE_AUTO, VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT, false); } -void Buffer::upload(const void *data, uint32_t size, uint32_t offset) { +void Buffer::upload(const void* data, uint32_t size, uint32_t offset) { if (size + offset > this->size) { throw std::runtime_error("Buffer overflow"); } if (vmaUsage == VMA_MEMORY_USAGE_GPU_ONLY || vmaUsage == VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE) { auto stagingBuffer = createStagingBuffer(size); - memcpy(stagingBuffer.allocation_info.pMappedData, ((char*) data) + offset, size); + 
memcpy(stagingBuffer.allocation_info.pMappedData, ((char *) data) + offset, size);
         auto commandBuffer = context->beginOneTimeCommandBuffer();
         vk::BufferCopy copyRegion = {};
         copyRegion.setSize(size);
@@ -103,28 +111,65 @@ void Buffer::downloadTo(std::shared_ptr<Buffer> buffer, vk::DeviceSize srcOffset
 
 Buffer::~Buffer() {
     vmaDestroyBuffer(context->allocator, buffer, allocation);
+    spdlog::debug("Buffer destroyed");
+}
+
+void Buffer::realloc(uint64_t newSize) {
+    vmaDestroyBuffer(context->allocator, buffer, allocation);
+
+    size = newSize;
+    alloc();
+
+    // vk::WriteDescriptorSet only stores a POINTER to the buffer info, so the
+    // infos must outlive the updateDescriptorSets call; reserve() keeps the
+    // addresses stable while the vector grows.
+    std::vector<vk::DescriptorBufferInfo> bufferInfos;
+    bufferInfos.reserve(boundDescriptorSets.size());
+    std::vector<vk::WriteDescriptorSet> writeDescriptorSets;
+    for (auto& tuple: boundDescriptorSets) {
+        auto descriptorSet = std::get<0>(tuple);
+        auto shared = descriptorSet.lock();
+        if (shared) {
+            bufferInfos.emplace_back(buffer, 0, size);
+            writeDescriptorSets.emplace_back(shared->descriptorSets[std::get<1>(tuple)].get(),
+                                             std::get<2>(tuple), 0, 1,
+                                             std::get<3>(tuple), nullptr,
+                                             &bufferInfos.back());
+        }
+    }
+    if (!writeDescriptorSets.empty()) {
+        context->device->updateDescriptorSets(writeDescriptorSets, nullptr);
+    }
+}
+
+void Buffer::boundToDescriptorSet(std::weak_ptr<DescriptorSet> descriptorSet, uint32_t set, uint32_t binding,
+                                  vk::DescriptorType type) {
+    boundDescriptorSets.push_back({descriptorSet, set, binding, type});
 }
 
 std::shared_ptr<Buffer> Buffer::uniform(std::shared_ptr<VulkanContext> context, uint32_t size, bool concurrentSharing) {
     return std::make_shared<Buffer>(std::move(context), size, vk::BufferUsageFlagBits::eUniformBuffer,
-                                    VMA_MEMORY_USAGE_AUTO,
-                                    VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT, concurrentSharing);
+                                    VMA_MEMORY_USAGE_AUTO,
+                                    VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT,
+                                    concurrentSharing);
 }
 
 std::shared_ptr<Buffer> Buffer::staging(std::shared_ptr<VulkanContext> context, unsigned long size) {
-    return std::make_shared<Buffer>(context, size, vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
+    return std::make_shared<Buffer>(context, size,
+                                    vk::BufferUsageFlagBits::eTransferSrc | 
vk::BufferUsageFlagBits::eTransferDst, VMA_MEMORY_USAGE_AUTO, VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT, false); } -std::shared_ptr Buffer::storage(std::shared_ptr context, uint64_t size, bool concurrentSharing, vk::DeviceSize alignment) { - return std::make_shared(context, size, vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc, - VMA_MEMORY_USAGE_GPU_ONLY, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT, - concurrentSharing, alignment); +std::shared_ptr Buffer::storage(std::shared_ptr context, uint64_t size, bool concurrentSharing, + vk::DeviceSize alignment) { + return std::make_shared(context, size, + vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst | + vk::BufferUsageFlagBits::eTransferSrc, + VMA_MEMORY_USAGE_GPU_ONLY, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT, + concurrentSharing, alignment); } -void Buffer::assertEquals(char *data, size_t length) { +void Buffer::assertEquals(char* data, size_t length) { if (length > size) { throw std::runtime_error("Buffer overflow"); } @@ -170,6 +212,8 @@ void Buffer::computeWriteWriteBarrier(vk::CommandBuffer commandBuffer) { std::vector Buffer::download() { auto stagingBuffer = Buffer::staging(context, size); downloadTo(stagingBuffer); - return {(char *) stagingBuffer->allocation_info.pMappedData, - ((char *) stagingBuffer->allocation_info.pMappedData) + size}; + return { + (char *) stagingBuffer->allocation_info.pMappedData, + ((char *) stagingBuffer->allocation_info.pMappedData) + size + }; } diff --git a/vulkan/Buffer.h b/vulkan/Buffer.h index e94908d..b2602bc 100644 --- a/vulkan/Buffer.h +++ b/vulkan/Buffer.h @@ -3,9 +3,13 @@ #include #include + +#include "DescriptorSet.h" #include "VulkanContext.h" #include "vk_mem_alloc.h" +class DescriptorSet; + class Buffer : public std::enable_shared_from_this { public: Buffer(const std::shared_ptr& context, uint32_t size, 
vk::BufferUsageFlags usage, VmaMemoryUsage vmaUsage,
@@ -21,6 +25,10 @@ class Buffer : public std::enable_shared_from_this<Buffer> {
 
     ~Buffer();
 
+    void realloc(uint64_t newSize);
+
+    void boundToDescriptorSet(std::weak_ptr<DescriptorSet> descriptorSet, uint32_t set, uint32_t binding, vk::DescriptorType type);
+
     static std::shared_ptr<Buffer> uniform(std::shared_ptr<VulkanContext> context, uint32_t size, bool concurrentSharing = false);
 
     static std::shared_ptr<Buffer> staging(std::shared_ptr<VulkanContext> context, unsigned long size);
@@ -56,6 +64,9 @@ class Buffer : public std::enable_shared_from_this<Buffer> {
     vk::DeviceSize size;
     vk::BufferUsageFlags usage;
 
+    uint64_t alignment;
+    bool shared;
+
     vk::Buffer buffer;
     VmaAllocation allocation;
     VmaAllocationInfo allocation_info;
@@ -65,8 +76,12 @@
 private:
+    void alloc();
+
     Buffer createStagingBuffer(uint32_t size);
 
     std::shared_ptr<VulkanContext> context;
+
+    std::vector<std::tuple<std::weak_ptr<DescriptorSet>, uint32_t, uint32_t, vk::DescriptorType>> boundDescriptorSets;
 };
diff --git a/vulkan/DescriptorSet.cpp b/vulkan/DescriptorSet.cpp
index 72367e1..e25d9ec 100644
--- a/vulkan/DescriptorSet.cpp
+++ b/vulkan/DescriptorSet.cpp
@@ -45,6 +45,7 @@ void DescriptorSet::build() {
         for (auto j = 0; j < maxOptions; j++) {
             if (binding.second.size() == 1) {
                 if (binding.second[0].buffer != nullptr) {
+                    binding.second[0].buffer->boundToDescriptorSet(static_cast<std::weak_ptr<DescriptorSet>>(shared_from_this()), i * maxOptions + j, binding.first, binding.second[0].type);
                     writeDescriptorSets.emplace_back(descriptorSets[i * maxOptions + j].get(), binding.first,
                                                      0, 1, binding.second[0].type, nullptr,
                                                      &binding.second[0].bufferInfo);
@@ -56,6 +57,7 @@
             } else {
                 if (binding.second.at(j).buffer != nullptr) {
+                    binding.second.at(j).buffer->boundToDescriptorSet(static_cast<std::weak_ptr<DescriptorSet>>(shared_from_this()), i * maxOptions + j, binding.first, binding.second.at(j).type);
                     writeDescriptorSets.emplace_back(descriptorSets[i * maxOptions + j].get(), binding.first,
                                                      0, 1, binding.second.at(j).type, nullptr,
                                                      
&binding.second.at(j).bufferInfo); diff --git a/vulkan/DescriptorSet.h b/vulkan/DescriptorSet.h index adac056..3b3bd4a 100644 --- a/vulkan/DescriptorSet.h +++ b/vulkan/DescriptorSet.h @@ -9,7 +9,9 @@ #include "Swapchain.h" -class DescriptorSet { +class Buffer; + +class DescriptorSet : public std::enable_shared_from_this { public: struct DescriptorBinding { vk::DescriptorType type; @@ -36,12 +38,13 @@ class DescriptorSet { vk::UniqueDescriptorSetLayout descriptorSetLayout; + std::vector descriptorSets; + size_t maxOptions = 1; + private: const std::shared_ptr context; const uint8_t framesInFlight; std::unordered_map> bindings; - std::vector descriptorSets; - size_t maxOptions = 1; };