Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/dynamic-alloc: automatically increase sort buffer size #12

Merged
merged 1 commit into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 46 additions & 18 deletions 3dgs/Renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@

#include <spdlog/spdlog.h>

#define SORT_ALLOCATE_MULTIPLIER 10

void Renderer::initialize() {
initializeVulkan();
createGui();
Expand Down Expand Up @@ -189,17 +187,17 @@ void Renderer::createPrefixSumPipeline() {

void Renderer::createRadixSortPipeline() {
spdlog::debug("Creating radix sort pipeline");
sortKBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * SORT_ALLOCATE_MULTIPLIER,
sortKBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier,
false);
sortKBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * SORT_ALLOCATE_MULTIPLIER,
sortKBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier,
false);
sortVBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * SORT_ALLOCATE_MULTIPLIER,
sortVBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier,
false);
sortVBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * SORT_ALLOCATE_MULTIPLIER,
sortVBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier,
false);

uint32_t globalInvocationSize = scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER / numRadixSortBlocksPerWorkgroup;
uint32_t remainder = scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER % numRadixSortBlocksPerWorkgroup;
uint32_t globalInvocationSize = scene->getNumVertices() * sortBufferSizeMultiplier / numRadixSortBlocksPerWorkgroup;
uint32_t remainder = scene->getNumVertices() * sortBufferSizeMultiplier % numRadixSortBlocksPerWorkgroup;
globalInvocationSize += remainder > 0 ? 1 : 0;

auto numWorkgroups = (globalInvocationSize + 256 - 1) / 256;
Expand Down Expand Up @@ -338,6 +336,7 @@ void Renderer::run() {
throw std::runtime_error("Failed to acquire swapchain image");
}

startOfRenderLoop:
handleInput();

updateUniforms();
Expand All @@ -351,7 +350,9 @@ void Renderer::run() {
}
context->device->resetFences(inflightFences[0].get());

recordRenderCommandBuffer(0);
if (!recordRenderCommandBuffer(0)) {
goto startOfRenderLoop;
}
vk::PipelineStageFlags waitStage = vk::PipelineStageFlagBits::eComputeShader;
submitInfo = vk::SubmitInfo {}.setWaitSemaphores(swapchain->imageAvailableSemaphores[0].get())
.setCommandBuffers(renderCommandBuffer.get())
Expand Down Expand Up @@ -421,9 +422,12 @@ void Renderer::createCommandPool() {

void Renderer::recordPreprocessCommandBuffer() {
spdlog::debug("Recording preprocess command buffer");
vk::CommandBufferAllocateInfo allocateInfo = {commandPool.get(), vk::CommandBufferLevel::ePrimary, 1};
auto buffers = context->device->allocateCommandBuffersUnique(allocateInfo);
preprocessCommandBuffer = std::move(buffers[0]);
if (!preprocessCommandBuffer) {
vk::CommandBufferAllocateInfo allocateInfo = {commandPool.get(), vk::CommandBufferLevel::ePrimary, 1};
auto buffers = context->device->allocateCommandBuffersUnique(allocateInfo);
preprocessCommandBuffer = std::move(buffers[0]);
}
preprocessCommandBuffer->reset();

auto numGroups = (scene->getNumVertices() + 255) / 256;

Expand Down Expand Up @@ -492,20 +496,42 @@ void Renderer::recordPreprocessCommandBuffer() {
}


void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
bool Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
if (!renderCommandBuffer) {
renderCommandBuffer = std::move(context->device->allocateCommandBuffersUnique(
vk::CommandBufferAllocateInfo(commandPool.get(), vk::CommandBufferLevel::ePrimary, 1))[0]);
}

uint32_t numInstances = totalSumBufferHost->readOne<uint32_t>();
if (numInstances > scene->getNumVertices() * sortBufferSizeMultiplier) {
auto old = sortBufferSizeMultiplier;
while (numInstances > scene->getNumVertices() * sortBufferSizeMultiplier) {
sortBufferSizeMultiplier++;
}
spdlog::info("Reallocating sort buffers. {} -> {}", old, sortBufferSizeMultiplier);
sortKBufferEven->realloc(scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier);
sortKBufferOdd->realloc(scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier);
sortVBufferEven->realloc(scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier);
sortVBufferOdd->realloc(scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier);

uint32_t globalInvocationSize = scene->getNumVertices() * sortBufferSizeMultiplier / numRadixSortBlocksPerWorkgroup;
uint32_t remainder = scene->getNumVertices() * sortBufferSizeMultiplier % numRadixSortBlocksPerWorkgroup;
globalInvocationSize += remainder > 0 ? 1 : 0;

auto numWorkgroups = (globalInvocationSize + 256 - 1) / 256;

sortHistBuffer->realloc(numWorkgroups * 256 * sizeof(uint32_t));

recordPreprocessCommandBuffer();
return false;
}

renderCommandBuffer->reset({});
renderCommandBuffer->begin(vk::CommandBufferBeginInfo{});

uint32_t numInstances = totalSumBufferHost->readOne<uint32_t>();
// std::cout << "Num instances: " << numInstances << std::endl;
if (numInstances > scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER) {
throw std::runtime_error("Gaussian instantiation out of memory");
}
assert(numInstances <= scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER);

assert(numInstances <= scene->getNumVertices() * sortBufferSizeMultiplier);
for (auto i = 0; i < 8; i++) {
sortHistPipeline->bind(renderCommandBuffer, 0, i % 2 == 0 ? 0 : 1);
if (i == 0) {
Expand Down Expand Up @@ -625,6 +651,8 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
vk::DependencyFlagBits::eByRegion, nullptr, nullptr, imageMemoryBarrier);
}
renderCommandBuffer->end();

return true;
}

void Renderer::updateUniforms() {
Expand Down
4 changes: 3 additions & 1 deletion 3dgs/Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ class Renderer {
int fpsCounter = 0;
std::chrono::high_resolution_clock::time_point lastFpsTime = std::chrono::high_resolution_clock::now();

unsigned int sortBufferSizeMultiplier = 3;

void initializeVulkan();

void loadSceneToGPU();
Expand All @@ -155,7 +157,7 @@ class Renderer {

void recordPreprocessCommandBuffer();

void recordRenderCommandBuffer(uint32_t currentFrame);
bool recordRenderCommandBuffer(uint32_t currentFrame);

void createCommandPool();

Expand Down
86 changes: 65 additions & 21 deletions vulkan/Buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,9 @@
#include "Buffer.h"

#include "Utils.h"
#include "spdlog/spdlog.h"

Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk::BufferUsageFlags usage,
VmaMemoryUsage vmaUsage, VmaAllocationCreateFlags flags, bool shared, vk::DeviceSize alignment)
: context(_context),
size(size),
usage(usage),
vmaUsage(vmaUsage),
flags(flags),
allocation(nullptr) {
void Buffer::alloc() {
auto bufferInfo = vk::BufferCreateInfo()
.setSize(size)
.setUsage(usage)
Expand All @@ -33,7 +27,8 @@ Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk

VkResult res;
if (alignment != 0) {
res = vmaCreateBufferWithAlignment(context->allocator, &vkBufferInfo, &allocInfo, alignment, &vkBuffer, &allocation, &allocation_info);
res = vmaCreateBufferWithAlignment(context->allocator, &vkBufferInfo, &allocInfo, alignment, &vkBuffer,
&allocation, &allocation_info);
} else {
res = vmaCreateBuffer(context->allocator, &vkBufferInfo, &allocInfo, &vkBuffer, &allocation, &allocation_info);
}
Expand All @@ -43,20 +38,33 @@ Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk
buffer = static_cast<vk::Buffer>(vkBuffer);
}

// Constructs a GPU buffer of `size` bytes and immediately allocates backing
// memory via VMA (see alloc()).
//
// _context  - Vulkan context providing the VMA allocator; retained for the
//             lifetime of the buffer.
// size      - requested buffer size in bytes.
// usage     - Vulkan buffer usage flags (storage/uniform/transfer...).
// vmaUsage  - VMA memory-usage hint (e.g. GPU-only vs. host-visible).
// flags     - VMA allocation-create flags (e.g. persistently mapped).
// shared    - concurrent queue-family sharing; stored for realloc()'s re-creation.
// alignment - minimum alignment; nonzero routes through
//             vmaCreateBufferWithAlignment in alloc().
//
// NOTE(review): the member-initializer order below (alignment/shared before
// usage) appears to differ from the declaration order in Buffer.h; members are
// always initialized in declaration order, so this is harmless here but may
// trigger -Wreorder — confirm against the full header.
Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk::BufferUsageFlags usage,
VmaMemoryUsage vmaUsage, VmaAllocationCreateFlags flags, bool shared, vk::DeviceSize alignment)
: context(_context),
size(size),
alignment(alignment),
shared(shared),
usage(usage),
vmaUsage(vmaUsage),
flags(flags),
allocation(nullptr) {
alloc();
}

// Builds a transient, persistently-mapped staging buffer of `size` bytes,
// suitable as the source of a host->device transfer.
Buffer Buffer::createStagingBuffer(uint32_t size) {
    constexpr VmaAllocationCreateFlags kStagingFlags =
            VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    return Buffer(context, size, vk::BufferUsageFlagBits::eTransferSrc,
                  VMA_MEMORY_USAGE_AUTO, kStagingFlags, false);
}

void Buffer::upload(const void *data, uint32_t size, uint32_t offset) {
void Buffer::upload(const void* data, uint32_t size, uint32_t offset) {
if (size + offset > this->size) {
throw std::runtime_error("Buffer overflow");
}

if (vmaUsage == VMA_MEMORY_USAGE_GPU_ONLY || vmaUsage == VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE) {
auto stagingBuffer = createStagingBuffer(size);
memcpy(stagingBuffer.allocation_info.pMappedData, ((char*) data) + offset, size);
memcpy(stagingBuffer.allocation_info.pMappedData, ((char *) data) + offset, size);
auto commandBuffer = context->beginOneTimeCommandBuffer();
vk::BufferCopy copyRegion = {};
copyRegion.setSize(size);
Expand Down Expand Up @@ -103,28 +111,62 @@ void Buffer::downloadTo(std::shared_ptr<Buffer> buffer, vk::DeviceSize srcOffset

// Releases the VkBuffer together with its VMA allocation in a single call.
// Safe even if construction partially failed: vmaDestroyBuffer tolerates the
// handles it is given here.
Buffer::~Buffer() {
vmaDestroyBuffer(context->allocator, buffer, allocation);
spdlog::debug("Buffer destroyed");
}

// Destroys the underlying VkBuffer and re-creates it with `newSize` bytes,
// then rewrites every descriptor-set binding registered via
// boundToDescriptorSet() so shaders see the new allocation.
//
// newSize - new buffer size in bytes.
//
// NOTE(review): the old buffer is destroyed immediately — callers must ensure
// the GPU has finished using it (fences waited) before calling.
void Buffer::realloc(uint64_t newSize) {
    vmaDestroyBuffer(context->allocator, buffer, allocation);

    size = newSize;
    alloc();

    // BUG FIX: each vk::WriteDescriptorSet stores a *pointer* to its
    // DescriptorBufferInfo (pBufferInfo). The previous code pointed at a
    // loop-local that was destroyed before updateDescriptorSets() ran —
    // undefined behavior. Keep the infos alive in a vector that outlives the
    // update call, and reserve up-front so push_back never reallocates and
    // invalidates the stored pointers.
    std::vector<vk::DescriptorBufferInfo> bufferInfos;
    bufferInfos.reserve(boundDescriptorSets.size());
    std::vector<vk::WriteDescriptorSet> writeDescriptorSets;
    writeDescriptorSets.reserve(boundDescriptorSets.size());
    for (auto& tuple: boundDescriptorSets) {
        auto descriptorSet = std::get<0>(tuple);
        auto shared = descriptorSet.lock();
        if (shared) {
            bufferInfos.emplace_back(buffer, 0, size);
            writeDescriptorSets.emplace_back(shared->descriptorSets[std::get<1>(tuple)].get(),
                                             std::get<2>(tuple), 0, 1,
                                             std::get<3>(tuple), nullptr,
                                             &bufferInfos.back());
        }
    }
    if (!writeDescriptorSets.empty()) {
        context->device->updateDescriptorSets(writeDescriptorSets, nullptr);
    }
}

// Registers a descriptor-set binding (set index, binding slot, descriptor
// type) as a consumer of this buffer, so realloc() can patch the binding after
// the underlying VkBuffer is re-created. Held weakly: an expired set is simply
// skipped at realloc time.
void Buffer::boundToDescriptorSet(std::weak_ptr<DescriptorSet> descriptorSet, uint32_t set, uint32_t binding,
                                  vk::DescriptorType type) {
    boundDescriptorSets.emplace_back(std::move(descriptorSet), set, binding, type);
}

// Factory: persistently-mapped, host-accessible uniform buffer of `size` bytes.
std::shared_ptr<Buffer> Buffer::uniform(std::shared_ptr<VulkanContext> context, uint32_t size, bool concurrentSharing) {
    const VmaAllocationCreateFlags allocFlags =
            VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
    return std::make_shared<Buffer>(std::move(context), size, vk::BufferUsageFlagBits::eUniformBuffer,
                                    VMA_MEMORY_USAGE_AUTO, allocFlags, concurrentSharing);
}

// Factory: persistently-mapped staging buffer usable as both transfer source
// and transfer destination (upload and readback).
std::shared_ptr<Buffer> Buffer::staging(std::shared_ptr<VulkanContext> context, unsigned long size) {
    const auto usage = vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst;
    const VmaAllocationCreateFlags allocFlags =
            VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    return std::make_shared<Buffer>(std::move(context), size, usage, VMA_MEMORY_USAGE_AUTO, allocFlags, false);
}

// Factory: GPU-only storage buffer with a dedicated allocation, also usable as
// a transfer source/destination so it can be filled and read back.
// alignment != 0 requests a minimum-alignment allocation.
std::shared_ptr<Buffer> Buffer::storage(std::shared_ptr<VulkanContext> context, uint64_t size, bool concurrentSharing,
                                        vk::DeviceSize alignment) {
    const auto usage = vk::BufferUsageFlagBits::eStorageBuffer
                       | vk::BufferUsageFlagBits::eTransferDst
                       | vk::BufferUsageFlagBits::eTransferSrc;
    return std::make_shared<Buffer>(std::move(context), size, usage,
                                    VMA_MEMORY_USAGE_GPU_ONLY, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT,
                                    concurrentSharing, alignment);
}

void Buffer::assertEquals(char *data, size_t length) {
void Buffer::assertEquals(char* data, size_t length) {
if (length > size) {
throw std::runtime_error("Buffer overflow");
}
Expand Down Expand Up @@ -170,6 +212,8 @@ void Buffer::computeWriteWriteBarrier(vk::CommandBuffer commandBuffer) {
// Copies the full contents of this (possibly device-local) buffer back to the
// host via a temporary mapped staging buffer and returns them as a byte vector.
std::vector<char> Buffer::download() {
    auto stagingBuffer = Buffer::staging(context, size);
    downloadTo(stagingBuffer);
    const char* mapped = static_cast<const char*>(stagingBuffer->allocation_info.pMappedData);
    return std::vector<char>(mapped, mapped + size);
}
15 changes: 15 additions & 0 deletions vulkan/Buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@

#include <cstdint>
#include <memory>

#include "DescriptorSet.h"
#include "VulkanContext.h"
#include "vk_mem_alloc.h"

class DescriptorSet;

class Buffer : public std::enable_shared_from_this<Buffer> {
public:
Buffer(const std::shared_ptr<VulkanContext>& context, uint32_t size, vk::BufferUsageFlags usage, VmaMemoryUsage vmaUsage,
Expand All @@ -21,6 +25,10 @@ class Buffer : public std::enable_shared_from_this<Buffer> {

~Buffer();

void realloc(uint64_t uint64);

void boundToDescriptorSet(std::weak_ptr<DescriptorSet> descriptorSet, uint32_t set, uint32_t binding, vk::DescriptorType type);

static std::shared_ptr<Buffer> uniform(std::shared_ptr<VulkanContext> context, uint32_t size, bool concurrentSharing = false);

static std::shared_ptr<Buffer> staging(std::shared_ptr<VulkanContext> context, unsigned long size);
Expand Down Expand Up @@ -56,6 +64,9 @@ class Buffer : public std::enable_shared_from_this<Buffer> {

vk::DeviceSize size;
vk::BufferUsageFlags usage;
uint64_t alignment;
bool shared;

vk::Buffer buffer;
VmaAllocation allocation;
VmaAllocationInfo allocation_info;
Expand All @@ -65,8 +76,12 @@ class Buffer : public std::enable_shared_from_this<Buffer> {


private:
void alloc();

Buffer createStagingBuffer(uint32_t size);
std::shared_ptr<VulkanContext> context;

std::vector<std::tuple<std::weak_ptr<DescriptorSet>, uint32_t, uint32_t, vk::DescriptorType>> boundDescriptorSets;
};


Expand Down
2 changes: 2 additions & 0 deletions vulkan/DescriptorSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ void DescriptorSet::build() {
for (auto j = 0; j < maxOptions; j++) {
if (binding.second.size() == 1) {
if (binding.second[0].buffer != nullptr) {
binding.second[0].buffer->boundToDescriptorSet(static_cast<std::weak_ptr<DescriptorSet>>(shared_from_this()), i * maxOptions + j, binding.first, binding.second[0].type);
writeDescriptorSets.emplace_back(descriptorSets[i * maxOptions + j].get(), binding.first, 0, 1,
binding.second[0].type, nullptr,
&binding.second[0].bufferInfo);
Expand All @@ -56,6 +57,7 @@ void DescriptorSet::build() {
}
else {
if (binding.second.at(j).buffer != nullptr) {
binding.second.at(j).buffer->boundToDescriptorSet(static_cast<std::weak_ptr<DescriptorSet>>(shared_from_this()), i * maxOptions + j, binding.first, binding.second.at(j).type);
writeDescriptorSets.emplace_back(descriptorSets[i * maxOptions + j].get(), binding.first, 0, 1,
binding.second.at(j).type, nullptr,
&binding.second.at(j).bufferInfo);
Expand Down
9 changes: 6 additions & 3 deletions vulkan/DescriptorSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

#include "Swapchain.h"

class DescriptorSet {
class Buffer;

class DescriptorSet : public std::enable_shared_from_this<DescriptorSet> {
public:
struct DescriptorBinding {
vk::DescriptorType type;
Expand All @@ -36,12 +38,13 @@ class DescriptorSet {

vk::UniqueDescriptorSetLayout descriptorSetLayout;

std::vector<vk::UniqueDescriptorSet> descriptorSets;
size_t maxOptions = 1;

private:
const std::shared_ptr<VulkanContext> context;
const uint8_t framesInFlight;
std::unordered_map<uint32_t, std::vector<DescriptorBinding>> bindings;
std::vector<vk::UniqueDescriptorSet> descriptorSets;
size_t maxOptions = 1;
};


Expand Down
Loading