Automatically increase sort buffer size
shg8 committed Feb 23, 2024
1 parent 571c0cc commit 8e0a94b
Showing 6 changed files with 137 additions and 43 deletions.
64 changes: 46 additions & 18 deletions 3dgs/Renderer.cpp
@@ -15,8 +15,6 @@

#include <spdlog/spdlog.h>

-#define SORT_ALLOCATE_MULTIPLIER 10
-
void Renderer::initialize() {
initializeVulkan();
createGui();
@@ -189,17 +187,17 @@ void Renderer::createPrefixSumPipeline() {

void Renderer::createRadixSortPipeline() {
spdlog::debug("Creating radix sort pipeline");
-    sortKBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * SORT_ALLOCATE_MULTIPLIER,
+    sortKBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier,
false);
-    sortKBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * SORT_ALLOCATE_MULTIPLIER,
+    sortKBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier,
false);
-    sortVBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * SORT_ALLOCATE_MULTIPLIER,
+    sortVBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier,
false);
-    sortVBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * SORT_ALLOCATE_MULTIPLIER,
+    sortVBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier,
false);

-    uint32_t globalInvocationSize = scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER / numRadixSortBlocksPerWorkgroup;
-    uint32_t remainder = scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER % numRadixSortBlocksPerWorkgroup;
+    uint32_t globalInvocationSize = scene->getNumVertices() * sortBufferSizeMultiplier / numRadixSortBlocksPerWorkgroup;
+    uint32_t remainder = scene->getNumVertices() * sortBufferSizeMultiplier % numRadixSortBlocksPerWorkgroup;
globalInvocationSize += remainder > 0 ? 1 : 0;

auto numWorkgroups = (globalInvocationSize + 256 - 1) / 256;
@@ -338,6 +336,7 @@ void Renderer::run() {
throw std::runtime_error("Failed to acquire swapchain image");
}

+    startOfRenderLoop:
handleInput();

updateUniforms();
@@ -351,7 +350,9 @@
}
context->device->resetFences(inflightFences[0].get());

-    recordRenderCommandBuffer(0);
+    if (!recordRenderCommandBuffer(0)) {
+        goto startOfRenderLoop;
+    }
vk::PipelineStageFlags waitStage = vk::PipelineStageFlagBits::eComputeShader;
submitInfo = vk::SubmitInfo {}.setWaitSemaphores(swapchain->imageAvailableSemaphores[0].get())
.setCommandBuffers(renderCommandBuffer.get())
@@ -421,9 +422,12 @@ void Renderer::createCommandPool() {

void Renderer::recordPreprocessCommandBuffer() {
spdlog::debug("Recording preprocess command buffer");
-    vk::CommandBufferAllocateInfo allocateInfo = {commandPool.get(), vk::CommandBufferLevel::ePrimary, 1};
-    auto buffers = context->device->allocateCommandBuffersUnique(allocateInfo);
-    preprocessCommandBuffer = std::move(buffers[0]);
+    if (!preprocessCommandBuffer) {
+        vk::CommandBufferAllocateInfo allocateInfo = {commandPool.get(), vk::CommandBufferLevel::ePrimary, 1};
+        auto buffers = context->device->allocateCommandBuffersUnique(allocateInfo);
+        preprocessCommandBuffer = std::move(buffers[0]);
+    }
+    preprocessCommandBuffer->reset();

auto numGroups = (scene->getNumVertices() + 255) / 256;

@@ -492,20 +496,42 @@ void Renderer::recordPreprocessCommandBuffer() {
}


-void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
+bool Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
+    if (!renderCommandBuffer) {
+        renderCommandBuffer = std::move(context->device->allocateCommandBuffersUnique(
+            vk::CommandBufferAllocateInfo(commandPool.get(), vk::CommandBufferLevel::ePrimary, 1))[0]);
+    }

+    uint32_t numInstances = totalSumBufferHost->readOne<uint32_t>();
+    if (numInstances > scene->getNumVertices() * sortBufferSizeMultiplier) {
+        auto old = sortBufferSizeMultiplier;
+        while (numInstances > scene->getNumVertices() * sortBufferSizeMultiplier) {
+            sortBufferSizeMultiplier++;
+        }
+        spdlog::info("Reallocating sort buffers. {} -> {}", old, sortBufferSizeMultiplier);
+        sortKBufferEven->realloc(scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier);
+        sortKBufferOdd->realloc(scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier);
+        sortVBufferEven->realloc(scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier);
+        sortVBufferOdd->realloc(scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier);

+        uint32_t globalInvocationSize = scene->getNumVertices() * sortBufferSizeMultiplier / numRadixSortBlocksPerWorkgroup;
+        uint32_t remainder = scene->getNumVertices() * sortBufferSizeMultiplier % numRadixSortBlocksPerWorkgroup;
+        globalInvocationSize += remainder > 0 ? 1 : 0;
+
+        auto numWorkgroups = (globalInvocationSize + 256 - 1) / 256;
+
+        sortHistBuffer->realloc(numWorkgroups * 256 * sizeof(uint32_t));
+
+        recordPreprocessCommandBuffer();
+        return false;
+    }

renderCommandBuffer->reset({});
renderCommandBuffer->begin(vk::CommandBufferBeginInfo{});

-    uint32_t numInstances = totalSumBufferHost->readOne<uint32_t>();
// std::cout << "Num instances: " << numInstances << std::endl;
-    if (numInstances > scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER) {
-        throw std::runtime_error("Gaussian instantiation out of memory");
-    }
-    assert(numInstances <= scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER);

+    assert(numInstances <= scene->getNumVertices() * sortBufferSizeMultiplier);
for (auto i = 0; i < 8; i++) {
sortHistPipeline->bind(renderCommandBuffer, 0, i % 2 == 0 ? 0 : 1);
if (i == 0) {
@@ -625,6 +651,8 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
vk::DependencyFlagBits::eByRegion, nullptr, nullptr, imageMemoryBarrier);
}
renderCommandBuffer->end();
+
+    return true;
}

void Renderer::updateUniforms() {
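Note on the control flow above: recordRenderCommandBuffer() is now fallible. When the preprocess pass emits more Gaussian instances than the sort buffers can hold, it grows sortBufferSizeMultiplier until the frame fits, reallocates the sort key/value and histogram buffers, re-records the preprocess command buffer, and returns false; run() then jumps back to startOfRenderLoop and retries the frame instead of throwing "Gaussian instantiation out of memory". A condensed sketch of that growth policy follows; GpuBuffer, SortCapacity, and fits() are hypothetical stand-ins, not names from this codebase.

#include <cstdint>

// Hypothetical stand-in for this project's Buffer::realloc.
struct GpuBuffer {
    void realloc(uint64_t newSize) { /* destroy + recreate the allocation */ (void) newSize; }
};

struct SortCapacity {
    GpuBuffer keysEven, keysOdd, valuesEven, valuesOdd;
    uint32_t multiplier = 3; // initial guess; grown on demand, never shrunk

    // Returns false when the frame must be retried: the buffers were regrown,
    // so command buffers recorded against the old allocations are stale.
    bool fits(uint32_t numInstances, uint32_t numVertices) {
        if (numInstances <= uint64_t(numVertices) * multiplier)
            return true;
        while (numInstances > uint64_t(numVertices) * multiplier)
            ++multiplier; // smallest whole multiple of the vertex count that fits
        keysEven.realloc(uint64_t(numVertices) * sizeof(uint64_t) * multiplier);
        keysOdd.realloc(uint64_t(numVertices) * sizeof(uint64_t) * multiplier);
        valuesEven.realloc(uint64_t(numVertices) * sizeof(uint32_t) * multiplier);
        valuesOdd.realloc(uint64_t(numVertices) * sizeof(uint32_t) * multiplier);
        return false; // caller re-records its command buffers and retries the frame
    }
};

Sizing in whole multiples of the vertex count, rather than to the exact instance count, keeps growth monotonic and amortizes reallocations across frames.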
4 changes: 3 additions & 1 deletion 3dgs/Renderer.h
@@ -137,6 +137,8 @@ class Renderer {
int fpsCounter = 0;
std::chrono::high_resolution_clock::time_point lastFpsTime = std::chrono::high_resolution_clock::now();

+    unsigned int sortBufferSizeMultiplier = 3;
+
void initializeVulkan();

void loadSceneToGPU();
@@ -155,7 +157,7 @@

void recordPreprocessCommandBuffer();

-    void recordRenderCommandBuffer(uint32_t currentFrame);
+    bool recordRenderCommandBuffer(uint32_t currentFrame);

void createCommandPool();

86 changes: 65 additions & 21 deletions vulkan/Buffer.cpp
@@ -3,15 +3,9 @@
#include "Buffer.h"

#include "Utils.h"
#include "spdlog/spdlog.h"

-Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk::BufferUsageFlags usage,
-               VmaMemoryUsage vmaUsage, VmaAllocationCreateFlags flags, bool shared, vk::DeviceSize alignment)
-    : context(_context),
-      size(size),
-      usage(usage),
-      vmaUsage(vmaUsage),
-      flags(flags),
-      allocation(nullptr) {
+void Buffer::alloc() {
auto bufferInfo = vk::BufferCreateInfo()
.setSize(size)
.setUsage(usage)
@@ -33,7 +27,8 @@ Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk

VkResult res;
if (alignment != 0) {
-        res = vmaCreateBufferWithAlignment(context->allocator, &vkBufferInfo, &allocInfo, alignment, &vkBuffer, &allocation, &allocation_info);
+        res = vmaCreateBufferWithAlignment(context->allocator, &vkBufferInfo, &allocInfo, alignment, &vkBuffer,
+                                           &allocation, &allocation_info);
} else {
res = vmaCreateBuffer(context->allocator, &vkBufferInfo, &allocInfo, &vkBuffer, &allocation, &allocation_info);
}
@@ -43,20 +38,33 @@ Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk
buffer = static_cast<vk::Buffer>(vkBuffer);
}

+Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk::BufferUsageFlags usage,
+               VmaMemoryUsage vmaUsage, VmaAllocationCreateFlags flags, bool shared, vk::DeviceSize alignment)
+    : context(_context),
+      size(size),
+      alignment(alignment),
+      shared(shared),
+      usage(usage),
+      vmaUsage(vmaUsage),
+      flags(flags),
+      allocation(nullptr) {
+    alloc();
+}
+
Buffer Buffer::createStagingBuffer(uint32_t size) {
return Buffer(context, size, vk::BufferUsageFlagBits::eTransferSrc,
VMA_MEMORY_USAGE_AUTO,
VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT, false);
}

-void Buffer::upload(const void *data, uint32_t size, uint32_t offset) {
+void Buffer::upload(const void* data, uint32_t size, uint32_t offset) {
if (size + offset > this->size) {
throw std::runtime_error("Buffer overflow");
}

if (vmaUsage == VMA_MEMORY_USAGE_GPU_ONLY || vmaUsage == VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE) {
auto stagingBuffer = createStagingBuffer(size);
-        memcpy(stagingBuffer.allocation_info.pMappedData, ((char*) data) + offset, size);
+        memcpy(stagingBuffer.allocation_info.pMappedData, ((char *) data) + offset, size);
auto commandBuffer = context->beginOneTimeCommandBuffer();
vk::BufferCopy copyRegion = {};
copyRegion.setSize(size);
Expand Down Expand Up @@ -103,28 +111,62 @@ void Buffer::downloadTo(std::shared_ptr<Buffer> buffer, vk::DeviceSize srcOffset

Buffer::~Buffer() {
vmaDestroyBuffer(context->allocator, buffer, allocation);
+    spdlog::debug("Buffer destroyed");
}

+void Buffer::realloc(uint64_t newSize) {
+    vmaDestroyBuffer(context->allocator, buffer, allocation);
+
+    size = newSize;
+    alloc();
+
+    std::vector<vk::WriteDescriptorSet> writeDescriptorSets;
+    for (auto& tuple: boundDescriptorSets) {
+        auto descriptorSet = std::get<0>(tuple);
+        auto shared = descriptorSet.lock();
+        if (shared) {
+            vk::DescriptorBufferInfo bufferInfo(buffer, 0, size);
+            writeDescriptorSets.emplace_back(shared->descriptorSets[std::get<1>(tuple)].get(),
+                                             std::get<2>(tuple), 0, 1,
+                                             std::get<3>(tuple), nullptr,
+                                             &bufferInfo);
+        }
+    }
+    if (!writeDescriptorSets.empty()) {
+        context->device->updateDescriptorSets(writeDescriptorSets, nullptr);
+    }
+}
+
+void Buffer::boundToDescriptorSet(std::weak_ptr<DescriptorSet> descriptorSet, uint32_t set, uint32_t binding,
+                                  vk::DescriptorType type) {
+    boundDescriptorSets.push_back({descriptorSet, set, binding, type});
+}
+
std::shared_ptr<Buffer> Buffer::uniform(std::shared_ptr<VulkanContext> context, uint32_t size, bool concurrentSharing) {
return std::make_shared<Buffer>(std::move(context), size, vk::BufferUsageFlagBits::eUniformBuffer,
-                                    VMA_MEMORY_USAGE_AUTO,
-                                    VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT, concurrentSharing);
+                                    VMA_MEMORY_USAGE_AUTO,
+                                    VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT,
+                                    concurrentSharing);
}

std::shared_ptr<Buffer> Buffer::staging(std::shared_ptr<VulkanContext> context, unsigned long size) {
-    return std::make_shared<Buffer>(context, size, vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
+    return std::make_shared<Buffer>(context, size,
+                                    vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
VMA_MEMORY_USAGE_AUTO, VMA_ALLOCATION_CREATE_MAPPED_BIT |
VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT,
false);
}

-std::shared_ptr<Buffer> Buffer::storage(std::shared_ptr<VulkanContext> context, uint64_t size, bool concurrentSharing, vk::DeviceSize alignment) {
-    return std::make_shared<Buffer>(context, size, vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc,
-                                    VMA_MEMORY_USAGE_GPU_ONLY, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT,
-                                    concurrentSharing, alignment);
+std::shared_ptr<Buffer> Buffer::storage(std::shared_ptr<VulkanContext> context, uint64_t size, bool concurrentSharing,
+                                        vk::DeviceSize alignment) {
+    return std::make_shared<Buffer>(context, size,
+                                    vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst |
+                                    vk::BufferUsageFlagBits::eTransferSrc,
+                                    VMA_MEMORY_USAGE_GPU_ONLY, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT,
+                                    concurrentSharing, alignment);
}

-void Buffer::assertEquals(char *data, size_t length) {
+void Buffer::assertEquals(char* data, size_t length) {
if (length > size) {
throw std::runtime_error("Buffer overflow");
}
Expand Down Expand Up @@ -170,6 +212,8 @@ void Buffer::computeWriteWriteBarrier(vk::CommandBuffer commandBuffer) {
std::vector<char> Buffer::download() {
auto stagingBuffer = Buffer::staging(context, size);
downloadTo(stagingBuffer);
-    return {(char *) stagingBuffer->allocation_info.pMappedData,
-            ((char *) stagingBuffer->allocation_info.pMappedData) + size};
+    return {
+        (char *) stagingBuffer->allocation_info.pMappedData,
+        ((char *) stagingBuffer->allocation_info.pMappedData) + size
+    };
}
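Buffer::realloc above works because each buffer records every descriptor set binding it participates in (the boundDescriptorSets tuples) and re-publishes its new handle after recreating the allocation, so long-lived descriptor sets never point at a destroyed VkBuffer. A minimal sketch of that pattern with simplified, hypothetical stand-ins (DescriptorSetStub, ReallocatableBuffer, rewrite()):

#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

struct DescriptorSetStub {
    // Stand-in for rewriting one binding via vkUpdateDescriptorSets
    // with a fresh VkDescriptorBufferInfo{ newHandle, 0, newSize }.
    void rewrite(uint32_t set, uint32_t binding, uint64_t newHandle, uint64_t newSize) {
        (void) set; (void) binding; (void) newHandle; (void) newSize;
    }
};

class ReallocatableBuffer {
public:
    // Recorded once, when a descriptor set is built against this buffer.
    void boundToDescriptorSet(std::weak_ptr<DescriptorSetStub> ds, uint32_t set, uint32_t binding) {
        bound.emplace_back(std::move(ds), std::make_pair(set, binding));
    }

    void realloc(uint64_t newSize) {
        size = newSize;
        ++handle; // stand-in for destroying and recreating the VkBuffer
        for (auto& [weakSet, where] : bound)
            if (auto set = weakSet.lock()) // expired sets are skipped, not resurrected
                set->rewrite(where.first, where.second, handle, size);
    }

private:
    uint64_t handle = 0;
    uint64_t size = 0;
    std::vector<std::pair<std::weak_ptr<DescriptorSetStub>, std::pair<uint32_t, uint32_t>>> bound;
};

Holding the registration as a weak_ptr is what lets buffers and descriptor sets be destroyed in either order.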
15 changes: 15 additions & 0 deletions vulkan/Buffer.h
@@ -3,9 +3,13 @@

#include <cstdint>
#include <memory>
+
+#include "DescriptorSet.h"
#include "VulkanContext.h"
#include "vk_mem_alloc.h"

+class DescriptorSet;
+
class Buffer : public std::enable_shared_from_this<Buffer> {
public:
Buffer(const std::shared_ptr<VulkanContext>& context, uint32_t size, vk::BufferUsageFlags usage, VmaMemoryUsage vmaUsage,
@@ -21,6 +25,10 @@ class Buffer : public std::enable_shared_from_this<Buffer> {

~Buffer();

+    void realloc(uint64_t uint64);
+
+    void boundToDescriptorSet(std::weak_ptr<DescriptorSet> descriptorSet, uint32_t set, uint32_t binding, vk::DescriptorType type);
+
static std::shared_ptr<Buffer> uniform(std::shared_ptr<VulkanContext> context, uint32_t size, bool concurrentSharing = false);

static std::shared_ptr<Buffer> staging(std::shared_ptr<VulkanContext> context, unsigned long size);
Expand Down Expand Up @@ -56,6 +64,9 @@ class Buffer : public std::enable_shared_from_this<Buffer> {

vk::DeviceSize size;
vk::BufferUsageFlags usage;
+    uint64_t alignment;
+    bool shared;
+
vk::Buffer buffer;
VmaAllocation allocation;
VmaAllocationInfo allocation_info;
@@ -65,8 +76,12 @@


private:
+    void alloc();
+
Buffer createStagingBuffer(uint32_t size);
std::shared_ptr<VulkanContext> context;
+
+    std::vector<std::tuple<std::weak_ptr<DescriptorSet>, uint32_t, uint32_t, vk::DescriptorType>> boundDescriptorSets;
};


2 changes: 2 additions & 0 deletions vulkan/DescriptorSet.cpp
@@ -45,6 +45,7 @@ void DescriptorSet::build() {
for (auto j = 0; j < maxOptions; j++) {
if (binding.second.size() == 1) {
if (binding.second[0].buffer != nullptr) {
+                        binding.second[0].buffer->boundToDescriptorSet(static_cast<std::weak_ptr<DescriptorSet>>(shared_from_this()), i * maxOptions + j, binding.first, binding.second[0].type);
writeDescriptorSets.emplace_back(descriptorSets[i * maxOptions + j].get(), binding.first, 0, 1,
binding.second[0].type, nullptr,
&binding.second[0].bufferInfo);
@@ -56,6 +57,7 @@ }
}
else {
if (binding.second.at(j).buffer != nullptr) {
+                        binding.second.at(j).buffer->boundToDescriptorSet(static_cast<std::weak_ptr<DescriptorSet>>(shared_from_this()), i * maxOptions + j, binding.first, binding.second.at(j).type);
writeDescriptorSets.emplace_back(descriptorSets[i * maxOptions + j].get(), binding.first, 0, 1,
binding.second.at(j).type, nullptr,
&binding.second.at(j).bufferInfo);
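Two details of these registration calls are worth noting. First, shared_from_this() is only valid on an object already owned by a std::shared_ptr (since C++17 it throws std::bad_weak_ptr otherwise), so a DescriptorSet must be created through shared_ptr before build() runs. Second, handing the buffer a weak_ptr rather than a shared_ptr presumably avoids an ownership cycle: the descriptor bindings already keep their buffers alive, so a buffer holding strong references back to its descriptor sets would keep both alive forever.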
9 changes: 6 additions & 3 deletions vulkan/DescriptorSet.h
@@ -9,7 +9,9 @@

#include "Swapchain.h"

-class DescriptorSet {
+class Buffer;
+
+class DescriptorSet : public std::enable_shared_from_this<DescriptorSet> {
public:
struct DescriptorBinding {
vk::DescriptorType type;
@@ -36,12 +38,13 @@ class DescriptorSet {

vk::UniqueDescriptorSetLayout descriptorSetLayout;

+    std::vector<vk::UniqueDescriptorSet> descriptorSets;
+    size_t maxOptions = 1;
+
private:
const std::shared_ptr<VulkanContext> context;
const uint8_t framesInFlight;
std::unordered_map<uint32_t, std::vector<DescriptorBinding>> bindings;
-    std::vector<vk::UniqueDescriptorSet> descriptorSets;
-    size_t maxOptions = 1;
};


