Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/dynamic-alloc: automatically increase sort buffer size #12

Merged
merged 1 commit into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 46 additions & 18 deletions 3dgs/Renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@

#include <spdlog/spdlog.h>

#define SORT_ALLOCATE_MULTIPLIER 10

void Renderer::initialize() {
initializeVulkan();
createGui();
Expand Down Expand Up @@ -189,17 +187,17 @@ void Renderer::createPrefixSumPipeline() {

void Renderer::createRadixSortPipeline() {
spdlog::debug("Creating radix sort pipeline");
sortKBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * SORT_ALLOCATE_MULTIPLIER,
sortKBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier,
false);
sortKBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * SORT_ALLOCATE_MULTIPLIER,
sortKBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier,
false);
sortVBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * SORT_ALLOCATE_MULTIPLIER,
sortVBufferEven = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier,
false);
sortVBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * SORT_ALLOCATE_MULTIPLIER,
sortVBufferOdd = Buffer::storage(context, scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier,
false);

uint32_t globalInvocationSize = scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER / numRadixSortBlocksPerWorkgroup;
uint32_t remainder = scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER % numRadixSortBlocksPerWorkgroup;
uint32_t globalInvocationSize = scene->getNumVertices() * sortBufferSizeMultiplier / numRadixSortBlocksPerWorkgroup;
uint32_t remainder = scene->getNumVertices() * sortBufferSizeMultiplier % numRadixSortBlocksPerWorkgroup;
globalInvocationSize += remainder > 0 ? 1 : 0;

auto numWorkgroups = (globalInvocationSize + 256 - 1) / 256;
Expand Down Expand Up @@ -338,6 +336,7 @@ void Renderer::run() {
throw std::runtime_error("Failed to acquire swapchain image");
}

startOfRenderLoop:
handleInput();

updateUniforms();
Expand All @@ -351,7 +350,9 @@ void Renderer::run() {
}
context->device->resetFences(inflightFences[0].get());

recordRenderCommandBuffer(0);
if (!recordRenderCommandBuffer(0)) {
goto startOfRenderLoop;
}
vk::PipelineStageFlags waitStage = vk::PipelineStageFlagBits::eComputeShader;
submitInfo = vk::SubmitInfo {}.setWaitSemaphores(swapchain->imageAvailableSemaphores[0].get())
.setCommandBuffers(renderCommandBuffer.get())
Expand Down Expand Up @@ -421,9 +422,12 @@ void Renderer::createCommandPool() {

void Renderer::recordPreprocessCommandBuffer() {
spdlog::debug("Recording preprocess command buffer");
vk::CommandBufferAllocateInfo allocateInfo = {commandPool.get(), vk::CommandBufferLevel::ePrimary, 1};
auto buffers = context->device->allocateCommandBuffersUnique(allocateInfo);
preprocessCommandBuffer = std::move(buffers[0]);
if (!preprocessCommandBuffer) {
vk::CommandBufferAllocateInfo allocateInfo = {commandPool.get(), vk::CommandBufferLevel::ePrimary, 1};
auto buffers = context->device->allocateCommandBuffersUnique(allocateInfo);
preprocessCommandBuffer = std::move(buffers[0]);
}
preprocessCommandBuffer->reset();

auto numGroups = (scene->getNumVertices() + 255) / 256;

Expand Down Expand Up @@ -492,20 +496,42 @@ void Renderer::recordPreprocessCommandBuffer() {
}


void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
bool Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
if (!renderCommandBuffer) {
renderCommandBuffer = std::move(context->device->allocateCommandBuffersUnique(
vk::CommandBufferAllocateInfo(commandPool.get(), vk::CommandBufferLevel::ePrimary, 1))[0]);
}

uint32_t numInstances = totalSumBufferHost->readOne<uint32_t>();
if (numInstances > scene->getNumVertices() * sortBufferSizeMultiplier) {
auto old = sortBufferSizeMultiplier;
while (numInstances > scene->getNumVertices() * sortBufferSizeMultiplier) {
sortBufferSizeMultiplier++;
}
spdlog::info("Reallocating sort buffers. {} -> {}", old, sortBufferSizeMultiplier);
sortKBufferEven->realloc(scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier);
sortKBufferOdd->realloc(scene->getNumVertices() * sizeof(uint64_t) * sortBufferSizeMultiplier);
sortVBufferEven->realloc(scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier);
sortVBufferOdd->realloc(scene->getNumVertices() * sizeof(uint32_t) * sortBufferSizeMultiplier);

uint32_t globalInvocationSize = scene->getNumVertices() * sortBufferSizeMultiplier / numRadixSortBlocksPerWorkgroup;
uint32_t remainder = scene->getNumVertices() * sortBufferSizeMultiplier % numRadixSortBlocksPerWorkgroup;
globalInvocationSize += remainder > 0 ? 1 : 0;

auto numWorkgroups = (globalInvocationSize + 256 - 1) / 256;

sortHistBuffer->realloc(numWorkgroups * 256 * sizeof(uint32_t));

recordPreprocessCommandBuffer();
return false;
}

renderCommandBuffer->reset({});
renderCommandBuffer->begin(vk::CommandBufferBeginInfo{});

uint32_t numInstances = totalSumBufferHost->readOne<uint32_t>();
// std::cout << "Num instances: " << numInstances << std::endl;
if (numInstances > scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER) {
throw std::runtime_error("Gaussian instantiation out of memory");
}
assert(numInstances <= scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER);

assert(numInstances <= scene->getNumVertices() * sortBufferSizeMultiplier);
for (auto i = 0; i < 8; i++) {
sortHistPipeline->bind(renderCommandBuffer, 0, i % 2 == 0 ? 0 : 1);
if (i == 0) {
Expand Down Expand Up @@ -625,6 +651,8 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
vk::DependencyFlagBits::eByRegion, nullptr, nullptr, imageMemoryBarrier);
}
renderCommandBuffer->end();

return true;
}

void Renderer::updateUniforms() {
Expand Down
4 changes: 3 additions & 1 deletion 3dgs/Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ class Renderer {
int fpsCounter = 0;
std::chrono::high_resolution_clock::time_point lastFpsTime = std::chrono::high_resolution_clock::now();

unsigned int sortBufferSizeMultiplier = 3;

void initializeVulkan();

void loadSceneToGPU();
Expand All @@ -155,7 +157,7 @@ class Renderer {

void recordPreprocessCommandBuffer();

void recordRenderCommandBuffer(uint32_t currentFrame);
bool recordRenderCommandBuffer(uint32_t currentFrame);

void createCommandPool();

Expand Down
86 changes: 65 additions & 21 deletions vulkan/Buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,9 @@
#include "Buffer.h"

#include "Utils.h"
#include "spdlog/spdlog.h"

Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk::BufferUsageFlags usage,
VmaMemoryUsage vmaUsage, VmaAllocationCreateFlags flags, bool shared, vk::DeviceSize alignment)
: context(_context),
size(size),
usage(usage),
vmaUsage(vmaUsage),
flags(flags),
allocation(nullptr) {
void Buffer::alloc() {
auto bufferInfo = vk::BufferCreateInfo()
.setSize(size)
.setUsage(usage)
Expand All @@ -33,7 +27,8 @@ Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk

VkResult res;
if (alignment != 0) {
res = vmaCreateBufferWithAlignment(context->allocator, &vkBufferInfo, &allocInfo, alignment, &vkBuffer, &allocation, &allocation_info);
res = vmaCreateBufferWithAlignment(context->allocator, &vkBufferInfo, &allocInfo, alignment, &vkBuffer,
&allocation, &allocation_info);
} else {
res = vmaCreateBuffer(context->allocator, &vkBufferInfo, &allocInfo, &vkBuffer, &allocation, &allocation_info);
}
Expand All @@ -43,20 +38,33 @@ Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk
buffer = static_cast<vk::Buffer>(vkBuffer);
}

// Constructs a GPU buffer of `size` bytes and immediately allocates backing
// memory via VMA (see alloc()).
//
// _context  - Vulkan context providing the VMA allocator; retained for the
//             lifetime of the buffer.
// size      - requested buffer size in bytes.
// usage     - Vulkan buffer usage flags (storage/uniform/transfer...).
// vmaUsage  - VMA memory-usage hint (e.g. GPU-only vs. host-visible).
// flags     - VMA allocation-create flags (e.g. persistently mapped).
// shared    - concurrent queue-family sharing; stored for realloc()'s re-creation.
// alignment - minimum alignment; nonzero routes through
//             vmaCreateBufferWithAlignment in alloc().
//
// NOTE(review): the member-initializer order below (alignment/shared before
// usage) appears to differ from the declaration order in Buffer.h; members are
// always initialized in declaration order, so this is harmless here but may
// trigger -Wreorder — confirm against the full header.
Buffer::Buffer(const std::shared_ptr<VulkanContext>& _context, uint32_t size, vk::BufferUsageFlags usage,
VmaMemoryUsage vmaUsage, VmaAllocationCreateFlags flags, bool shared, vk::DeviceSize alignment)
: context(_context),
size(size),
alignment(alignment),
shared(shared),
usage(usage),
vmaUsage(vmaUsage),
flags(flags),
allocation(nullptr) {
alloc();
}

// Builds a transient, persistently-mapped staging buffer of `size` bytes,
// suitable as the source of a host->device transfer.
Buffer Buffer::createStagingBuffer(uint32_t size) {
    constexpr VmaAllocationCreateFlags kStagingFlags =
            VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    return Buffer(context, size, vk::BufferUsageFlagBits::eTransferSrc,
                  VMA_MEMORY_USAGE_AUTO, kStagingFlags, false);
}

void Buffer::upload(const void *data, uint32_t size, uint32_t offset) {
void Buffer::upload(const void* data, uint32_t size, uint32_t offset) {
if (size + offset > this->size) {
throw std::runtime_error("Buffer overflow");
}

if (vmaUsage == VMA_MEMORY_USAGE_GPU_ONLY || vmaUsage == VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE) {
auto stagingBuffer = createStagingBuffer(size);
memcpy(stagingBuffer.allocation_info.pMappedData, ((char*) data) + offset, size);
memcpy(stagingBuffer.allocation_info.pMappedData, ((char *) data) + offset, size);
auto commandBuffer = context->beginOneTimeCommandBuffer();
vk::BufferCopy copyRegion = {};
copyRegion.setSize(size);
Expand Down Expand Up @@ -103,28 +111,62 @@ void Buffer::downloadTo(std::shared_ptr<Buffer> buffer, vk::DeviceSize srcOffset

// Releases the VkBuffer together with its VMA allocation in a single call.
// Safe even if construction partially failed: vmaDestroyBuffer tolerates the
// handles it is given here.
Buffer::~Buffer() {
vmaDestroyBuffer(context->allocator, buffer, allocation);
spdlog::debug("Buffer destroyed");
}

// Destroys the underlying VkBuffer and re-creates it with `newSize` bytes,
// then rewrites every descriptor-set binding registered via
// boundToDescriptorSet() so shaders see the new allocation.
//
// newSize - new buffer size in bytes.
//
// NOTE(review): the old buffer is destroyed immediately — callers must ensure
// the GPU has finished using it (fences waited) before calling.
void Buffer::realloc(uint64_t newSize) {
    vmaDestroyBuffer(context->allocator, buffer, allocation);

    size = newSize;
    alloc();

    // BUG FIX: each vk::WriteDescriptorSet stores a *pointer* to its
    // DescriptorBufferInfo (pBufferInfo). The previous code pointed at a
    // loop-local that was destroyed before updateDescriptorSets() ran —
    // undefined behavior. Keep the infos alive in a vector that outlives the
    // update call, and reserve up-front so push_back never reallocates and
    // invalidates the stored pointers.
    std::vector<vk::DescriptorBufferInfo> bufferInfos;
    bufferInfos.reserve(boundDescriptorSets.size());
    std::vector<vk::WriteDescriptorSet> writeDescriptorSets;
    writeDescriptorSets.reserve(boundDescriptorSets.size());
    for (auto& tuple: boundDescriptorSets) {
        auto descriptorSet = std::get<0>(tuple);
        auto shared = descriptorSet.lock();
        if (shared) {
            bufferInfos.emplace_back(buffer, 0, size);
            writeDescriptorSets.emplace_back(shared->descriptorSets[std::get<1>(tuple)].get(),
                                             std::get<2>(tuple), 0, 1,
                                             std::get<3>(tuple), nullptr,
                                             &bufferInfos.back());
        }
    }
    if (!writeDescriptorSets.empty()) {
        context->device->updateDescriptorSets(writeDescriptorSets, nullptr);
    }
}

// Registers a descriptor-set binding (set index, binding slot, descriptor
// type) as a consumer of this buffer, so realloc() can patch the binding after
// the underlying VkBuffer is re-created. Held weakly: an expired set is simply
// skipped at realloc time.
void Buffer::boundToDescriptorSet(std::weak_ptr<DescriptorSet> descriptorSet, uint32_t set, uint32_t binding,
                                  vk::DescriptorType type) {
    boundDescriptorSets.emplace_back(std::move(descriptorSet), set, binding, type);
}

// Factory: persistently-mapped, host-accessible uniform buffer of `size` bytes.
std::shared_ptr<Buffer> Buffer::uniform(std::shared_ptr<VulkanContext> context, uint32_t size, bool concurrentSharing) {
    const VmaAllocationCreateFlags allocFlags =
            VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
    return std::make_shared<Buffer>(std::move(context), size, vk::BufferUsageFlagBits::eUniformBuffer,
                                    VMA_MEMORY_USAGE_AUTO, allocFlags, concurrentSharing);
}

// Factory: persistently-mapped staging buffer usable as both transfer source
// and transfer destination (upload and readback).
std::shared_ptr<Buffer> Buffer::staging(std::shared_ptr<VulkanContext> context, unsigned long size) {
    const auto usage = vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst;
    const VmaAllocationCreateFlags allocFlags =
            VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
    return std::make_shared<Buffer>(std::move(context), size, usage, VMA_MEMORY_USAGE_AUTO, allocFlags, false);
}

// Factory: GPU-only storage buffer with a dedicated allocation, also usable as
// a transfer source/destination so it can be filled and read back.
// alignment != 0 requests a minimum-alignment allocation.
std::shared_ptr<Buffer> Buffer::storage(std::shared_ptr<VulkanContext> context, uint64_t size, bool concurrentSharing,
                                        vk::DeviceSize alignment) {
    const auto usage = vk::BufferUsageFlagBits::eStorageBuffer
                       | vk::BufferUsageFlagBits::eTransferDst
                       | vk::BufferUsageFlagBits::eTransferSrc;
    return std::make_shared<Buffer>(std::move(context), size, usage,
                                    VMA_MEMORY_USAGE_GPU_ONLY, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT,
                                    concurrentSharing, alignment);
}

void Buffer::assertEquals(char *data, size_t length) {
void Buffer::assertEquals(char* data, size_t length) {
if (length > size) {
throw std::runtime_error("Buffer overflow");
}
Expand Down Expand Up @@ -170,6 +212,8 @@ void Buffer::computeWriteWriteBarrier(vk::CommandBuffer commandBuffer) {
// Copies the full contents of this (possibly device-local) buffer back to the
// host via a temporary mapped staging buffer and returns them as a byte vector.
std::vector<char> Buffer::download() {
    auto stagingBuffer = Buffer::staging(context, size);
    downloadTo(stagingBuffer);
    const char* mapped = static_cast<const char*>(stagingBuffer->allocation_info.pMappedData);
    return std::vector<char>(mapped, mapped + size);
}
15 changes: 15 additions & 0 deletions vulkan/Buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@

#include <cstdint>
#include <memory>

#include "DescriptorSet.h"
#include "VulkanContext.h"
#include "vk_mem_alloc.h"

class DescriptorSet;

class Buffer : public std::enable_shared_from_this<Buffer> {
public:
Buffer(const std::shared_ptr<VulkanContext>& context, uint32_t size, vk::BufferUsageFlags usage, VmaMemoryUsage vmaUsage,
Expand All @@ -21,6 +25,10 @@ class Buffer : public std::enable_shared_from_this<Buffer> {

~Buffer();

void realloc(uint64_t uint64);

void boundToDescriptorSet(std::weak_ptr<DescriptorSet> descriptorSet, uint32_t set, uint32_t binding, vk::DescriptorType type);

static std::shared_ptr<Buffer> uniform(std::shared_ptr<VulkanContext> context, uint32_t size, bool concurrentSharing = false);

static std::shared_ptr<Buffer> staging(std::shared_ptr<VulkanContext> context, unsigned long size);
Expand Down Expand Up @@ -56,6 +64,9 @@ class Buffer : public std::enable_shared_from_this<Buffer> {

vk::DeviceSize size;
vk::BufferUsageFlags usage;
uint64_t alignment;
bool shared;

vk::Buffer buffer;
VmaAllocation allocation;
VmaAllocationInfo allocation_info;
Expand All @@ -65,8 +76,12 @@ class Buffer : public std::enable_shared_from_this<Buffer> {


private:
void alloc();

Buffer createStagingBuffer(uint32_t size);
std::shared_ptr<VulkanContext> context;

std::vector<std::tuple<std::weak_ptr<DescriptorSet>, uint32_t, uint32_t, vk::DescriptorType>> boundDescriptorSets;
};


Expand Down
2 changes: 2 additions & 0 deletions vulkan/DescriptorSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ void DescriptorSet::build() {
for (auto j = 0; j < maxOptions; j++) {
if (binding.second.size() == 1) {
if (binding.second[0].buffer != nullptr) {
binding.second[0].buffer->boundToDescriptorSet(static_cast<std::weak_ptr<DescriptorSet>>(shared_from_this()), i * maxOptions + j, binding.first, binding.second[0].type);
writeDescriptorSets.emplace_back(descriptorSets[i * maxOptions + j].get(), binding.first, 0, 1,
binding.second[0].type, nullptr,
&binding.second[0].bufferInfo);
Expand All @@ -56,6 +57,7 @@ void DescriptorSet::build() {
}
else {
if (binding.second.at(j).buffer != nullptr) {
binding.second.at(j).buffer->boundToDescriptorSet(static_cast<std::weak_ptr<DescriptorSet>>(shared_from_this()), i * maxOptions + j, binding.first, binding.second.at(j).type);
writeDescriptorSets.emplace_back(descriptorSets[i * maxOptions + j].get(), binding.first, 0, 1,
binding.second.at(j).type, nullptr,
&binding.second.at(j).bufferInfo);
Expand Down
9 changes: 6 additions & 3 deletions vulkan/DescriptorSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

#include "Swapchain.h"

class DescriptorSet {
class Buffer;

class DescriptorSet : public std::enable_shared_from_this<DescriptorSet> {
public:
struct DescriptorBinding {
vk::DescriptorType type;
Expand All @@ -36,12 +38,13 @@ class DescriptorSet {

vk::UniqueDescriptorSetLayout descriptorSetLayout;

std::vector<vk::UniqueDescriptorSet> descriptorSets;
size_t maxOptions = 1;

private:
const std::shared_ptr<VulkanContext> context;
const uint8_t framesInFlight;
std::unordered_map<uint32_t, std::vector<DescriptorBinding>> bindings;
std::vector<vk::UniqueDescriptorSet> descriptorSets;
size_t maxOptions = 1;
};


Expand Down
Loading