Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for macOS #5

Merged
merged 15 commits into from
Feb 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions .github/workflows/cmake-multi-platform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@ name: CMake on multiple platforms

on:
push:
branches: [ "main", "bugfix/workflow" ]
paths-ignore:
- '**/README.md'
pull_request:
branches: [ "main" ]

Expand All @@ -25,7 +22,7 @@ jobs:
#
# To add more build types (Release, Debug, RelWithDebInfo, etc.) customize the build_type list.
matrix:
os: [ubuntu-latest, windows-latest]
os: [ubuntu-latest, windows-latest, macos-14]
build_type: [Release]
c_compiler: [gcc, clang, cl]
include:
Expand All @@ -35,7 +32,7 @@ jobs:
- os: ubuntu-latest
c_compiler: gcc
cpp_compiler: g++
- os: ubuntu-latest
- os: macos-14
c_compiler: clang
cpp_compiler: clang++
exclude:
Expand All @@ -45,6 +42,12 @@ jobs:
c_compiler: clang
- os: ubuntu-latest
c_compiler: cl
- os: macos-14
c_compiler: cl
- os: macos-14
c_compiler: gcc
- os: ubuntu-latest
c_compiler: clang

steps:
- uses: actions/checkout@v3
Expand All @@ -64,8 +67,8 @@ jobs:
- name: Prepare Vulkan SDK
uses: humbletim/setup-vulkan-sdk@v1.2.0
with:
vulkan-query-version: 1.3.204.0
vulkan-components: Vulkan-Headers, Vulkan-Loader, Glslang
vulkan-query-version: latest
vulkan-components: Vulkan-Headers, Vulkan-Loader, Glslang, SPIRV-Tools
vulkan-use-cache: true

- name: Configure CMake
Expand All @@ -81,7 +84,7 @@ jobs:

- name: Build
# Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} -j3

- name: Upload build artifacts
uses: actions/upload-artifact@v4
Expand Down
43 changes: 39 additions & 4 deletions 3dgs/Renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,20 @@ void Renderer::handleInput() {
}
}

/**
 * Reads back the timestamp queries registered with the query manager and
 * forwards the raw 64-bit values to QueryManager::parseResults.
 *
 * The read-back uses vk::QueryResultFlagBits::eWait, so the call does not
 * return until the requested query results are available.
 *
 * When no query has been registered yet, the Vulkan read-back is skipped and
 * parseResults receives an empty vector.
 *
 * @throws std::runtime_error if the query pool read-back does not report
 *         vk::Result::eSuccess.
 */
void Renderer::retrieveTimestamps() {
    // Read the counter once so the buffer size and the query count passed to
    // Vulkan cannot disagree.
    const uint32_t queryCount = queryManager->nextId;
    std::vector<uint64_t> timestamps(queryCount);

    // vkGetQueryPoolResults requires a non-zero dataSize; with no registered
    // queries there is nothing valid to request from the pool.
    if (queryCount > 0) {
        const auto res = context->device->getQueryPoolResults(
            context->queryPool.get(), 0, queryCount,
            timestamps.size() * sizeof(uint64_t), timestamps.data(), sizeof(uint64_t),
            vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
        if (res != vk::Result::eSuccess) {
            throw std::runtime_error("Failed to retrieve timestamps");
        }
    }

    queryManager->parseResults(timestamps);
}

void Renderer::initializeVulkan() {
window = std::make_shared<Window>("Vulkan Splatting", 800, 600);
window = std::make_shared<Window>("Vulkan Splatting", 1920, 1080);
context = std::make_shared<VulkanContext>(Window::getRequiredInstanceExtensions(), std::vector<std::string>{},
configuration.enableVulkanValidationLayers);

Expand All @@ -72,10 +84,11 @@ void Renderer::initializeVulkan() {
vk::PhysicalDeviceFeatures pdf{};
vk::PhysicalDeviceVulkan11Features pdf11{};
vk::PhysicalDeviceVulkan12Features pdf12{};
pdf.shaderStorageImageWriteWithoutFormat = true;
pdf.shaderInt64 = true;
pdf12.shaderFloat16 = true;
pdf12.shaderBufferInt64Atomics = true;
pdf12.shaderSharedInt64Atomics = true;
// pdf12.shaderBufferInt64Atomics = true;
// pdf12.shaderSharedInt64Atomics = true;

context->createLogicalDevice(pdf, pdf11, pdf12);
context->createDescriptorPool(1);
Expand All @@ -97,7 +110,8 @@ void Renderer::initializeVulkan() {
/**
 * Creates the scene named by the renderer configuration, loads it through the
 * Vulkan context, and afterwards resets the context's descriptor pool.
 */
void Renderer::loadSceneToGPU() {
    scene = std::make_shared<GSScene>(configuration.scene);
    scene->load(context);

    // The descriptor pool is reset only once the scene load has returned.
    const auto pool = context->descriptorPool.get();
    context->device->resetDescriptorPool(pool);
}

void Renderer::createPreprocessPipeline() {
Expand Down Expand Up @@ -344,6 +358,8 @@ void Renderer::run() {
fpsCounter++;
}

retrieveTimestamps();

// auto nn = totalSumBufferHost->readOne<uint32_t>() ;
// auto staging = Buffer::staging(context, nn* sizeof(uint64_t));
// sortKVBufferEven->downloadTo(staging);
Expand Down Expand Up @@ -384,6 +400,7 @@ void Renderer::recordPreprocessCommandBuffer() {
preprocessCommandBuffer->begin(vk::CommandBufferBeginInfo{});

preprocessPipeline->bind(preprocessCommandBuffer, 0, 0);
preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("preprocess_start"));
preprocessCommandBuffer->dispatch(numGroups, 1, 1);
tileOverlapBuffer->computeWriteReadBarrier(preprocessCommandBuffer.get());

Expand All @@ -392,7 +409,10 @@ void Renderer::recordPreprocessCommandBuffer() {

prefixSumPingBuffer->computeWriteReadBarrier(preprocessCommandBuffer.get());

preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("preprocess_end"));

prefixSumPipeline->bind(preprocessCommandBuffer, 0, 0);
preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("prefix_sum_start"));
const auto iters = static_cast<uint32_t>(std::ceil(std::log2(static_cast<float>(scene->getNumVertices()))));
for (uint32_t timestep = 0; timestep <= iters; timestep++) {
preprocessCommandBuffer->pushConstants(prefixSumPipeline->pipelineLayout.get(),
Expand Down Expand Up @@ -422,7 +442,10 @@ void Renderer::recordPreprocessCommandBuffer() {

vertexAttributeBuffer->computeWriteReadBarrier(preprocessCommandBuffer.get());

preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("prefix_sum_end"));

preprocessSortPipeline->bind(preprocessCommandBuffer, 0, iters % 2 == 0 ? 0 : 1);
preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("preprocess_sort_start"));
uint32_t tileX = (swapchain->swapchainExtent.width + 16 - 1) / 16;
// assert(tileX == 50);
preprocessCommandBuffer->pushConstants(preprocessSortPipeline->pipelineLayout.get(),
Expand All @@ -431,6 +454,7 @@ void Renderer::recordPreprocessCommandBuffer() {
preprocessCommandBuffer->dispatch(numGroups, 1, 1);

sortKBufferEven->computeWriteReadBarrier(preprocessCommandBuffer.get());
preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("preprocess_sort_end"));

preprocessCommandBuffer->end();
}
Expand All @@ -452,6 +476,9 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
assert(numInstances <= scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER);
for (auto i = 0; i < 8; i++) {
sortHistPipeline->bind(renderCommandBuffer, 0, i % 2 == 0 ? 0 : 1);
if (i == 0) {
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("sort_start"));
}
auto invocationSize = (numInstances + numRadixSortBlocksPerWorkgroup - 1) / numRadixSortBlocksPerWorkgroup;
invocationSize = (invocationSize + 255) / 256;

Expand Down Expand Up @@ -482,6 +509,10 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
sortKBufferEven->computeWriteReadBarrier(renderCommandBuffer.get());
sortVBufferEven->computeWriteReadBarrier(renderCommandBuffer.get());
}

if (i == 7) {
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("sort_end"));
}
}

renderCommandBuffer->fillBuffer(tileBoundaryBuffer->buffer, 0, VK_WHOLE_SIZE, 0);
Expand All @@ -494,14 +525,17 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {

// Since we have 64 bit keys, the sort result is always in the even buffer
tileBoundaryPipeline->bind(renderCommandBuffer, 0, 0);
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("tile_boundary_start"));
renderCommandBuffer->pushConstants(tileBoundaryPipeline->pipelineLayout.get(),
vk::ShaderStageFlagBits::eCompute, 0,
sizeof(uint32_t), &numInstances);
renderCommandBuffer->dispatch((numInstances + 255) / 256, 1, 1);

tileBoundaryBuffer->computeWriteReadBarrier(renderCommandBuffer.get());
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("tile_boundary_end"));

renderPipeline->bind(renderCommandBuffer, 0, std::vector<uint32_t>{0, currentImageIndex});
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("render_start"));
auto [width, height] = window->getFramebufferSize();
uint32_t constants[2] = {width, height};
renderCommandBuffer->pushConstants(renderPipeline->pipelineLayout.get(),
Expand Down Expand Up @@ -534,6 +568,7 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
renderCommandBuffer->pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eBottomOfPipe,
vk::DependencyFlagBits::eByRegion, nullptr, nullptr, imageMemoryBarrier);
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("render_end"));
renderCommandBuffer->end();
}

Expand Down
5 changes: 5 additions & 0 deletions 3dgs/Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include "../vulkan/Swapchain.h"
#include <glm/gtc/quaternion.hpp>

#include "../vulkan/QueryManager.h"

struct RendererConfiguration {
bool enableVulkanValidationLayers = false;
std::optional<uint8_t> physicalDeviceId = std::nullopt;
Expand Down Expand Up @@ -64,6 +66,8 @@ class Renderer {

void handleInput();

void retrieveTimestamps();

void run();

~Renderer();
Expand All @@ -72,6 +76,7 @@ class Renderer {
std::shared_ptr<Window> window;
std::shared_ptr<VulkanContext> context;
std::shared_ptr<GSScene> scene;
std::shared_ptr<QueryManager> queryManager = std::make_shared<QueryManager>();

std::shared_ptr<ComputePipeline> preprocessPipeline;
std::shared_ptr<ComputePipeline> renderPipeline;
Expand Down
15 changes: 11 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set(CMAKE_CXX_STANDARD 20)

find_package(Vulkan COMPONENTS glslangValidator)

if (WIN32)
if (WIN32 OR APPLE)
FetchContent_Declare(
glfw
GIT_REPOSITORY https://github.com/glfw/glfw
Expand All @@ -26,6 +26,8 @@ if (WIN32)
add_subdirectory(${glfw_SOURCE_DIR} ${glfw_BINARY_DIR})
endif()

set(GLM_ENABLE_CXX_20 ON CACHE INTERNAL "Enable experimental features")

FetchContent_Declare(
glm
GIT_REPOSITORY https://github.com/g-truc/glm
Expand Down Expand Up @@ -65,12 +67,15 @@ else ()
endif ()

if (CMAKE_BUILD_TYPE MATCHES Debug AND NOT APPLE)
set(GLSLC_DEFINE "-DDEBUG")
list(APPEND GLSLC_DEFINE "-DDEBUG")
else ()
set(GLSLC_DEFINE "-DNDEBUG")
list(APPEND GLSLC_DEFINE "-DNDEBUG")
endif ()

set(GLSLC_DEFINE "${GLSLC_DEFINE}")
if (APPLE)
# append -DAPPLE to GLSLC_DEFINE
list(APPEND GLSLC_DEFINE "-DAPPLE")
endif ()

foreach (GLSL ${GLSL_SOURCE_FILES})
get_filename_component(FILE_NAME ${GLSL} NAME)
Expand Down Expand Up @@ -108,6 +113,8 @@ add_executable(vulkan_splatting main.cpp
vulkan/pipelines/ComputePipeline.h
vulkan/Swapchain.cpp
vulkan/Swapchain.h
vulkan/QueryManager.cpp
vulkan/QueryManager.h
)

add_dependencies(vulkan_splatting Shaders)
Expand Down
38 changes: 34 additions & 4 deletions shaders/sort/sort.comp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
#extension GL_KHR_shader_subgroup_arithmetic: enable
#extension GL_KHR_shader_subgroup_ballot: enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#ifndef APPLE
#extension GL_EXT_shader_atomic_int64 : enable
#endif

#define WORKGROUP_SIZE 256// assert WORKGROUP_SIZE >= RADIX_SORT_BINS
#define RADIX_SORT_BINS 256U
Expand Down Expand Up @@ -85,7 +87,12 @@ shared uint[RADIX_SORT_BINS / SUBGROUP_SIZE] sums;// subgroup reductions
shared uint[RADIX_SORT_BINS] global_offsets;// global exclusive scan (prefix sum)

// Flag words for one radix-sort bin. Threads add their flags_bit into the word
// selected by flags_bin; the words are later bit-counted to derive per-bin
// counts and prefixes.
// NOTE(review): presumably one bit per workgroup invocation — confirm against
// the definitions of flags_bin / flags_bit.
struct BinFlags {
#ifndef APPLE
// Single array of key_t-wide flag words, updated with one atomicAdd per thread.
key_t flags[WORKGROUP_SIZE / BITS];
#else
// APPLE builds do not enable GL_EXT_shader_atomic_int64, so each flag word is
// kept as two 32-bit halves: flags1 receives uint(flags_bit) (low half) and
// flags2 receives uint(flags_bit >> 32) (high half).
uint flags1[WORKGROUP_SIZE / BITS];
uint flags2[WORKGROUP_SIZE / BITS];
#endif
};
shared BinFlags[RADIX_SORT_BINS] bin_flags;

Expand Down Expand Up @@ -133,7 +140,12 @@ void main() {
// initialize bin flags
if (lID < RADIX_SORT_BINS) {
for (int i = 0; i < WORKGROUP_SIZE / BITS; i++) {
#ifndef APPLE
bin_flags[lID].flags[i] = 0U;// init all bin flags to 0
#else
bin_flags[lID].flags1[i] = 0U;// init all bin flags to 0
bin_flags[lID].flags2[i] = 0U;// init all bin flags to 0
#endif
}
}
barrier();
Expand All @@ -149,7 +161,12 @@ void main() {
// offset for group
binOffset = global_offsets[binID];
// add bit to flag
#ifndef APPLE
atomicAdd(bin_flags[binID].flags[flags_bin], flags_bit);
#else
atomicAdd(bin_flags[binID].flags1[flags_bin], uint(flags_bit));
atomicAdd(bin_flags[binID].flags2[flags_bin], uint(flags_bit >> 32));
#endif
}
barrier();

Expand All @@ -158,11 +175,24 @@ void main() {
uint prefix = 0;
uint count = 0;
for (uint i = 0; i < WORKGROUP_SIZE / BITS; i++) {
const key_t bits = bin_flags[binID].flags[i];
#ifndef APPLE
const key_t bits = bin_flags[binID].flags[i];
#else
const uint flag1 = bin_flags[binID].flags1[i];
const uint flag2 = bin_flags[binID].flags2[i];
#endif
#if BITS == 64
const uint full_count = bitCount(uint(bits)) + bitCount(uint(bits >> 32));
const key_t partial_bits = bits & (flags_bit - 1);
const uint partial_count = bitCount(uint(partial_bits)) + bitCount(uint(partial_bits >> 32));
#ifndef APPLE
const uint full_count = bitCount(uint(bits)) + bitCount(uint(bits >> 32));
const key_t partial_bits = bits & (flags_bit - 1);
const uint partial_count = bitCount(uint(partial_bits)) + bitCount(uint(partial_bits >> 32));
#else
const uint full_count = bitCount(flag1) + bitCount(flag2);
const uint64_t f = flags_bit - 1;
const uint partial_bits1 = flag1 & uint(f);
const uint partial_bits2 = flag2 & uint(f >> 32);
const uint partial_count = bitCount(partial_bits1) + bitCount(partial_bits2);
#endif
#else
const uint full_count = bitCount(bits);
const uint partial_count = bitCount(bits & (flags_bit - 1));
Expand Down
Loading
Loading