Skip to content

Commit

Permalink
Support for macOS (#5)
Browse files Browse the repository at this point in the history
* Support for macOS

* Support for macOS

* WIP: add timestamps

* WIP: better bit counting

* Fix compatibility on non Apple devices

* Run on all branches

* Parallelize build

* Run on all branches

* Fix macOS build

* Fix macOS build

* Fix macOS build

* Fix macOS build

* Fix macOS build

* Fix macOS build
  • Loading branch information
shg8 authored Feb 17, 2024
1 parent 2284992 commit 1301ffe
Show file tree
Hide file tree
Showing 9 changed files with 246 additions and 62 deletions.
19 changes: 11 additions & 8 deletions .github/workflows/cmake-multi-platform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@ name: CMake on multiple platforms

on:
push:
branches: [ "main", "bugfix/workflow" ]
paths-ignore:
- '**/README.md'
pull_request:
branches: [ "main" ]

Expand All @@ -25,7 +22,7 @@ jobs:
#
# To add more build types (Release, Debug, RelWithDebInfo, etc.) customize the build_type list.
matrix:
os: [ubuntu-latest, windows-latest]
os: [ubuntu-latest, windows-latest, macos-14]
build_type: [Release]
c_compiler: [gcc, clang, cl]
include:
Expand All @@ -35,7 +32,7 @@ jobs:
- os: ubuntu-latest
c_compiler: gcc
cpp_compiler: g++
- os: ubuntu-latest
- os: macos-14
c_compiler: clang
cpp_compiler: clang++
exclude:
Expand All @@ -45,6 +42,12 @@ jobs:
c_compiler: clang
- os: ubuntu-latest
c_compiler: cl
- os: macos-14
c_compiler: cl
- os: macos-14
c_compiler: gcc
- os: ubuntu-latest
c_compiler: clang

steps:
- uses: actions/checkout@v3
Expand All @@ -64,8 +67,8 @@ jobs:
- name: Prepare Vulkan SDK
uses: humbletim/[email protected]
with:
vulkan-query-version: 1.3.204.0
vulkan-components: Vulkan-Headers, Vulkan-Loader, Glslang
vulkan-query-version: latest
vulkan-components: Vulkan-Headers, Vulkan-Loader, Glslang, SPIRV-Tools
vulkan-use-cache: true

- name: Configure CMake
Expand All @@ -81,7 +84,7 @@ jobs:
- name: Build
# Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} -j3

- name: Upload build artifacts
uses: actions/upload-artifact@v4
Expand Down
43 changes: 39 additions & 4 deletions 3dgs/Renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,20 @@ void Renderer::handleInput() {
}
}

// Reads back one 64-bit value per query registered with queryManager from the
// context's query pool and hands the raw values to QueryManager::parseResults.
//
// The call waits for the results (eWait), so it blocks until every requested
// query is available.
//
// Throws std::runtime_error if the device reports anything other than
// vk::Result::eSuccess.
void Renderer::retrieveTimestamps() {
    // Read the counter once so the buffer size and the query count always agree.
    const uint32_t queryCount = queryManager->nextId;
    std::vector<uint64_t> timestamps(queryCount);

    // vkGetQueryPoolResults requires dataSize > 0, so the device is only
    // queried when at least one timestamp has been registered.
    if (!timestamps.empty()) {
        const auto res = context->device->getQueryPoolResults(
            context->queryPool.get(), 0, queryCount,
            timestamps.size() * sizeof(uint64_t),
            timestamps.data(), sizeof(uint64_t),
            vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
        if (res != vk::Result::eSuccess) {
            throw std::runtime_error("Failed to retrieve timestamps");
        }
    }

    queryManager->parseResults(timestamps);
}

void Renderer::initializeVulkan() {
window = std::make_shared<Window>("Vulkan Splatting", 800, 600);
window = std::make_shared<Window>("Vulkan Splatting", 1920, 1080);
context = std::make_shared<VulkanContext>(Window::getRequiredInstanceExtensions(), std::vector<std::string>{},
configuration.enableVulkanValidationLayers);

Expand All @@ -72,10 +84,11 @@ void Renderer::initializeVulkan() {
vk::PhysicalDeviceFeatures pdf{};
vk::PhysicalDeviceVulkan11Features pdf11{};
vk::PhysicalDeviceVulkan12Features pdf12{};
pdf.shaderStorageImageWriteWithoutFormat = true;
pdf.shaderInt64 = true;
pdf12.shaderFloat16 = true;
pdf12.shaderBufferInt64Atomics = true;
pdf12.shaderSharedInt64Atomics = true;
// pdf12.shaderBufferInt64Atomics = true;
// pdf12.shaderSharedInt64Atomics = true;

context->createLogicalDevice(pdf, pdf11, pdf12);
context->createDescriptorPool(1);
Expand All @@ -97,7 +110,8 @@ void Renderer::initializeVulkan() {
// Creates the GSScene described by configuration.scene, loads it through the
// Vulkan context, and then resets the context's descriptor pool.
void Renderer::loadSceneToGPU() {
scene = std::make_shared<GSScene>(configuration.scene);
scene->load(context);
// scene->loadTestScene(context);
// reset descriptor pool
// NOTE(review): presumably this frees descriptor sets allocated while loading
// the scene so the pool can be reused by later pipelines — confirm.
context->device->resetDescriptorPool(context->descriptorPool.get());
}

void Renderer::createPreprocessPipeline() {
Expand Down Expand Up @@ -344,6 +358,8 @@ void Renderer::run() {
fpsCounter++;
}

retrieveTimestamps();

// auto nn = totalSumBufferHost->readOne<uint32_t>() ;
// auto staging = Buffer::staging(context, nn* sizeof(uint64_t));
// sortKVBufferEven->downloadTo(staging);
Expand Down Expand Up @@ -384,6 +400,7 @@ void Renderer::recordPreprocessCommandBuffer() {
preprocessCommandBuffer->begin(vk::CommandBufferBeginInfo{});

preprocessPipeline->bind(preprocessCommandBuffer, 0, 0);
preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("preprocess_start"));
preprocessCommandBuffer->dispatch(numGroups, 1, 1);
tileOverlapBuffer->computeWriteReadBarrier(preprocessCommandBuffer.get());

Expand All @@ -392,7 +409,10 @@ void Renderer::recordPreprocessCommandBuffer() {

prefixSumPingBuffer->computeWriteReadBarrier(preprocessCommandBuffer.get());

preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("preprocess_end"));

prefixSumPipeline->bind(preprocessCommandBuffer, 0, 0);
preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("prefix_sum_start"));
const auto iters = static_cast<uint32_t>(std::ceil(std::log2(static_cast<float>(scene->getNumVertices()))));
for (uint32_t timestep = 0; timestep <= iters; timestep++) {
preprocessCommandBuffer->pushConstants(prefixSumPipeline->pipelineLayout.get(),
Expand Down Expand Up @@ -422,7 +442,10 @@ void Renderer::recordPreprocessCommandBuffer() {

vertexAttributeBuffer->computeWriteReadBarrier(preprocessCommandBuffer.get());

preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("prefix_sum_end"));

preprocessSortPipeline->bind(preprocessCommandBuffer, 0, iters % 2 == 0 ? 0 : 1);
preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("preprocess_sort_start"));
uint32_t tileX = (swapchain->swapchainExtent.width + 16 - 1) / 16;
// assert(tileX == 50);
preprocessCommandBuffer->pushConstants(preprocessSortPipeline->pipelineLayout.get(),
Expand All @@ -431,6 +454,7 @@ void Renderer::recordPreprocessCommandBuffer() {
preprocessCommandBuffer->dispatch(numGroups, 1, 1);

sortKBufferEven->computeWriteReadBarrier(preprocessCommandBuffer.get());
preprocessCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("preprocess_sort_end"));

preprocessCommandBuffer->end();
}
Expand All @@ -452,6 +476,9 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
assert(numInstances <= scene->getNumVertices() * SORT_ALLOCATE_MULTIPLIER);
for (auto i = 0; i < 8; i++) {
sortHistPipeline->bind(renderCommandBuffer, 0, i % 2 == 0 ? 0 : 1);
if (i == 0) {
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("sort_start"));
}
auto invocationSize = (numInstances + numRadixSortBlocksPerWorkgroup - 1) / numRadixSortBlocksPerWorkgroup;
invocationSize = (invocationSize + 255) / 256;

Expand Down Expand Up @@ -482,6 +509,10 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
sortKBufferEven->computeWriteReadBarrier(renderCommandBuffer.get());
sortVBufferEven->computeWriteReadBarrier(renderCommandBuffer.get());
}

if (i == 7) {
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("sort_end"));
}
}

renderCommandBuffer->fillBuffer(tileBoundaryBuffer->buffer, 0, VK_WHOLE_SIZE, 0);
Expand All @@ -494,14 +525,17 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {

// Since we have 64 bit keys, the sort result is always in the even buffer
tileBoundaryPipeline->bind(renderCommandBuffer, 0, 0);
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("tile_boundary_start"));
renderCommandBuffer->pushConstants(tileBoundaryPipeline->pipelineLayout.get(),
vk::ShaderStageFlagBits::eCompute, 0,
sizeof(uint32_t), &numInstances);
renderCommandBuffer->dispatch((numInstances + 255) / 256, 1, 1);

tileBoundaryBuffer->computeWriteReadBarrier(renderCommandBuffer.get());
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("tile_boundary_end"));

renderPipeline->bind(renderCommandBuffer, 0, std::vector<uint32_t>{0, currentImageIndex});
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eTopOfPipe, context->queryPool.get(), queryManager->registerQuery("render_start"));
auto [width, height] = window->getFramebufferSize();
uint32_t constants[2] = {width, height};
renderCommandBuffer->pushConstants(renderPipeline->pipelineLayout.get(),
Expand Down Expand Up @@ -534,6 +568,7 @@ void Renderer::recordRenderCommandBuffer(uint32_t currentFrame) {
renderCommandBuffer->pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eBottomOfPipe,
vk::DependencyFlagBits::eByRegion, nullptr, nullptr, imageMemoryBarrier);
renderCommandBuffer->writeTimestamp(vk::PipelineStageFlagBits::eBottomOfPipe, context->queryPool.get(), queryManager->registerQuery("render_end"));
renderCommandBuffer->end();
}

Expand Down
5 changes: 5 additions & 0 deletions 3dgs/Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include "../vulkan/Swapchain.h"
#include <glm/gtc/quaternion.hpp>

#include "../vulkan/QueryManager.h"

struct RendererConfiguration {
bool enableVulkanValidationLayers = false;
std::optional<uint8_t> physicalDeviceId = std::nullopt;
Expand Down Expand Up @@ -64,6 +66,8 @@ class Renderer {

void handleInput();

void retrieveTimestamps();

void run();

~Renderer();
Expand All @@ -72,6 +76,7 @@ class Renderer {
std::shared_ptr<Window> window;
std::shared_ptr<VulkanContext> context;
std::shared_ptr<GSScene> scene;
std::shared_ptr<QueryManager> queryManager = std::make_shared<QueryManager>();

std::shared_ptr<ComputePipeline> preprocessPipeline;
std::shared_ptr<ComputePipeline> renderPipeline;
Expand Down
15 changes: 11 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set(CMAKE_CXX_STANDARD 20)

find_package(Vulkan COMPONENTS glslangValidator)

if (WIN32)
if (WIN32 OR APPLE)
FetchContent_Declare(
glfw
GIT_REPOSITORY https://github.com/glfw/glfw
Expand All @@ -26,6 +26,8 @@ if (WIN32)
add_subdirectory(${glfw_SOURCE_DIR} ${glfw_BINARY_DIR})
endif()

set(GLM_ENABLE_CXX_20 ON CACHE INTERNAL "Enable experimental features")

FetchContent_Declare(
glm
GIT_REPOSITORY https://github.com/g-truc/glm
Expand Down Expand Up @@ -65,12 +67,15 @@ else ()
endif ()

if (CMAKE_BUILD_TYPE MATCHES Debug AND NOT APPLE)
set(GLSLC_DEFINE "-DDEBUG")
list(APPEND GLSLC_DEFINE "-DDEBUG")
else ()
set(GLSLC_DEFINE "-DNDEBUG")
list(APPEND GLSLC_DEFINE "-DNDEBUG")
endif ()

set(GLSLC_DEFINE "${GLSLC_DEFINE}")
if (APPLE)
# append -DAPPLE to GLSLC_DEFINE
list(APPEND GLSLC_DEFINE "-DAPPLE")
endif ()

foreach (GLSL ${GLSL_SOURCE_FILES})
get_filename_component(FILE_NAME ${GLSL} NAME)
Expand Down Expand Up @@ -108,6 +113,8 @@ add_executable(vulkan_splatting main.cpp
vulkan/pipelines/ComputePipeline.h
vulkan/Swapchain.cpp
vulkan/Swapchain.h
vulkan/QueryManager.cpp
vulkan/QueryManager.h
)

add_dependencies(vulkan_splatting Shaders)
Expand Down
38 changes: 34 additions & 4 deletions shaders/sort/sort.comp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
#extension GL_KHR_shader_subgroup_arithmetic: enable
#extension GL_KHR_shader_subgroup_ballot: enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#ifndef APPLE
#extension GL_EXT_shader_atomic_int64 : enable
#endif

#define WORKGROUP_SIZE 256// assert WORKGROUP_SIZE >= RADIX_SORT_BINS
#define RADIX_SORT_BINS 256U
Expand Down Expand Up @@ -85,7 +87,12 @@ shared uint[RADIX_SORT_BINS / SUBGROUP_SIZE] sums;// subgroup reductions
shared uint[RADIX_SORT_BINS] global_offsets;// global exclusive scan (prefix sum)

struct BinFlags {
#ifndef APPLE
key_t flags[WORKGROUP_SIZE / BITS];
#else
uint flags1[WORKGROUP_SIZE / BITS];
uint flags2[WORKGROUP_SIZE / BITS];
#endif
};
shared BinFlags[RADIX_SORT_BINS] bin_flags;

Expand Down Expand Up @@ -133,7 +140,12 @@ void main() {
// initialize bin flags
if (lID < RADIX_SORT_BINS) {
for (int i = 0; i < WORKGROUP_SIZE / BITS; i++) {
#ifndef APPLE
bin_flags[lID].flags[i] = 0U;// init all bin flags to 0
#else
bin_flags[lID].flags1[i] = 0U;// init all bin flags to 0
bin_flags[lID].flags2[i] = 0U;// init all bin flags to 0
#endif
}
}
barrier();
Expand All @@ -149,7 +161,12 @@ void main() {
// offset for group
binOffset = global_offsets[binID];
// add bit to flag
#ifndef APPLE
atomicAdd(bin_flags[binID].flags[flags_bin], flags_bit);
#else
atomicAdd(bin_flags[binID].flags1[flags_bin], uint(flags_bit));
atomicAdd(bin_flags[binID].flags2[flags_bin], uint(flags_bit >> 32));
#endif
}
barrier();

Expand All @@ -158,11 +175,24 @@ void main() {
uint prefix = 0;
uint count = 0;
for (uint i = 0; i < WORKGROUP_SIZE / BITS; i++) {
const key_t bits = bin_flags[binID].flags[i];
#ifndef APPLE
const key_t bits = bin_flags[binID].flags[i];
#else
const uint flag1 = bin_flags[binID].flags1[i];
const uint flag2 = bin_flags[binID].flags2[i];
#endif
#if BITS == 64
const uint full_count = bitCount(uint(bits)) + bitCount(uint(bits >> 32));
const key_t partial_bits = bits & (flags_bit - 1);
const uint partial_count = bitCount(uint(partial_bits)) + bitCount(uint(partial_bits >> 32));
#ifndef APPLE
const uint full_count = bitCount(uint(bits)) + bitCount(uint(bits >> 32));
const key_t partial_bits = bits & (flags_bit - 1);
const uint partial_count = bitCount(uint(partial_bits)) + bitCount(uint(partial_bits >> 32));
#else
const uint full_count = bitCount(flag1) + bitCount(flag2);
const uint64_t f = flags_bit - 1;
const uint partial_bits1 = flag1 & uint(f);
const uint partial_bits2 = flag2 & uint(f >> 32);
const uint partial_count = bitCount(partial_bits1) + bitCount(partial_bits2);
#endif
#else
const uint full_count = bitCount(bits);
const uint partial_count = bitCount(bits & (flags_bit - 1));
Expand Down
Loading

0 comments on commit 1301ffe

Please sign in to comment.