Add run time compilation #11225

Closed · wants to merge 1 commit
2 changes: 1 addition & 1 deletion .github/workflows/linux-build.yml
@@ -114,7 +114,7 @@ jobs:
"-DVELOX_ENABLE_GCS=ON"
"-DVELOX_ENABLE_ABFS=ON"
"-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON"
"-DVELOX_ENABLE_GPU=ON"
"-DVELOX_ENABLE_GPU=OFF"
"-DVELOX_MONO_LIBRARY=ON"
)
make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}"
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -241,7 +241,7 @@ list(INSERT CMAKE_FIND_LIBRARY_SUFFIXES 0 a)
if(VELOX_ENABLE_S3)
# Set AWS_ROOT_DIR if you have a custom install location of AWS SDK CPP.
if(AWSSDK_ROOT_DIR)
- set(CMAKE_PREFIX_PATH ${AWSSDK_ROOT_DIR})
+ list(APPEND CMAKE_PREFIX_PATH ${AWSSDK_ROOT_DIR})
endif()
find_package(AWSSDK REQUIRED COMPONENTS s3;identity-management)
add_definitions(-DVELOX_ENABLE_S3)
@@ -381,6 +381,7 @@ if(${VELOX_ENABLE_GPU})
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
endif()
include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
+ find_package(CUDAToolkit REQUIRED)
@assignUser (Collaborator) commented on Oct 21, 2024:
Suggested change:
- find_package(CUDAToolkit REQUIRED)
+ include(FindCUDAToolkit)

See https://cmake.org/cmake/help/v3.23/module/FindCUDAToolkit.html#imported-targets

A collaborator replied:
Hm... it's found, which should then also create the targets.
After some local investigation (in the container), it seems we need to install the nvrtc library explicitly, but even then CMake fails to find the library location for some reason. In a standalone CMakeLists.txt containing nothing but the include, the target is not created because the lib is not found:

CUDA_nvrtc_LIBRARY-NOTFOUND

even though the lib is clearly on the system:

/usr/local/cuda-12.4/targets/x86_64-linux/lib/libnvrtc.so.12

and the path is correct in the find command:

find_library(CUDA_nvrtc_LIBRARY NAMES nvrtc HINTS /usr/local/cuda-12.4/targets/x86_64-linux/lib;/usr/local/cuda/targets/x86_64-linux/lib/stubs;/usr/local/cuda/targets/x86_64-linux/lib ENV CUDA_PATH PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs )

A collaborator replied:
On my own machine the same script works fine; I used the same current CMake in the container as well, but it still fails the same way.

endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
10 changes: 8 additions & 2 deletions velox/experimental/wave/common/CMakeLists.txt
@@ -16,13 +16,19 @@ velox_add_library(
velox_wave_common
GpuArena.cpp
Buffer.cpp
+ Compile.cu
Cuda.cu
Exception.cpp
KernelCache.cpp
Type.cpp
ResultStaging.cpp)

- velox_link_libraries(velox_wave_common velox_exception velox_common_base
-   velox_type)
+ velox_link_libraries(
+   velox_wave_common
+   velox_exception
+   velox_common_base
+   velox_type
+   CUDA::nvrtc)
A collaborator commented:
It seems to work without it, but it probably makes sense to explicitly link to the runtime as well via CUDA::cudart.


if(${VELOX_BUILD_TESTING})
add_subdirectory(tests)
225 changes: 225 additions & 0 deletions velox/experimental/wave/common/Compile.cu
@@ -0,0 +1,225 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <fmt/format.h>
#include <gflags/gflags.h>
#include <nvrtc.h>
#include "velox/experimental/wave/common/Cuda.h"
#include "velox/experimental/wave/common/CudaUtil.cuh"
#include "velox/experimental/wave/common/Exception.h"

DEFINE_string(
wavegen_architecture,
"compute_70",
"--gpu-architecture flag for generated code");

namespace facebook::velox::wave {

void nvrtcCheck(nvrtcResult result) {
if (result != NVRTC_SUCCESS) {
waveError(nvrtcGetErrorString(result));
}
}

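// Wraps a loaded CUmodule and the CUfunction handles for its entry points;
// unloads the module on destruction.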
class CompiledModuleImpl : public CompiledModule {
public:
CompiledModuleImpl(CUmodule module, std::vector<CUfunction> kernels)
: module_(module), kernels_(std::move(kernels)) {}

~CompiledModuleImpl() {
auto result = cuModuleUnload(module_);
if (result != CUDA_SUCCESS) {
LOG(ERROR) << "Error in unloading module " << result;
}
}

void launch(
int32_t kernelIdx,
int32_t numBlocks,
int32_t numThreads,
int32_t shared,
Stream* stream,
void** args) override;

KernelInfo info(int32_t kernelIdx) override;

private:
CUmodule module_;
std::vector<CUfunction> kernels_;
};

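// Appends one option string to 'data': 'flag' immediately followed by
// 'length' bytes of 'value', e.g. ("-I", "/usr/include") -> "-I/usr/include".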
void addFlag(
const char* flag,
const char* value,
int32_t length,
std::vector<std::string>& data) {
std::string str(flag);
str.resize(str.size() + length + 1);
memcpy(str.data() + strlen(flag), value, length);
str.back() = 0;
data.push_back(std::move(str));
}

// Gets compiler options from the environment and appends them to 'opts'. The
// memory is owned by 'data'.
void getNvrtcOptions(
std::vector<const char*>& opts,
std::vector<std::string>& data) {
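// WAVE_NVRTC_INCLUDE_PATH is a ':'-separated list of include directories;
// each entry becomes a -I option.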
const char* includes = getenv("WAVE_NVRTC_INCLUDE_PATH");
if (includes && strlen(includes) > 0) {
for (;;) {
const char* end = strchr(includes, ':');
if (!end) {
addFlag("-I", includes, strlen(includes), data);
break;
}
addFlag("-I", includes, end - includes, data);
includes = end + 1;
}
}
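// WAVE_NVRTC_FLAGS is a space-separated list of extra nvrtc options, passed
// through verbatim.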
const char* flags = getenv("WAVE_NVRTC_FLAGS");
if (flags && strlen(flags)) {
for (;;) {
auto end = strchr(flags, ' ');
if (!end) {
addFlag("", flags, strlen(flags), data);
break;
}
addFlag("", flags, end - flags, data);
flags = end + 1;
}
}
for (auto& str : data) {
opts.push_back(str.data());
}
}

std::shared_ptr<CompiledModule> CompiledModule::create(const KernelSpec& spec) {
nvrtcProgram prog;
nvrtcCreateProgram(
&prog,
spec.code.c_str(), // buffer
spec.filePath.c_str(), // name
spec.numHeaders, // numHeaders
spec.headers, // headers
spec.headerNames); // includeNames
for (auto& name : spec.entryPoints) {
nvrtcCheck(nvrtcAddNameExpression(prog, name.c_str()));
}
std::vector<const char*> opts;
std::vector<std::string> optsData;
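// Generate device debug info in debug builds; optimize otherwise.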
#ifndef NDEBUG
optsData.push_back("-G");
#else
optsData.push_back("-O3");
#endif
getNvrtcOptions(opts, optsData);

auto compileResult = nvrtcCompileProgram(
prog, // prog
opts.size(), // numOptions
opts.data()); // options

size_t logSize;

nvrtcGetProgramLogSize(prog, &logSize);
std::string log;
log.resize(logSize);
nvrtcGetProgramLog(prog, log.data());

if (compileResult != NVRTC_SUCCESS) {
nvrtcDestroyProgram(&prog);
waveError(std::string("Cuda compilation error: ") + log);
}
// Obtain PTX from the program.
size_t ptxSize;
nvrtcCheck(nvrtcGetPTXSize(prog, &ptxSize));
std::string ptx;
ptx.resize(ptxSize);
nvrtcCheck(nvrtcGetPTX(prog, ptx.data()));
std::vector<std::string> loweredNames;
for (auto& entry : spec.entryPoints) {
const char* temp;
nvrtcCheck(nvrtcGetLoweredName(prog, entry.c_str(), &temp));
loweredNames.push_back(std::string(temp));
}

nvrtcDestroyProgram(&prog);
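// Capture the JIT info and error logs produced while loading the PTX.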
CUjit_option options[] = {
CU_JIT_INFO_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
char info[1024];
char error[1024];
uint32_t infoSize = sizeof(info);
uint32_t errorSize = sizeof(error);
void* values[] = {info, &infoSize, error, &errorSize};

CUmodule module;
auto loadResult = cuModuleLoadDataEx(
&module, ptx.data(), sizeof(values) / sizeof(void*), options, values);
if (loadResult != CUDA_SUCCESS) {
LOG(ERROR) << "Load error " << errorSize << " " << infoSize;
waveError(fmt::format("Error in load module: {} {}", info, error));
}
std::vector<CUfunction> funcs;
for (auto& name : loweredNames) {
funcs.emplace_back();
CU_CHECK(cuModuleGetFunction(&funcs.back(), module, name.c_str()));
}
return std::make_shared<CompiledModuleImpl>(module, std::move(funcs));
}

void CompiledModuleImpl::launch(
int32_t kernelIdx,
int32_t numBlocks,
int32_t numThreads,
int32_t shared,
Stream* stream,
void** args) {
auto result = cuLaunchKernel(
kernels_[kernelIdx],
numBlocks,
1,
1, // grid dim
numThreads,
1,
1, // block dim
shared,
reinterpret_cast<CUstream>(stream->stream()->stream),
args,
0);
CU_CHECK(result);
}

KernelInfo CompiledModuleImpl::info(int32_t kernelIdx) {
KernelInfo info;
auto f = kernels_[kernelIdx];
cuFuncGetAttribute(&info.numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, f);
cuFuncGetAttribute(
&info.sharedMemory, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, f);
cuFuncGetAttribute(
&info.maxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, f);
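// Max resident blocks per SM at 256 threads/block, first with no dynamic
// shared memory and then with 32 bytes of dynamic shared memory per thread.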
int32_t max;
cuOccupancyMaxActiveBlocksPerMultiprocessor(&max, f, 256, 0);
info.maxOccupancy0 = max;
cuOccupancyMaxActiveBlocksPerMultiprocessor(&max, f, 256, 256 * 32);
info.maxOccupancy32 = max;
return info;
}

} // namespace facebook::velox::wave
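
A minimal usage sketch of the new interface (not part of this PR): it compiles a trivial kernel from source at run time and launches it. The 'plusOne' kernel, the 'runPlusOne' wrapper, and the 'deviceData' argument are hypothetical, and the sketch assumes KernelSpec default-initializes its header-related fields and that the caller has set up the stream and device memory.

#include "velox/experimental/wave/common/Cuda.h"

using namespace facebook::velox::wave;

// Hypothetical helper: compiles and runs a one-block kernel on 'stream'.
void runPlusOne(Stream* stream, int* deviceData) {
  KernelSpec spec;
  // CUDA source compiled at run time by nvrtc.
  spec.code = R"(
extern "C" __global__ void plusOne(int* data) {
  data[threadIdx.x] += 1;
})";
  spec.filePath = "plusOne.cu";
  spec.entryPoints = {"plusOne"};
  auto module = CompiledModule::create(spec);
  // 'deviceData' is assumed to point to at least 256 device-resident ints.
  void* args[] = {&deviceData};
  // Kernel 0, one block of 256 threads, no dynamic shared memory.
  module->launch(0, 1, 256, 0, stream, args);
}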