Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: CUDA API bottleneck on newer CUDA versions on Linux #482

Merged
merged 1 commit into from
Jan 3, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 18 additions & 11 deletions src/common_host.cu
Original file line number Diff line number Diff line change
Expand Up @@ -225,15 +225,26 @@ bool cuda_supports_virtual_memory(int device) {
return supports_vmm != 0;
}

std::unordered_map<int, cudaDeviceProp>& cuda_device_properties() {
static auto* cuda_device_props = new std::unordered_map<int, cudaDeviceProp>{};
return *cuda_device_props;
}

// Returns the properties of `device`, calling cudaGetDeviceProperties only
// the first time a given device index is requested and serving later
// requests from the table returned by cuda_device_properties().
//
// The returned reference points into that table; unordered_map keeps element
// references valid across later insertions.
//
// NOTE(review): no synchronization is visible here; concurrent first-time
// calls would race on the table -- confirm callers or add locking.
const cudaDeviceProp& cuda_get_device_properties(int device) {
	auto& cache = cuda_device_properties();

	auto it = cache.find(device);
	if (it == cache.end()) {
		// Query into a local first: if CUDA_CHECK_THROW bails out on failure,
		// no blank entry is left behind to be served as valid data later.
		cudaDeviceProp props;
		CUDA_CHECK_THROW(cudaGetDeviceProperties(&props, device));
		it = cache.emplace(device, props).first;
	}

	return it->second;
}

// Returns the `name` field of the cached properties of `device` as a
// std::string. Goes through cuda_get_device_properties() so the driver is
// not queried again on every call.
std::string cuda_device_name(int device) {
	return cuda_get_device_properties(device).name;
}

// Returns the compute capability of `device` encoded as major * 10 + minor,
// read from the cached device properties.
uint32_t cuda_compute_capability(int device) {
	const auto& props = cuda_get_device_properties(device);
	return props.major * 10 + props.minor;
}

Expand All @@ -255,15 +266,11 @@ uint32_t cuda_supported_compute_capability(int device) {
}

// Returns the `sharedMemPerBlockOptin` field of the cached properties of
// `device`.
size_t cuda_max_shmem(int device) {
	return cuda_get_device_properties(device).sharedMemPerBlockOptin;
}

// Returns the `regsPerBlock` field of the cached properties of `device`,
// converted to uint32_t.
uint32_t cuda_max_registers(int device) {
	return static_cast<uint32_t>(cuda_get_device_properties(device).regsPerBlock);
}

size_t cuda_memory_granularity(int device) {
Expand Down