Skip to content

Commit

Permalink
hmem/cuda: avoid stub loading at runtime
Browse files Browse the repository at this point in the history
When the CUDA toolkit is installed, a set of "stub" libraries is
installed under /usr/local/cuda*/lib64/stubs/. These libraries include a
SONAME field with a `.1` suffix, but the filenames of these stubs are
bare, e.g.:

 > $ readelf -d /usr/local/cuda-12.5/lib64/stubs/libnvidia-ml.so | grep soname
 > 0x000000000000000e (SONAME)  Library soname: [libnvidia-ml.so.1]

The CUDA toolkit does not include any library file with the name
`libnvidia-ml.so.1` (or `libcuda.so.1`, etc.), as these are provided by
the driver package. This disconnect between the stub filename in the
toolkit and the SONAME within it is done intentionally to allow linking
with the stub at build time, while ensuring it's never loaded at
runtime.

In normal dynamic linking cases (i.e., without dlopen), the stub's SONAME
field, `libnvidia-ml.so.1`, is what gets recorded in the DT_NEEDED tag.
A file with that name can only come from a driver package, which ensures
that the stub library will never match at runtime.

Match this behavior by passing names with the `.1` suffix to dlopen where
appropriate for NVIDIA libraries.

Signed-off-by: Nicholas Sielicki <[email protected]>
  • Loading branch information
Nicholas Sielicki committed Dec 12, 2024
1 parent f03fe01 commit eb04700
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 18 deletions.
8 changes: 4 additions & 4 deletions fabtests/common/hmem_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,15 @@ int ft_cuda_init(void)
cudaError_t cuda_ret;
int ret;

cudart_handle = dlopen("libcudart.so", RTLD_NOW);
cudart_handle = dlopen("libcudart.so.12", RTLD_NOW);
if (!cudart_handle) {
FT_ERR("Failed to dlopen libcudart.so");
FT_ERR("Failed to dlopen libcudart.so.12");
goto err;
}

cuda_handle = dlopen("libcuda.so", RTLD_NOW);
cuda_handle = dlopen("libcuda.so.1", RTLD_NOW);
if (!cuda_handle) {
FT_ERR("Failed to dlopen libcuda.so\n");
FT_ERR("Failed to dlopen libcuda.so.1\n");
goto err_dlclose_cudart;
}

Expand Down
4 changes: 2 additions & 2 deletions prov/psm3/psm3/psm.c
Original file line number Diff line number Diff line change
Expand Up @@ -295,10 +295,10 @@ int psmi_cuda_lib_load()
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice);

/* CUDA Runtime */
psmi_cudart_lib = dlopen("libcudart.so", RTLD_LAZY);
psmi_cudart_lib = dlopen("libcudart.so.12", RTLD_LAZY);
if (!psmi_cudart_lib) {
dlerr = dlerror();
_HFI_ERROR("Unable to open libcudart.so. Error %s\n",
_HFI_ERROR("Unable to open libcudart.so.12. Error %s\n",
dlerr ? dlerr : "no dlerror()");
goto fail;
}
Expand Down
19 changes: 7 additions & 12 deletions src/hmem_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -480,29 +480,24 @@ static int cuda_hmem_dl_init(void)
/* Assume failure to dlopen CUDA runtime is caused by the library not
* being found. Thus, CUDA is not supported.
*/
cuda_attr.runtime_handle = dlopen("libcudart.so", RTLD_NOW);
cuda_attr.runtime_handle = dlopen("libcudart.so.12", RTLD_NOW);
if (!cuda_attr.runtime_handle) {
FI_INFO(&core_prov, FI_LOG_CORE,
"Failed to dlopen libcudart.so\n");
"Failed to dlopen libcudart.so.12\n");
return -FI_ENOSYS;
}

cuda_attr.driver_handle = dlopen("libcuda.so", RTLD_NOW);
cuda_attr.driver_handle = dlopen("libcuda.so.1", RTLD_NOW);
if (!cuda_attr.driver_handle) {
FI_WARN(&core_prov, FI_LOG_CORE,
"Failed to dlopen libcuda.so\n");
"Failed to dlopen libcuda.so.1\n");
goto err_dlclose_cuda_runtime;
}

cuda_attr.nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW);
cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
if (!cuda_attr.nvml_handle) {
FI_INFO(&core_prov, FI_LOG_CORE,
"Failed to dlopen libnvidia-ml.so. Trying libnvidia-ml.so.1\n");
cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
if (!cuda_attr.nvml_handle) {
FI_WARN(&core_prov, FI_LOG_CORE,
"Failed to dlopen libnvidia-ml.so or libnvidia-ml.so.1, bypassing nvml calls\n");
}
FI_WARN(&core_prov, FI_LOG_CORE,
"Failed to dlopen libnvidia-ml.so.1, bypassing nvml calls\n");
}

CUDA_DRIVER_FUNCS_DEF(CUDA_DRIVER_FUNCS_DLOPEN)
Expand Down

0 comments on commit eb04700

Please sign in to comment.