From 9180553cfd37cad258fcd998e939b5648ee9225a Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 8 Sep 2024 09:51:04 -0700 Subject: [PATCH] fix(cuda): avoid stub loading at runtime When the CUDA toolkit is installed, a set of "stub" libraries is installed under /usr/local/cuda*/lib64/stubs/. These libraries include a SONAME field with a `.1` suffix, but the filenames of these stubs are bare, e.g.: > $ readelf -d /usr/local/cuda-12.5/lib64/stubs/libnvidia-ml.so | grep soname > 0x000000000000000e (SONAME) Library soname: [libnvidia-ml.so.1] The CUDA toolkit does not include any library file with the name `libnvidia-ml.so.1` (or `libcuda.so.1`, etc.), as these are provided by the driver package. This disconnect between the stub filename in the toolkit and the SONAME within it is intentional: it allows linking against the stub at build time while ensuring the stub is never loaded at runtime. In normal dynamic linking cases (i.e., without dlopen), the stub's SONAME, `libnvidia-ml.so.1`, is recorded in the DT_NEEDED tag; a file with that name can only come from a driver package, which ensures that the stub library will never match. Match that behavior by passing the `.1`-suffixed names to dlopen where appropriate for NVIDIA libraries. 
Signed-off-by: Nicholas Sielicki --- fabtests/common/hmem_cuda.c | 4 ++-- src/hmem_cuda.c | 15 +++++---------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c index 2f02b6f474c..e4aef962fb6 100644 --- a/fabtests/common/hmem_cuda.c +++ b/fabtests/common/hmem_cuda.c @@ -157,9 +157,9 @@ int ft_cuda_init(void) goto err; } - cuda_handle = dlopen("libcuda.so", RTLD_NOW); + cuda_handle = dlopen("libcuda.so.1", RTLD_NOW); if (!cuda_handle) { - FT_ERR("Failed to dlopen libcuda.so\n"); + FT_ERR("Failed to dlopen libcuda.so.1\n"); goto err_dlclose_cudart; } diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c index 1c8abb03285..1e3b9cdc10c 100644 --- a/src/hmem_cuda.c +++ b/src/hmem_cuda.c @@ -487,22 +487,17 @@ static int cuda_hmem_dl_init(void) return -FI_ENOSYS; } - cuda_attr.driver_handle = dlopen("libcuda.so", RTLD_NOW); + cuda_attr.driver_handle = dlopen("libcuda.so.1", RTLD_NOW); if (!cuda_attr.driver_handle) { FI_WARN(&core_prov, FI_LOG_CORE, - "Failed to dlopen libcuda.so\n"); + "Failed to dlopen libcuda.so.1\n"); goto err_dlclose_cuda_runtime; } - cuda_attr.nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW); + cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW); if (!cuda_attr.nvml_handle) { - FI_INFO(&core_prov, FI_LOG_CORE, - "Failed to dlopen libnvidia-ml.so. Trying libnvidia-ml.so.1\n"); - cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW); - if (!cuda_attr.nvml_handle) { - FI_WARN(&core_prov, FI_LOG_CORE, - "Failed to dlopen libnvidia-ml.so or libnvidia-ml.so.1, bypassing nvml calls\n"); - } + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to dlopen libnvidia-ml.so.1, bypassing nvml calls\n"); } CUDA_DRIVER_FUNCS_DEF(CUDA_DRIVER_FUNCS_DLOPEN)