hmem/cuda: avoid stub loading at runtime

When the CUDA toolkit is installed, a set of "stub" libraries is installed under /usr/local/cuda*/lib64/stubs/. These libraries include a SONAME field with a `.1` suffix, but the filenames of these stubs are bare, e.g.:

> $ readelf -d /usr/local/cuda-12.5/lib64/stubs/libnvidia-ml.so | grep soname
> 0x000000000000000e (SONAME) Library soname: [libnvidia-ml.so.1]

The CUDA toolkit does not include any library file with the name `libnvidia-ml.so.1` (or `libcuda.so.1`, etc.), as these are provided by the driver package.

This disconnect between the stub filename in the toolkit and the SONAME within it is done intentionally to allow linking with the stub at build time, while ensuring it's never loaded at runtime. In normal dynamic linking cases (i.e., without dlopen), the SONAME `libnvidia-ml.so.1` is what gets recorded in the DT_NEEDED tag; a file with that name can only come from a driver package, which ensures that the stub library will never match.

Match the same behavior and provide `.1` suffixes to dlopen where appropriate for NVIDIA libraries.

Signed-off-by: Nicholas Sielicki <[email protected]>
ofiwg · Dec 12, 2024 · eb04700 · eb04700
1 parent f03fe01
commit eb04700
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 18 deletions.
diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c
@@ -151,15 +151,15 @@ int ft_cuda_init(void)
 	cudaError_t cuda_ret;
 	int ret;
 
-	cudart_handle = dlopen("libcudart.so", RTLD_NOW);
+	cudart_handle = dlopen("libcudart.so.12", RTLD_NOW);
 	if (!cudart_handle) {
-		FT_ERR("Failed to dlopen libcudart.so");
+		FT_ERR("Failed to dlopen libcudart.so.12");
 		goto err;
 	}
 
-	cuda_handle = dlopen("libcuda.so", RTLD_NOW);
+	cuda_handle = dlopen("libcuda.so.1", RTLD_NOW);
 	if (!cuda_handle) {
-		FT_ERR("Failed to dlopen libcuda.so\n");
+		FT_ERR("Failed to dlopen libcuda.so.1\n");
 		goto err_dlclose_cudart;
 	}
 

diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c
@@ -295,10 +295,10 @@ int psmi_cuda_lib_load()
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice);
 
 	/* CUDA Runtime */
-	psmi_cudart_lib = dlopen("libcudart.so", RTLD_LAZY);
+	psmi_cudart_lib = dlopen("libcudart.so.12", RTLD_LAZY);
 	if (!psmi_cudart_lib) {
 		dlerr = dlerror();
-		_HFI_ERROR("Unable to open libcudart.so.  Error %s\n",
+		_HFI_ERROR("Unable to open libcudart.so.12.  Error %s\n",
 				dlerr ? dlerr : "no dlerror()");
 		goto fail;
 	}

diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c
@@ -480,29 +480,24 @@ static int cuda_hmem_dl_init(void)
 	/* Assume failure to dlopen CUDA runtime is caused by the library not
 	 * being found. Thus, CUDA is not supported.
 	 */
-	cuda_attr.runtime_handle = dlopen("libcudart.so", RTLD_NOW);
+	cuda_attr.runtime_handle = dlopen("libcudart.so.12", RTLD_NOW);
 	if (!cuda_attr.runtime_handle) {
 		FI_INFO(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libcudart.so\n");
+			"Failed to dlopen libcudart.so.12\n");
 		return -FI_ENOSYS;
 	}
 
-	cuda_attr.driver_handle = dlopen("libcuda.so", RTLD_NOW);
+	cuda_attr.driver_handle = dlopen("libcuda.so.1", RTLD_NOW);
 	if (!cuda_attr.driver_handle) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libcuda.so\n");
+			"Failed to dlopen libcuda.so.1\n");
 		goto err_dlclose_cuda_runtime;
 	}
 
-	cuda_attr.nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW);
+	cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
 	if (!cuda_attr.nvml_handle) {
-		FI_INFO(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libnvidia-ml.so.  Trying libnvidia-ml.so.1\n");
-		cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
-		if (!cuda_attr.nvml_handle) {
-			FI_WARN(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libnvidia-ml.so or libnvidia-ml.so.1, bypassing nvml calls\n");
-		}
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to dlopen libnvidia-ml.so.1, bypassing nvml calls\n");
 	}
 
 	CUDA_DRIVER_FUNCS_DEF(CUDA_DRIVER_FUNCS_DLOPEN)