Merge pull request #682 from rapidsai/branch-0.18
[RELEASE] ucx-py v0.18
raydouglass authored Feb 24, 2021
2 parents 9be9eba + e17a4dc commit 94329a6
Showing 6 changed files with 60 additions and 12 deletions.
File renamed without changes.
2 changes: 1 addition & 1 deletion docs/source/deployment.rst
@@ -16,7 +16,7 @@ Many containers with a shared IPC namespace
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

If you wish to isolate your processes into multiple containers and
-expose one more more GPUs to each container you need to ensure they are
+expose one or more GPUs to each container you need to ensure they are
using a shared IPC namespace.

In a Docker configuration you can mark one container as having a
39 changes: 32 additions & 7 deletions docs/source/install.rst
@@ -31,7 +31,6 @@ Without GPU support:
conda create -n ucx -c conda-forge -c rapidsai \
ucx-proc=*=cpu ucx ucx-py python=3.7

-Note: These use UCX's ``v1.8.x`` branch.

Source
------
@@ -64,19 +63,45 @@ Test Dependencies
cupy "numba>=0.46" rmm \
distributed

-UCX
-~~~
+UCX-1.9
+~~~~~~~

Instructions for building UCX 1.9:

::

conda activate ucx
git clone https://github.com/openucx/ucx
cd ucx
git checkout v1.9.x
# apply UCX IB registration cache patch, improves overall
# CUDA IB performance when using a memory pool
curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/11ad7a3c1f25514df8064930f69c310be4fd55dc/recipe/cuda-alloc-rcache.patch
git apply cuda-alloc-rcache.patch
./autogen.sh
mkdir build
cd build
# Performance build
    # Performance build
    ../contrib/configure-release --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt CPPFLAGS="-I$CUDA_HOME/include"
    # Debug build
    ../contrib/configure-devel --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt CPPFLAGS="-I$CUDA_HOME/include"
make -j install

UCX-1.8
~~~~~~~

Instructions for building UCX 1.8:

::

conda activate ucx
git clone https://github.com/openucx/ucx
cd ucx
git checkout v1.8.x
-# apply UCX IB registration cache patches, improves overall
+# apply UCX IB registration cache patch, improves overall
# CUDA IB performance when using a memory pool
-curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/add-page-alignment.patch
-curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/ib_registration_cache.patch
+curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/bd0377fb7363fd0ddbc3d506ae3414ef6f2e2f50/recipe/add-page-alignment.patch
+curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/bd0377fb7363fd0ddbc3d506ae3414ef6f2e2f50/recipe/ib_registration_cache.patch
git apply ib_registration_cache.patch && git apply add-page-alignment.patch
./autogen.sh
mkdir build
@@ -88,7 +113,7 @@ UCX
make -j install

.. note::
-   If you're running on a machine without CUDA then you _must NOT_ apply the patches.
+   If you're running on a machine without CUDA then you _must NOT_ apply any of the patches above.

UCX + OFED
~~~~~~~~~~
24 changes: 23 additions & 1 deletion docs/source/ucx-debug.rst
@@ -18,7 +18,23 @@ System Configuration
mlx5_2 port 1 ==> ib2 (Up)
mlx5_3 port 1 ==> ib3 (Up)

-``ucx_info -d`` and ``ucx_info -p -u t`` are helpful commands to display what UCX understands about the underlying hardware
+``ucx_info -d`` and ``ucx_info -p -u t`` are helpful commands to display what UCX understands about the underlying hardware.
+For example, we can check whether UCX has been built with ``RDMA`` support and whether it is available.

::

user@pdgx:~$ ucx_info -d | grep -i rdma
# Memory domain: rdmacm
# Component: rdmacm
# Connection manager: rdmacm


user@dgx:~$ ucx_info -b | grep -i rdma
#define HAVE_DECL_RDMA_ESTABLISH 1
#define HAVE_DECL_RDMA_INIT_QP_ATTR 1
#define HAVE_RDMACM_QP_LESS 1
#define UCX_CONFIGURE_FLAGS "--disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/gpfs/fs1/user/miniconda3/envs/ucx-dev --with-sysroot --enable-cma --enable-mt --enable-numa --with-gnu-ld --with-rdmacm --with-verbs --with-cuda=/gpfs/fs1/SHARE/Utils/CUDA/10.2.89.0_440.33.01"
#define uct_MODULES ":cuda:ib:rdmacm:cma"
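When this check needs to run in a script rather than by eye, the ``#define`` lines above are easy to scrape. A small helper sketch (hypothetical, not part of ucx-py) that turns ``ucx_info -b`` output into a dict:

```python
def parse_build_flags(ucx_info_b: str) -> dict:
    """Parse `ucx_info -b` output ("#define NAME VALUE" lines) into a
    dict so scripts can assert on build-time features such as RDMACM."""
    flags = {}
    for line in ucx_info_b.splitlines():
        parts = line.split(None, 2)  # "#define", NAME, VALUE
        if len(parts) == 3 and parts[0] == "#define":
            flags[parts[1]] = parts[2]
    return flags


sample = '#define HAVE_RDMACM_QP_LESS 1\n#define uct_MODULES ":cuda:ib:rdmacm:cma"'
flags = parse_build_flags(sample)
print("rdmacm" in flags.get("uct_MODULES", ""))  # True
```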


InfiniBand Performance
@@ -118,3 +134,9 @@ Experimental Debugging
A list of problems we have run into along the way while trying to understand performance issues with UCX/UCX-Py:

- System-wide environment variable settings. For example, we saw a system with ``UCX_MEM_MMAP_HOOK_MODE`` set to ``none``; unsetting this variable resolved the problems: https://github.com/rapidsai/ucx-py/issues/616 . One can quickly check system-wide variables with ``env | grep ^UCX_``.


- ``sockcm_iface.c:257 Fatal: sockcm_listener: unable to create handler for new connection``. This is an error we've seen when limits are placed on the number
  of file descriptors, and it occurs when ``SOCKCM`` is used for establishing connections. Users have two choices for resolving this issue: increase the
  ``open files`` limit (check the ``ulimit`` configuration) or use ``RDMACM`` for establishing connections (``UCX_SOCKADDR_TLS_PRIORITY=rdmacm``). Note that ``RDMACM``
  is only available with InfiniBand devices.
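Both checks above are easy to automate before launching a workload. A quick self-contained sketch using only the standard library (Unix-only; ``debug_snapshot`` is a hypothetical helper, not a ucx-py API):

```python
import os
import resource


def debug_snapshot(environ=None):
    """Gather the two quick checks from the list above: system-wide
    UCX_* overrides and the open-files (``ulimit -n``) limits."""
    environ = os.environ if environ is None else environ
    ucx_vars = {k: v for k, v in environ.items() if k.startswith("UCX_")}
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    return {"ucx_env": ucx_vars, "nofile_soft": soft, "nofile_hard": hard}


# Explicit mapping for illustration; pass nothing to inspect os.environ.
snap = debug_snapshot({"UCX_MEM_MMAP_HOOK_MODE": "none", "PATH": "/usr/bin"})
print(snap["ucx_env"])  # {'UCX_MEM_MMAP_HOOK_MODE': 'none'}
```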
6 changes: 4 additions & 2 deletions ucp/_libs/ucx_api.pyx
@@ -111,7 +111,7 @@ cdef dict ucx_config_to_dict(ucp_config_t *config):
    if fflush(text_fd) != 0:
        clearerr(text_fd)
        raise IOError("fflush() failed on memory stream")
-    py_text = text.decode()
+    py_text = text.decode(errors="ignore")
    for line in py_text.splitlines():
        k, v = line.split("=")
        k = k[4:]  # Strip "UCX_" prefix
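The ``errors="ignore"`` change hardens the decode of UCX's config dump. For context, the surrounding parsing loop can be sketched in pure Python (a hypothetical helper, not the library's API; it also splits only on the first ``=`` in case a value contains one):

```python
def parse_ucx_config(text: str) -> dict:
    """Parse "UCX_KEY=VALUE" lines (as printed by UCX's config dump)
    into a dict, stripping the "UCX_" prefix like the Cython loop above."""
    config = {}
    for line in text.splitlines():
        k, v = line.split("=", 1)  # split once, in case the value contains "="
        config[k[4:]] = v          # drop the leading "UCX_"
    return config


print(parse_ucx_config("UCX_TLS=all\nUCX_MEMTYPE_CACHE=n"))
# {'TLS': 'all', 'MEMTYPE_CACHE': 'n'}
```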
@@ -534,6 +534,8 @@ def _ucx_endpoint_finalizer(uintptr_t handle_as_int, worker, set inflight_msgs):
    cdef str msg
    status = ucp_ep_close_nb(handle, UCP_EP_CLOSE_MODE_FLUSH)
    if UCS_PTR_IS_PTR(status):
+        while ucp_request_check_status(status) == UCS_INPROGRESS:
+            worker.progress()
        ucp_request_free(status)
    elif UCS_PTR_STATUS(status) != UCS_OK:
        msg = ucs_status_string(UCS_PTR_STATUS(status)).decode("utf-8")
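The added ``while``/``progress()`` loop follows a common UCX pattern: a non-blocking close returns a request that only completes once the worker's progress engine is driven. A toy sketch of that control flow (every name here is a stand-in, not the real ucx-py API):

```python
UCS_OK, UCS_INPROGRESS = 0, 1  # stand-ins for the UCX status codes


class FakeWorker:
    """Toy worker whose pending close request completes only after a few
    progress() calls, mimicking ucp_worker_progress driving a request."""

    def __init__(self, steps=3):
        self._steps = steps

    def progress(self):
        if self._steps > 0:
            self._steps -= 1

    def check_status(self):
        return UCS_INPROGRESS if self._steps > 0 else UCS_OK


def close_blocking(worker):
    # Same shape as the added lines: spin on progress() until the
    # non-blocking close request reports completion, then free it.
    spins = 0
    while worker.check_status() == UCS_INPROGRESS:
        worker.progress()
        spins += 1
    return spins


print(close_blocking(FakeWorker()))  # 3
```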
@@ -579,7 +581,7 @@ cdef class UCXEndpoint(UCXObject):
        if fflush(text_fd) != 0:
            clearerr(text_fd)
            raise IOError("fflush() failed on memory stream")
-        py_text = text.decode()
+        py_text = text.decode(errors="ignore")
    finally:
        if fclose(text_fd) != 0:
            free(text)
1 change: 0 additions & 1 deletion ucp/utils.py
@@ -161,7 +161,6 @@ async def server_handler(ep):

    loop = asyncio.get_event_loop()
    ret = loop.run_until_complete(run())
-    loop.close()
    queue.put(ret)
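Dropping ``loop.close()`` matters because the helper can run again in the same process: a closed loop makes any later ``run_until_complete`` raise. A minimal self-contained sketch of the pattern (``worker`` and its coroutine are illustrative stand-ins, not ucp's actual helpers):

```python
import asyncio
import queue


def worker(q):
    """Run a coroutine to completion and hand its result back via a
    queue, leaving the event loop open for any later calls."""

    async def run():
        await asyncio.sleep(0)
        return "done"

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    ret = loop.run_until_complete(run())
    # No loop.close() here: closing would break a subsequent
    # run_until_complete() on the same loop in this process.
    q.put(ret)


q = queue.Queue()
worker(q)
print(q.get())  # done
```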


