From 17b99ba215488a999ce4d25cfaaea1d4374d5410 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 24 Nov 2020 15:47:08 -0500 Subject: [PATCH 1/9] DOC v0.18 Updates From 792ba13a8966531a85fbc92582509c8322169cea Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 8 Dec 2020 18:34:29 +0100 Subject: [PATCH 2/9] Move test_endpoint_error_callback to main debug-tests dir (#664) --- .../debug-tests => debug-tests}/test_endpoint_error_callback.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {tests/debug-tests => debug-tests}/test_endpoint_error_callback.py (100%) diff --git a/tests/debug-tests/test_endpoint_error_callback.py b/debug-tests/test_endpoint_error_callback.py similarity index 100% rename from tests/debug-tests/test_endpoint_error_callback.py rename to debug-tests/test_endpoint_error_callback.py From e69db5bac4d256099cdcfb527d17537f97f7a34f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 28 Jan 2021 15:48:41 +0100 Subject: [PATCH 3/9] Ensure `ucp_ep_close_nb` is awaited for (#671) --- ucp/_libs/ucx_api.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index f722342bb..6d23563da 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -534,6 +534,8 @@ def _ucx_endpoint_finalizer(uintptr_t handle_as_int, worker, set inflight_msgs): cdef str msg status = ucp_ep_close_nb(handle, UCP_EP_CLOSE_MODE_FLUSH) if UCS_PTR_IS_PTR(status): + while ucp_request_check_status(status) == UCS_INPROGRESS: + worker.progress() ucp_request_free(status) elif UCS_PTR_STATUS(status) != UCS_OK: msg = ucs_status_string(UCS_PTR_STATUS(status)).decode("utf-8") From f1145b7396fa5a4249564efef7877b9d329abfd5 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 29 Jan 2021 14:48:04 -0800 Subject: [PATCH 4/9] Prevent closing event_loop too early This fixes warnings such as: Task was destroyed but it is pending! 
task: > --- ucp/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ucp/utils.py b/ucp/utils.py index f5fefe286..d300236e6 100644 --- a/ucp/utils.py +++ b/ucp/utils.py @@ -161,7 +161,6 @@ async def server_handler(ep): loop = asyncio.get_event_loop() ret = loop.run_until_complete(run()) - loop.close() queue.put(ret) From 00997f6e7cfdaf2ca841764f222054e4d3671f49 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 4 Feb 2021 17:07:59 +0100 Subject: [PATCH 5/9] Fix character decoding (#675) * Fix character decoding This resolves errors such as below: asyncio:base_events.py:1707 Task exception was never retrieved future: exception=UnicodeDecodeError('utf-8', b'#\n# UCP endpoint\n#\n# peer: \n# lane[0]: 255:\xc0\x7f/.0 md[60] -> md[255]/posix \n# lane[1]: 0:tcp/enp1s0f0.0 md[0] -> md[0]/tcp am am_bw#0 wireup\n# lane[2]: 1:cuda_copy/cuda.0 md[1] -> md[1]/cuda_cpy rma_bw#0\n#\n# tag_send: 0....8185....8192....(inf)\n# tag_send_nbr: 0....8185....8192....(inf)\n# tag_send_sync: 0....8185....8192....(inf)\n#\n# rma_bw: mds rndv_rkey_size 9\n#\n', 88, 89, 'invalid start byte')> Traceback (most recent call last): File "ucp/core.py", line 200, in _listener_handler_coroutine await func(ep) File "tests/test_multiple_nodes.py", line 21, in server_node await hello(ep) File "test_multiple_nodes.py", line 17, in hello assert isinstance(ep.ucx_info(), str) File "core.py", line 655, in ucx_info return self._ep.info() File "ucp/_libs/ucx_api.pyx", line 584, in ucp._libs.ucx_api.UCXEndpoint.info py_text = text.decode() UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 88: invalid start byte * Ignore errors in character decoding --- ucp/_libs/ucx_api.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 6d23563da..d2c0698b0 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -111,7 +111,7 @@ cdef dict ucx_config_to_dict(ucp_config_t *config): if fflush(text_fd) != 0: 
clearerr(text_fd) raise IOError("fflush() failed on memory stream") - py_text = text.decode() + py_text = text.decode(errors="ignore") for line in py_text.splitlines(): k, v = line.split("=") k = k[4:] # Strip "UCX_" prefix @@ -581,7 +581,7 @@ cdef class UCXEndpoint(UCXObject): if fflush(text_fd) != 0: clearerr(text_fd) raise IOError("fflush() failed on memory stream") - py_text = text.decode() + py_text = text.decode(errors="ignore") finally: if fclose(text_fd) != 0: free(text) From 30c6262a147873cbedcc4b08c1aebd6978018df7 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 8 Feb 2021 02:12:37 -0800 Subject: [PATCH 6/9] Update install instructions for new UCX version (#666) * Update install instructions for new UCX version * add instructions for 1.8 as well as 1.9 ucx building * Update phrasing on IB registration cache patch Co-authored-by: Benjamin Zaitlen Co-authored-by: Peter Andreas Entschev --- docs/source/install.rst | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/docs/source/install.rst b/docs/source/install.rst index ac889292a..393a3375c 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -31,7 +31,6 @@ Without GPU support: conda create -n ucx -c conda-forge -c rapidsai \ ucx-proc=*=cpu ucx ucx-py python=3.7 -Note: These use UCX's ``v1.8.x`` branch. 
Source ------ @@ -64,8 +63,34 @@ Test Dependencies cupy "numba>=0.46" rmm \ distributed -UCX -~~~ +UCX-1.9 +~~~~~~~ + +Instructions for building ucx 1.9: + +:: + + conda activate ucx + git clone https://github.com/openucx/ucx + cd ucx + git checkout v1.9.x + # apply UCX IB registration cache patch, improves overall + # CUDA IB performance when using a memory pool + curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/11ad7a3c1f25514df8064930f69c310be4fd55dc/recipe/cuda-alloc-rcache.patch + git apply cuda-alloc-rcache.patch + ./autogen.sh + mkdir build + cd build + # Performance build + ../contrib/configure-release --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt CPPFLAGS="-I/$CUDA_HOME/include" + # Debug build + ../contrib/configure-devel --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt CPPFLAGS="-I/$CUDA_HOME/include" + make -j install + +UCX-1.8 +~~~~~~~ + +Instructions for building ucx 1.8: :: @@ -73,10 +98,10 @@ UCX git clone https://github.com/openucx/ucx cd ucx git checkout v1.8.x - # apply UCX IB registration cache patches, improves overall + # apply UCX IB registration cache patch, improves overall # CUDA IB performance when using a memory pool - curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/add-page-alignment.patch - curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/ib_registration_cache.patch + curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/bd0377fb7363fd0ddbc3d506ae3414ef6f2e2f50/recipe/add-page-alignment.patch + curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/bd0377fb7363fd0ddbc3d506ae3414ef6f2e2f50/recipe/ib_registration_cache.patch git apply ib_registration_cache.patch && git apply add-page-alignment.patch ./autogen.sh mkdir build @@ -88,7 +113,7 @@ UCX make -j install .. 
note:: - If you're running on a machine without CUDA then you _must NOT_ apply the patches. + If you're running on a machine without CUDA then you *must NOT* apply any of the patches above. UCX + OFED ~~~~~~~~~~ From aea3000fdfa5899d04b181db003596701804052c Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 8 Feb 2021 10:08:09 -0500 Subject: [PATCH 7/9] fix typo and add note on sockcm error --- docs/source/deployment.rst | 2 +- docs/source/ucx-debug.rst | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/deployment.rst b/docs/source/deployment.rst index fa61e9013..ae492d575 100644 --- a/docs/source/deployment.rst +++ b/docs/source/deployment.rst @@ -16,7 +16,7 @@ Many containers with a shared IPC namespace ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If you wish to isolate your processes into multiple containers and -expose one more more GPUs to each container you need to ensure they are +expose one or more GPUs to each container you need to ensure they are using a shared IPC namespace. In a Docker configuration you can mark one container as having a diff --git a/docs/source/ucx-debug.rst b/docs/source/ucx-debug.rst index 7d9b6b89b..87182a257 100644 --- a/docs/source/ucx-debug.rst +++ b/docs/source/ucx-debug.rst @@ -18,7 +18,8 @@ System Configuration mlx5_2 port 1 ==> ib2 (Up) mlx5_3 port 1 ==> ib3 (Up) -``ucx_info -d`` and ``ucx_info -p -u t`` are helpful commands to display what UCX understands about the underlying hardware +``ucx_info -d`` and ``ucx_info -p -u t`` are helpful commands to display what UCX understands about the underlying hardware. +For example, we can check InfiniBand Performance @@ -118,3 +119,8 @@ Experimental Debugging A list of problems we have run into along the way while trying to understand performance issues with UCX/UCX-Py: - System-wide settings environment variables. For example, we saw a system with ``UCX_MEM_MMAP_HOOK_MODE`` set to ``none``. 
Unsetting this env var resolved problems: https://github.com/rapidsai/ucx-py/issues/616 . One can quickly check system wide variables with ``env|grep ^UCX_``. + + +- ``sockcm_iface.c:257 Fatal: sockcm_listener: unable to create handler for new connection``. This is an error we've seen when limits are place on the number +of file descriptors. User have two choices for resolving this issue: increase the ``open files`` limit (check ulimit configuration) or use ``RDMACM`` when establishing +a connection ``UCX_SOCKADDR_TLS_PRIORITY=rdmacm`` From ce628bce93a94dd9bdce874753542b85bb56af7f Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 8 Feb 2021 10:45:04 -0500 Subject: [PATCH 8/9] more doc updates --- docs/source/ucx-debug.rst | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/source/ucx-debug.rst b/docs/source/ucx-debug.rst index 87182a257..efee3ed4c 100644 --- a/docs/source/ucx-debug.rst +++ b/docs/source/ucx-debug.rst @@ -19,7 +19,22 @@ System Configuration mlx5_3 port 1 ==> ib3 (Up) ``ucx_info -d`` and ``ucx_info -p -u t`` are helpful commands to display what UCX understands about the underlying hardware. -For example, we can check +For example, we can check if UCX has been built correctly with ``RDMA`` and if it is available. 
+ +:: + + user@pdgx:~$ ucx_info -d | grep -i rdma + # Memory domain: rdmacm + # Component: rdmacm + # Connection manager: rdmacm + + + user@dgx:~$ ucx_info -b | grep -i rdma + #define HAVE_DECL_RDMA_ESTABLISH 1 + #define HAVE_DECL_RDMA_INIT_QP_ATTR 1 + #define HAVE_RDMACM_QP_LESS 1 + #define UCX_CONFIGURE_FLAGS "--disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/gpfs/fs1/user/miniconda3/envs/ucx-dev --with-sysroot --enable-cma --enable-mt --enable-numa --with-gnu-ld --with-rdmacm --with-verbs --with-cuda=/gpfs/fs1/SHARE/Utils/CUDA/10.2.89.0_440.33.01" + #define uct_MODULES ":cuda:ib:rdmacm:cma" InfiniBand Performance @@ -122,5 +137,6 @@ A list of problems we have run into along the way while trying to understand per - ``sockcm_iface.c:257 Fatal: sockcm_listener: unable to create handler for new connection``. This is an error we've seen when limits are place on the number -of file descriptors. User have two choices for resolving this issue: increase the ``open files`` limit (check ulimit configuration) or use ``RDMACM`` when establishing -a connection ``UCX_SOCKADDR_TLS_PRIORITY=rdmacm`` +of file descriptors and occurs when ``SOCKCM`` is used for establishing connections. User have two choices for resolving this issue: increase the +``open files`` limit (check ulimit configuration) or use ``RDMACM`` when establishing a connection ``UCX_SOCKADDR_TLS_PRIORITY=rdmacm``. ``RDMACM`` +is only available also using InfiniBand devices. 
From 35c0c997a6b7acc9f3c14f63c3fa65af3cf41485 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 8 Feb 2021 12:09:15 -0500 Subject: [PATCH 9/9] Update docs/source/ucx-debug.rst Co-authored-by: Peter Andreas Entschev --- docs/source/ucx-debug.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/ucx-debug.rst b/docs/source/ucx-debug.rst index efee3ed4c..8cb6a3646 100644 --- a/docs/source/ucx-debug.rst +++ b/docs/source/ucx-debug.rst @@ -139,4 +139,4 @@ A list of problems we have run into along the way while trying to understand per - ``sockcm_iface.c:257 Fatal: sockcm_listener: unable to create handler for new connection``. This is an error we've seen when limits are place on the number of file descriptors and occurs when ``SOCKCM`` is used for establishing connections. User have two choices for resolving this issue: increase the ``open files`` limit (check ulimit configuration) or use ``RDMACM`` when establishing a connection ``UCX_SOCKADDR_TLS_PRIORITY=rdmacm``. ``RDMACM`` -is only available also using InfiniBand devices. +is only available using InfiniBand devices.