Merge pull request #835 from rapidsai/branch-0.25

rapidsai · Apr 6, 2022 · 6f3f750
2 parents 25dd97f + a16f8a2
commit 6f3f750
Show file tree

Hide file tree

Showing 51 changed files with 531 additions and 1,573 deletions.
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -6,13 +6,13 @@ build:
     - libnuma1
 
 python:
-  version: 3.7
+  version: 3.8
   install:
     - method: setuptools
       path: .
 
 conda:
-  environment: conda/environments/builddocs_py37.yml
+  environment: conda/environments/builddocs.yml
 
 sphinx:
   configuration: docs/source/conf.py
diff --git a/README.md b/README.md
@@ -20,17 +20,14 @@ In order to use TCP add `tcp` to `UCX_TLS` and set `UCXPY_IFNAME` to the network
 
 ```bash
 # TCP using "eth0" and CUDA support
-export UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc
-export UCX_SOCKADDR_TLS_PRIORITY=sockcm
+export UCX_TLS=tcp,cuda_copy,cuda_ipc
 export UCXPY_IFNAME="eth0"
 
 # InfiniBand using "ib0" and CUDA support
-export UCX_TLS=rc,sockcm,cuda_copy,cuda_ipc
-export UCX_SOCKADDR_TLS_PRIORITY=sockcm
+export UCX_TLS=rc,cuda_copy,cuda_ipc
 export UCXPY_IFNAME="ib0"
 
 # TCP using "eno0" and no CUDA support
-export UCX_TLS=tcp,sockcm
-export UCX_SOCKADDR_TLS_PRIORITY=sockcm
+export UCX_TLS=tcp
 export UCXPY_IFNAME="eno0"
 ```
diff --git a/benchmarks/cudf-merge.py b/benchmarks/cudf-merge.py
@@ -239,13 +239,6 @@ def parse_args():
         type=float,
         help="Fraction of rows that matches (default 0.3)",
     )
-    parser.add_argument(
-        "--net-devices",
-        metavar="LIST",
-        default=None,
-        type=str,
-        help='List of net devices to use, one for each device or "auto"',
-    )
     parser.add_argument(
         "--profile",
         metavar="FILENAME",
@@ -275,11 +268,6 @@ def parse_args():
             f"Number of chunks must be greater than 1 (chunks-per-dev: \
                     {args.chunks_per_dev}, devs: {args.devs})"
         )
-    if args.net_devices == "auto":
-        args.net_devices = [ucp.utils.get_closest_net_devices(d) for d in args.devs]
-    elif args.net_devices is not None:
-        args.net_devices = args.net_devices.split(",")
-        assert len(args.net_devices) == len(args.devs)
     return args
 
 
@@ -289,18 +277,8 @@ def main():
     assert len(ranks) > 1
     assert len(ranks) % 2 == 0
 
-    ucx_options_list = None
-    if args.net_devices is not None:
-        ucx_options_list = [
-            {"NET_DEVICES": args.net_devices[rank % len(args.devs)]} for rank in ranks
-        ]
-
     stats = run_on_local_network(
-        args.n_chunks,
-        worker,
-        worker_args=args,
-        server_address=args.server_address,
-        ucx_options_list=ucx_options_list,
+        args.n_chunks, worker, worker_args=args, server_address=args.server_address,
     )
 
     wc = stats[0]["wallclock"]

diff --git a/benchmarks/send-recv-core.py b/benchmarks/send-recv-core.py
@@ -1,32 +1,11 @@
 """
-Benchmark send receive on one machine (UCX < 1.10):
-UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc UCX_SOCKADDR_TLS_PRIORITY=sockcm python \
-    send-recv-core.py --server-dev 2 --client-dev 1 \
-    --object_type rmm --reuse-alloc --n-bytes 1GB
-
-
-Benchmark send receive on one machine (UCX >= 1.10):
+Benchmark send receive on one machine:
 UCX_TLS=tcp,cuda_copy,cuda_ipc python send-recv-core.py \
         --server-dev 2 --client-dev 1 --object_type rmm \
         --reuse-alloc --n-bytes 1GB
 
 
-Benchmark send receive on two machines (IB testing, UCX < 1.10):
-# server process
-UCX_NET_DEVICES=mlx5_0:1 UCX_TLS=tcp,sockcm,cuda_copy,rc \
-    UCX_SOCKADDR_TLS_PRIORITY=sockcm python send-recv-core.py \
-    --server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
-    --n-bytes 1GB --server-only --port 13337 --n-iter 100
-
-# client process
-UCX_NET_DEVICES=mlx5_2:1 UCX_TLS=tcp,sockcm,cuda_copy,rc \
-    UCX_SOCKADDR_TLS_PRIORITY=sockcm python send-recv-core.py \
-    --server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
-    --n-bytes 1GB --client-only --server-address SERVER_IP --port 13337 \
-    --n-iter 100
-
-
-Benchmark send receive on two machines (IB testing, UCX >= 1.10):
+Benchmark send receive on two machines (IB testing):
 # server process
 UCX_MAX_RNDV_RAILS=1 UCX_TLS=tcp,cuda_copy,rc python send-recv-core.py \
         --server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
@@ -159,9 +138,7 @@ def _am_recv_handle(recv_obj, exception, ep):
     def _listener_handler(conn_request, msg):
         global ep
         ep = ucx_api.UCXEndpoint.create_from_conn_request(
-            worker,
-            conn_request,
-            endpoint_error_handling=ucx_api.get_ucx_version() >= (1, 10, 0),
+            worker, conn_request, endpoint_error_handling=True,
         )
 
         # Wireup before starting to transfer data
@@ -255,10 +232,7 @@ def client(queue, port, server_address, args):
     worker = ucx_api.UCXWorker(ctx)
     register_am_allocators(args, worker)
     ep = ucx_api.UCXEndpoint.create(
-        worker,
-        server_address,
-        port,
-        endpoint_error_handling=ucx_api.get_ucx_version() >= (1, 10, 0),
+        worker, server_address, port, endpoint_error_handling=True,
     )
 
     send_msg = xp.arange(args.n_bytes, dtype="u1")
@@ -526,10 +500,6 @@ def main():
     # if you are the client, only start the `client process`
     # otherwise, start everything
 
-    if args.enable_am and not ucp._libs.ucx_api.is_am_supported():
-        print("AM only supported in UCX >= 1.11")
-        return
-
     if not args.client_only:
         # server process
         q1 = mp.Queue()

diff --git a/benchmarks/send-recv.py b/benchmarks/send-recv.py
@@ -1,32 +1,11 @@
 """
-Benchmark send receive on one machine (UCX < 1.10):
-UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc UCX_SOCKADDR_TLS_PRIORITY=sockcm python \
-    send-recv.py --server-dev 2 --client-dev 1 \
-    --object_type rmm --reuse-alloc --n-bytes 1GB
-
-
-Benchmark send receive on one machine (UCX >= 1.10):
+Benchmark send receive on one machine:
 UCX_TLS=tcp,cuda_copy,cuda_ipc python send-recv.py \
         --server-dev 2 --client-dev 1 --object_type rmm \
         --reuse-alloc --n-bytes 1GB
 
 
-Benchmark send receive on two machines (IB testing, UCX < 1.10):
-# server process
-UCX_NET_DEVICES=mlx5_0:1 UCX_TLS=tcp,sockcm,cuda_copy,rc \
-    UCX_SOCKADDR_TLS_PRIORITY=sockcm python send-recv.py \
-    --server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
-    --n-bytes 1GB --server-only --port 13337 --n-iter 100
-
-# client process
-UCX_NET_DEVICES=mlx5_2:1 UCX_TLS=tcp,sockcm,cuda_copy,rc \
-    UCX_SOCKADDR_TLS_PRIORITY=sockcm python send-recv.py \
-    --server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
-    --n-bytes 1GB --client-only --server-address SERVER_IP --port 13337 \
-    --n-iter 100
-
-
-Benchmark send receive on two machines (IB testing, UCX >= 1.10):
+Benchmark send receive on two machines (IB testing):
 # server process
 UCX_MAX_RNDV_RAILS=1 UCX_TLS=tcp,cuda_copy,rc python send-recv.py \
         --server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
@@ -379,10 +358,6 @@ def main():
     # if you are the client, only start the `client process`
     # otherwise, start everything
 
-    if args.enable_am and not ucp._libs.ucx_api.is_am_supported():
-        print("AM only supported in UCX >= 1.11")
-        return
-
     if not args.client_only:
         # server process
         q1 = mp.Queue()

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -26,7 +26,7 @@ export HOME=$WORKSPACE
 cd $WORKSPACE
 export GIT_DESCRIBE_TAG=`git describe --tags`
 export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
-export RAPIDS_VERSION="0.23"
+export RAPIDS_VERSION="22.04"
 export TEST_UCX_MASTER=0
 
 ################################################################################

diff --git a/conda/environments/builddocs_py37.yml b/conda/environments/builddocs.yml
@@ -16,6 +16,5 @@ dependencies:
 - pandoc=<2.0.0
 - pip
 - psutil
-- libhwloc
 - ucx
-- cython
+- cython
diff --git a/debug-tests/README.md b/debug-tests/README.md
@@ -11,11 +11,11 @@ NOTE: This was moved outside of the tests directory to prevent users running pot
 
 ### Process 1
 
-> UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm /usr/local/cuda/bin/nvprof python tests/debug-testssend.py
+> UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc /usr/local/cuda/bin/nvprof python tests/debug-testssend.py
 
 ### Process 2
 
-> UCXPY_LOG_LEVEL=DEBUG UCX_LOG_LEVEL=DEBUG UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm /usr/local/cuda/bin/nvprof python tests/recv.py
+> UCXPY_LOG_LEVEL=DEBUG UCX_LOG_LEVEL=DEBUG UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc /usr/local/cuda/bin/nvprof python tests/recv.py
 
 `nvprof` is used to verify NVLINK usage and we are looking at two things primarily:
 - existence of [CUDA memcpy PtoP]

diff --git a/debug-tests/multi-node-workers.sh b/debug-tests/multi-node-workers.sh
@@ -3,8 +3,7 @@ set -e
 #export UCX_LOG_LEVEL=DEBUG
 #export UCXPY_LOG_LEVEL=DEBUG
 export UCX_MEMTYPE_CACHE=n
-export UCX_TLS=tcp,sockcm,cuda_copy,rc
-export UCX_SOCKADDR_TLS_PRIORITY=sockcm
+export UCX_TLS=tcp,cuda_copy,rc
 
 UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=0 python recv.py 2>&1 | tee /tmp/recv-log-0.txt &
 UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=1 python recv.py 2>&1 | tee /tmp/recv-log-1.txt &

diff --git a/debug-tests/scheduler.sh b/debug-tests/scheduler.sh
@@ -4,7 +4,6 @@ set -e
 #export UCX_LOG_LEVEL=TRACE
 # export UCXPY_LOG_LEVEL=DEBUG
 export UCX_MEMTYPE_CACHE=n
-export UCX_TLS=tcp,sockcm,cuda_copy,rc,cuda_ipc
-export UCX_SOCKADDR_TLS_PRIORITY=sockcm
+export UCX_TLS=tcp,cuda_copy,rc,cuda_ipc
 
 UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=0 python send.py 2>&1 | tee /tmp/send-log.txt &
diff --git a/debug-tests/test_endpoint_error_callback.py b/debug-tests/test_endpoint_error_callback.py
@@ -1,6 +1,6 @@
 # This test requires InfiniBand, to run:
 # UCXPY_IFNAME=ib0 UCX_NET_DEVICES=mlx5_0:1 \
-# UCX_TLS=rc,tcp,sockcm,cuda_copy UCX_SOCKADDR_TLS_PRIORITY=sockcm \
+# UCX_TLS=rc,tcp,cuda_copy \
 # py.test --cache-clear tests/debug-tests/test_endpoint_error_callback.py
 import asyncio
 import multiprocessing
@@ -134,12 +134,6 @@ def cupy_obj():
 )
 @pytest.mark.parametrize("endpoint_error_handling", [True, False])
 def test_send_recv_cu(endpoint_error_handling):
-    if endpoint_error_handling is True and ucp.get_ucx_version() < (1, 11, 0):
-        pytest.skip(
-            "Endpoint error handling support for all transports is only available "
-            "in UCX >= 1.11.0"
-        )
-
     base_env = os.environ
     env_client = base_env.copy()
     # grab first two devices

diff --git a/debug-tests/test_send_recv_many_workers.py b/debug-tests/test_send_recv_many_workers.py
@@ -14,12 +14,10 @@
 from distributed.protocol import to_serialize
 
 import ucp
-from ucp._libs.topological_distance import TopologicalDistance
 
 cupy = pytest.importorskip("cupy")
 rmm = pytest.importorskip("rmm")
 
-UCX_110 = ucp.get_ucx_version() >= (1, 10, 0)
 TRANSFER_ITERATIONS = 5
 EP_ITERATIONS = 3
 
@@ -29,15 +27,6 @@ def get_environment_variables(cuda_device_index):
 
     env["CUDA_VISIBLE_DEVICES"] = str(cuda_device_index)
 
-    if not UCX_110:
-        tls = env.get("UCX_TLS")
-        if tls is not None and "rc" in tls:
-            td = TopologicalDistance()
-            closest_openfabrics = td.get_cuda_distances_from_device_index(
-                cuda_device_index, "openfabrics"
-            )
-            env["UCX_NET_DEVICES"] = closest_openfabrics[0]["name"] + ":1"
-
     return env
 
 
@@ -47,12 +36,6 @@ def restore_environment_variables(cuda_visible_devices, ucx_net_devices):
     else:
         os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
 
-    if not UCX_110:
-        if ucx_net_devices is None:
-            os.environ.pop("UCX_NET_DEVICES")
-        else:
-            os.environ["UCX_NET_DEVICES"] = ucx_net_devices
-
 
 async def get_ep(name, port):
     addr = ucp.get_address()

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -8,7 +8,6 @@ RUN apt-get update && \
         g++ \
         git \
         libcap2 \
-        libhwloc-dev \
         libnuma-dev \
         libtool \
         make \