Skip to content

Commit

Permalink
Merge pull request #835 from rapidsai/branch-0.25
Browse files Browse the repository at this point in the history
  • Loading branch information
raydouglass committed Apr 6, 2022
2 parents 25dd97f + a16f8a2 commit 6f3f750
Show file tree
Hide file tree
Showing 51 changed files with 531 additions and 1,573 deletions.
4 changes: 2 additions & 2 deletions .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ build:
- libnuma1

python:
version: 3.7
version: 3.8
install:
- method: setuptools
path: .

conda:
environment: conda/environments/builddocs_py37.yml
environment: conda/environments/builddocs.yml

sphinx:
configuration: docs/source/conf.py
9 changes: 3 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,14 @@ In order to use TCP add `tcp` to `UCX_TLS` and set `UCXPY_IFNAME` to the network

```bash
# TCP using "eth0" and CUDA support
export UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
export UCX_TLS=tcp,cuda_copy,cuda_ipc
export UCXPY_IFNAME="eth0"

# InfiniBand using "ib0" and CUDA support
export UCX_TLS=rc,sockcm,cuda_copy,cuda_ipc
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
export UCX_TLS=rc,cuda_copy,cuda_ipc
export UCXPY_IFNAME="ib0"

# TCP using "eno0" and no CUDA support
export UCX_TLS=tcp,sockcm
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
export UCX_TLS=tcp
export UCXPY_IFNAME="eno0"
```
24 changes: 1 addition & 23 deletions benchmarks/cudf-merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,13 +239,6 @@ def parse_args():
type=float,
help="Fraction of rows that matches (default 0.3)",
)
parser.add_argument(
"--net-devices",
metavar="LIST",
default=None,
type=str,
help='List of net devices to use, one for each device or "auto"',
)
parser.add_argument(
"--profile",
metavar="FILENAME",
Expand Down Expand Up @@ -275,11 +268,6 @@ def parse_args():
f"Number of chunks must be greater than 1 (chunks-per-dev: \
{args.chunks_per_dev}, devs: {args.devs})"
)
if args.net_devices == "auto":
args.net_devices = [ucp.utils.get_closest_net_devices(d) for d in args.devs]
elif args.net_devices is not None:
args.net_devices = args.net_devices.split(",")
assert len(args.net_devices) == len(args.devs)
return args


Expand All @@ -289,18 +277,8 @@ def main():
assert len(ranks) > 1
assert len(ranks) % 2 == 0

ucx_options_list = None
if args.net_devices is not None:
ucx_options_list = [
{"NET_DEVICES": args.net_devices[rank % len(args.devs)]} for rank in ranks
]

stats = run_on_local_network(
args.n_chunks,
worker,
worker_args=args,
server_address=args.server_address,
ucx_options_list=ucx_options_list,
args.n_chunks, worker, worker_args=args, server_address=args.server_address,
)

wc = stats[0]["wallclock"]
Expand Down
38 changes: 4 additions & 34 deletions benchmarks/send-recv-core.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,11 @@
"""
Benchmark send receive on one machine (UCX < 1.10):
UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc UCX_SOCKADDR_TLS_PRIORITY=sockcm python \
send-recv-core.py --server-dev 2 --client-dev 1 \
--object_type rmm --reuse-alloc --n-bytes 1GB
Benchmark send receive on one machine (UCX >= 1.10):
Benchmark send receive on one machine:
UCX_TLS=tcp,cuda_copy,cuda_ipc python send-recv-core.py \
--server-dev 2 --client-dev 1 --object_type rmm \
--reuse-alloc --n-bytes 1GB
Benchmark send receive on two machines (IB testing, UCX < 1.10):
# server process
UCX_NET_DEVICES=mlx5_0:1 UCX_TLS=tcp,sockcm,cuda_copy,rc \
UCX_SOCKADDR_TLS_PRIORITY=sockcm python send-recv-core.py \
--server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
--n-bytes 1GB --server-only --port 13337 --n-iter 100
# client process
UCX_NET_DEVICES=mlx5_2:1 UCX_TLS=tcp,sockcm,cuda_copy,rc \
UCX_SOCKADDR_TLS_PRIORITY=sockcm python send-recv-core.py \
--server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
--n-bytes 1GB --client-only --server-address SERVER_IP --port 13337 \
--n-iter 100
Benchmark send receive on two machines (IB testing, UCX >= 1.10):
Benchmark send receive on two machines (IB testing):
# server process
UCX_MAX_RNDV_RAILS=1 UCX_TLS=tcp,cuda_copy,rc python send-recv-core.py \
--server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
Expand Down Expand Up @@ -159,9 +138,7 @@ def _am_recv_handle(recv_obj, exception, ep):
def _listener_handler(conn_request, msg):
global ep
ep = ucx_api.UCXEndpoint.create_from_conn_request(
worker,
conn_request,
endpoint_error_handling=ucx_api.get_ucx_version() >= (1, 10, 0),
worker, conn_request, endpoint_error_handling=True,
)

# Wireup before starting to transfer data
Expand Down Expand Up @@ -255,10 +232,7 @@ def client(queue, port, server_address, args):
worker = ucx_api.UCXWorker(ctx)
register_am_allocators(args, worker)
ep = ucx_api.UCXEndpoint.create(
worker,
server_address,
port,
endpoint_error_handling=ucx_api.get_ucx_version() >= (1, 10, 0),
worker, server_address, port, endpoint_error_handling=True,
)

send_msg = xp.arange(args.n_bytes, dtype="u1")
Expand Down Expand Up @@ -526,10 +500,6 @@ def main():
# if you are the client, only start the `client process`
# otherwise, start everything

if args.enable_am and not ucp._libs.ucx_api.is_am_supported():
print("AM only supported in UCX >= 1.11")
return

if not args.client_only:
# server process
q1 = mp.Queue()
Expand Down
29 changes: 2 additions & 27 deletions benchmarks/send-recv.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,11 @@
"""
Benchmark send receive on one machine (UCX < 1.10):
UCX_TLS=tcp,sockcm,cuda_copy,cuda_ipc UCX_SOCKADDR_TLS_PRIORITY=sockcm python \
send-recv.py --server-dev 2 --client-dev 1 \
--object_type rmm --reuse-alloc --n-bytes 1GB
Benchmark send receive on one machine (UCX >= 1.10):
Benchmark send receive on one machine:
UCX_TLS=tcp,cuda_copy,cuda_ipc python send-recv.py \
--server-dev 2 --client-dev 1 --object_type rmm \
--reuse-alloc --n-bytes 1GB
Benchmark send receive on two machines (IB testing, UCX < 1.10):
# server process
UCX_NET_DEVICES=mlx5_0:1 UCX_TLS=tcp,sockcm,cuda_copy,rc \
UCX_SOCKADDR_TLS_PRIORITY=sockcm python send-recv.py \
--server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
--n-bytes 1GB --server-only --port 13337 --n-iter 100
# client process
UCX_NET_DEVICES=mlx5_2:1 UCX_TLS=tcp,sockcm,cuda_copy,rc \
UCX_SOCKADDR_TLS_PRIORITY=sockcm python send-recv.py \
--server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
--n-bytes 1GB --client-only --server-address SERVER_IP --port 13337 \
--n-iter 100
Benchmark send receive on two machines (IB testing, UCX >= 1.10):
Benchmark send receive on two machines (IB testing):
# server process
UCX_MAX_RNDV_RAILS=1 UCX_TLS=tcp,cuda_copy,rc python send-recv.py \
--server-dev 0 --client-dev 5 --object_type rmm --reuse-alloc \
Expand Down Expand Up @@ -379,10 +358,6 @@ def main():
# if you are the client, only start the `client process`
# otherwise, start everything

if args.enable_am and not ucp._libs.ucx_api.is_am_supported():
print("AM only supported in UCX >= 1.11")
return

if not args.client_only:
# server process
q1 = mp.Queue()
Expand Down
2 changes: 1 addition & 1 deletion ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export HOME=$WORKSPACE
cd $WORKSPACE
export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
export RAPIDS_VERSION="0.23"
export RAPIDS_VERSION="22.04"
export TEST_UCX_MASTER=0

################################################################################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,5 @@ dependencies:
- pandoc=<2.0.0
- pip
- psutil
- libhwloc
- ucx
- cython
- cython
4 changes: 2 additions & 2 deletions debug-tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ NOTE: This was moved outside of the tests directory to prevent users running pot

### Process 1

> UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm /usr/local/cuda/bin/nvprof python debug-tests/send.py
> UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc /usr/local/cuda/bin/nvprof python debug-tests/send.py
### Process 2

> UCXPY_LOG_LEVEL=DEBUG UCX_LOG_LEVEL=DEBUG UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm /usr/local/cuda/bin/nvprof python debug-tests/recv.py
> UCXPY_LOG_LEVEL=DEBUG UCX_LOG_LEVEL=DEBUG UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc /usr/local/cuda/bin/nvprof python debug-tests/recv.py
`nvprof` is used to verify NVLINK usage and we are looking at two things primarily:
- existence of [CUDA memcpy PtoP]
Expand Down
3 changes: 1 addition & 2 deletions debug-tests/multi-node-workers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ set -e
#export UCX_LOG_LEVEL=DEBUG
#export UCXPY_LOG_LEVEL=DEBUG
export UCX_MEMTYPE_CACHE=n
export UCX_TLS=tcp,sockcm,cuda_copy,rc
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
export UCX_TLS=tcp,cuda_copy,rc

UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=0 python recv.py 2>&1 | tee /tmp/recv-log-0.txt &
UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=1 python recv.py 2>&1 | tee /tmp/recv-log-1.txt &
Expand Down
3 changes: 1 addition & 2 deletions debug-tests/scheduler.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ set -e
#export UCX_LOG_LEVEL=TRACE
# export UCXPY_LOG_LEVEL=DEBUG
export UCX_MEMTYPE_CACHE=n
export UCX_TLS=tcp,sockcm,cuda_copy,rc,cuda_ipc
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
export UCX_TLS=tcp,cuda_copy,rc,cuda_ipc

UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=0 python send.py 2>&1 | tee /tmp/send-log.txt &
8 changes: 1 addition & 7 deletions debug-tests/test_endpoint_error_callback.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This test requires InfiniBand, to run:
# UCXPY_IFNAME=ib0 UCX_NET_DEVICES=mlx5_0:1 \
# UCX_TLS=rc,tcp,sockcm,cuda_copy UCX_SOCKADDR_TLS_PRIORITY=sockcm \
# UCX_TLS=rc,tcp,cuda_copy \
# py.test --cache-clear tests/debug-tests/test_endpoint_error_callback.py
import asyncio
import multiprocessing
Expand Down Expand Up @@ -134,12 +134,6 @@ def cupy_obj():
)
@pytest.mark.parametrize("endpoint_error_handling", [True, False])
def test_send_recv_cu(endpoint_error_handling):
if endpoint_error_handling is True and ucp.get_ucx_version() < (1, 11, 0):
pytest.skip(
"Endpoint error handling support for all transports is only available "
"in UCX >= 1.11.0"
)

base_env = os.environ
env_client = base_env.copy()
# grab first two devices
Expand Down
17 changes: 0 additions & 17 deletions debug-tests/test_send_recv_many_workers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@
from distributed.protocol import to_serialize

import ucp
from ucp._libs.topological_distance import TopologicalDistance

cupy = pytest.importorskip("cupy")
rmm = pytest.importorskip("rmm")

UCX_110 = ucp.get_ucx_version() >= (1, 10, 0)
TRANSFER_ITERATIONS = 5
EP_ITERATIONS = 3

Expand All @@ -29,15 +27,6 @@ def get_environment_variables(cuda_device_index):

env["CUDA_VISIBLE_DEVICES"] = str(cuda_device_index)

if not UCX_110:
tls = env.get("UCX_TLS")
if tls is not None and "rc" in tls:
td = TopologicalDistance()
closest_openfabrics = td.get_cuda_distances_from_device_index(
cuda_device_index, "openfabrics"
)
env["UCX_NET_DEVICES"] = closest_openfabrics[0]["name"] + ":1"

return env


Expand All @@ -47,12 +36,6 @@ def restore_environment_variables(cuda_visible_devices, ucx_net_devices):
else:
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices

if not UCX_110:
if ucx_net_devices is None:
os.environ.pop("UCX_NET_DEVICES")
else:
os.environ["UCX_NET_DEVICES"] = ucx_net_devices


async def get_ep(name, port):
addr = ucp.get_address()
Expand Down
1 change: 0 additions & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ RUN apt-get update && \
g++ \
git \
libcap2 \
libhwloc-dev \
libnuma-dev \
libtool \
make \
Expand Down
Loading

0 comments on commit 6f3f750

Please sign in to comment.