diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 00000000..83afd5d9 --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1,19 @@ +{ + "build": { + "dockerfile": "docker/dev-cuda12.1.dockerfile", + "context": "." + }, + "runArgs": ["--gpus", "all"], + "features": { + "ghcr.io/devcontainers/features/github-cli:1": {}, + }, + "customizations": { + "vscode": { + "extensions": ["ms-vscode.cmake-tools"] + } + }, + "remoteEnv": { + "OMPI_ALLOW_RUN_AS_ROOT": "1", + "OMPI_ALLOW_RUN_AS_ROOT_CONFIRM": "1" + } +} diff --git a/.gitignore b/.gitignore index af2117f7..7e4b82bc 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ __pycache__ .*.swp .idea/ *.so +.venv/ diff --git a/docker/dev-cuda12.1.dockerfile b/docker/dev-cuda12.1.dockerfile index 70fe684c..7f9ca1f4 100644 --- a/docker/dev-cuda12.1.dockerfile +++ b/docker/dev-cuda12.1.dockerfile @@ -13,7 +13,8 @@ WORKDIR ${MSCCLPP_SRC_DIR} ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local + tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \ + rm -rf ${CMAKE_HOME}.tar.gz ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" # Install pytest & dependencies diff --git a/docs/setup_example.ipynb b/docs/setup_example.ipynb new file mode 100644 index 00000000..16546ac3 --- /dev/null +++ b/docs/setup_example.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation.\n", + "Licensed under the MIT license.\n", + "\n", + "The following example demonstrates how to initialize the MSCCL++ library and perform necessary setup for communicating from GPU kernels. First we define a function for registering memory, making connections and creating channels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import mscclpp\n", + "\n", + "def setup_channels(comm, memory, proxy_service):\n", + " # Register the memory with the communicator\n", + " reg_mem = comm.register_memory(memory.data.ptr, memory.nbytes, mscclpp.Transport.CudaIpc)\n", + "\n", + " # Create connections to all other ranks and exchange registered memories\n", + " connections = []\n", + " remote_memories = []\n", + " for r in range(comm.bootstrap.size):\n", + " if r == comm.bootstrap.rank: # Don't connect to self\n", + " continue\n", + " connections.append(comm.connect(r, 0, mscclpp.Transport.CudaIpc))\n", + " comm.send_memory(reg_mem, r, 0)\n", + " remote_mem = comm.recv_memory(r, 0)\n", + " remote_memories.append(remote_mem)\n", + "\n", + " # Both connections and received remote memories are returned as futures,\n", + " # so we wait for them to complete and unwrap them.\n", + " connections = [conn.get() for conn in connections]\n", + " remote_memories = [mem.get() for mem in remote_memories]\n", + "\n", + " # Finally, create proxy channels for each connection\n", + " proxy_channels = [mscclpp.SimpleProxyChannel(\n", + " proxy_service.proxy_channel(proxy_service.build_and_add_semaphore(comm, conn)),\n", + " proxy_service.add_memory(remote_memories[i]),\n", + " proxy_service.add_memory(reg_mem),\n", + " ) for i, conn in enumerate(connections)]\n", + "\n", + " return proxy_channels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to write the top-level code for each rank." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import cupy as cp\n", + "\n", + "def run(rank, world_size, if_ip_port_trio):\n", + " # Use the right GPU for this rank\n", + " cp.cuda.Device(rank).use()\n", + " \n", + " # Allocate memory on the GPU\n", + " memory = cp.zeros(1024, dtype=cp.int32)\n", + "\n", + " # Initialize a bootstrapper using a known interface/IP/port trio for the root rank\n", + " boot = mscclpp.TcpBootstrap.create(rank, world_size)\n", + " boot.initialize(if_ip_port_trio)\n", + "\n", + " # Create a communicator for the processes in the bootstrapper\n", + " comm = mscclpp.Communicator(boot)\n", + "\n", + " # Create a proxy service, which enables GPU kernels to use connections\n", + " proxy_service = mscclpp.ProxyService()\n", + "\n", + " if rank == 0:\n", + " print(\"Setting up channels\")\n", + " proxy_channels = setup_channels(comm, memory, proxy_service)\n", + "\n", + " if rank == 0:\n", + " print(\"Starting proxy service\")\n", + " proxy_service.start_proxy()\n", + "\n", + " # This is where we could launch a GPU kernel that uses proxy_channels[i].device_handle\n", + " # to initiate communication. See include/mscclpp/proxy_channel_device.hpp for details.\n", + " if rank == 0:\n", + " print(\"GPU kernels that use the proxy go here.\")\n", + "\n", + " if rank == 0:\n", + " print(f\"Stopping proxy service\")\n", + " proxy_service.stop_proxy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, to test the code we can run each process using the `multiprocessing` package." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting up channels\n", + "Starting proxy service\n", + "GPU kernels that use the proxy go here.\n", + "Stopping proxy service\n", + "\n", + "Starting proxy service\n", + "GPU kernels that use the proxy go here.\n", + "Stopping proxy service\n" + ] + } + ], + "source": [ + "import multiprocessing as mp\n", + "\n", + "world_size = 2\n", + "processes = [mp.Process(target=run, args=(rank, world_size, \"eth0:localhost:50051\")) for rank in range(world_size)]\n", + "for p in processes:\n", + " p.start()\n", + "for p in processes:\n", + " p.join()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 03eb8cc6..d9596803 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -32,15 +32,15 @@ class Bootstrap { public: Bootstrap(){}; virtual ~Bootstrap() = default; - virtual int getRank() = 0; - virtual int getNranks() = 0; + virtual int rank() = 0; + virtual int size() = 0; virtual void send(void* data, int size, int peer, int tag) = 0; - virtual void recv(void* data, int size, int peer, int tag) = 0; + [[nodiscard]] virtual std::future recv(void* data, int size, int peer, int tag) = 0; virtual void allGather(void* allData, int size) = 0; virtual void barrier() = 0; void send(const std::vector& data, int peer, int tag); - void recv(std::vector& data, int peer, int tag); + std::future> recv(int peer, int tag); }; /// A native implementation of 
the bootstrap using TCP sockets. @@ -73,10 +73,10 @@ class TcpBootstrap : public Bootstrap { void initialize(const std::string& ifIpPortTrio, int64_t timeoutSec = 30); /// Return the rank of the process. - int getRank() override; + int rank() override; /// Return the total number of ranks. - int getNranks() override; + int size() override; /// Send data to another process. /// @@ -98,7 +98,8 @@ class TcpBootstrap : public Bootstrap { /// @param size The size of the data to receive. /// @param peer The rank of the process to receive the data from. /// @param tag The tag to receive the data with. - void recv(void* data, int size, int peer, int tag) override; + /// @return A future that will be ready when the data has been received. + [[nodiscard]] std::future<void> recv(void* data, int size, int peer, int tag) override; /// Gather data from all processes. /// @@ -329,17 +330,17 @@ class RegisteredMemory { /// Get the size of the memory block. /// /// @return The size of the memory block. - size_t size(); + size_t size() const; /// Get the transport flags associated with the memory block. /// /// @return The transport flags associated with the memory block. - TransportFlags transports(); + TransportFlags transports() const; /// Serialize the RegisteredMemory object to a vector of characters. /// /// @return A vector of characters representing the serialized RegisteredMemory object. - std::vector<char> serialize(); + std::vector<char> serialize() const; /// Deserialize a RegisteredMemory object from a vector of characters. /// @@ -370,12 +371,12 @@ class Endpoint { /// Get the transport used. /// /// @return The transport used. - Transport transport(); + Transport transport() const; /// Serialize the Endpoint object to a vector of characters. /// /// @return A vector of characters representing the serialized Endpoint object. - std::vector<char> serialize(); + std::vector<char> serialize() const; /// Deserialize a Endpoint object from a vector of characters. 
/// @@ -522,60 +523,14 @@ class Context { friend class Endpoint; }; -/// A base class for objects that can be set up during @ref Communicator::setup(). -struct Setuppable { - /// Called inside @ref Communicator::setup() before any call to @ref endSetup() of any @ref Setuppable object that is - /// being set up within the same @ref Communicator::setup() call. - /// - /// @param bootstrap A shared pointer to the bootstrap implementation. - virtual void beginSetup(std::shared_ptr<Bootstrap> bootstrap); - - /// Called inside @ref Communicator::setup() after all calls to @ref beginSetup() of all @ref Setuppable objects that - /// are being set up within the same @ref Communicator::setup() call. - /// - /// @param bootstrap A shared pointer to the bootstrap implementation. - virtual void endSetup(std::shared_ptr<Bootstrap> bootstrap); -}; - -/// A non-blocking future that can be used to check if a value is ready and retrieve it. -template <typename T> -class NonblockingFuture { - std::shared_future<T> future; - - public: - /// Default constructor. - NonblockingFuture() = default; - - /// Constructor that takes a shared future and moves it into the NonblockingFuture. - /// - /// @param future The shared future to move. - NonblockingFuture(std::shared_future<T>&& future) : future(std::move(future)) {} - - /// Check if the value is ready to be retrieved. - /// - /// @return True if the value is ready, false otherwise. - bool ready() const { return future.wait_for(std::chrono::seconds(0)) == std::future_status::ready; } - - /// Get the value. - /// - /// @return The value. - /// - /// @throws Error if the value is not ready. - T get() const { - if (!ready()) throw Error("NonblockingFuture::get() called before ready", ErrorCode::InvalidUsage); - return future.get(); - } -}; - /// A class that sets up all registered memories and connections between processes. /// /// A typical way to use this class: -/// 1. Call @ref connectOnSetup() to declare connections between the calling process with other processes. +/// 1. 
Call @ref connect() to declare connections between the calling process and other processes. /// 2. Call @ref registerMemory() to register memory regions that will be used for communication. -/// 3. Call @ref sendMemoryOnSetup() or @ref recvMemoryOnSetup() to send/receive registered memory regions to/from +/// 3. Call @ref sendMemory() or @ref recvMemory() to send/receive registered memory regions to/from /// other processes. -/// 4. Call @ref setup() to set up all registered memories and connections declared in the previous steps. -/// 5. Call @ref NonblockingFuture::get() to get the registered memory regions received from other +/// 4. Call @ref std::future::get() to get the registered memory regions received from other /// processes. -/// 6. All done; use connections and registered memories to build channels. +/// 5. All done; use connections and registered memories to build channels. /// @@ -608,30 +563,23 @@ class Communicator { /// @return RegisteredMemory A handle to the buffer. RegisteredMemory registerMemory(void* ptr, size_t size, TransportFlags transports); - /// Send information of a registered memory to the remote side on setup. - /// - /// This function registers a send to a remote process that will happen by a following call of @ref setup(). The send - /// will carry information about a registered memory on the local process. + /// Send information of a registered memory to the remote side. /// /// @param memory The registered memory buffer to send information about. /// @param remoteRank The rank of the remote process. /// @param tag The tag to use for identifying the send. - void sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag); + void sendMemory(RegisteredMemory memory, int remoteRank, int tag); - /// Receive memory on setup. - /// - /// This function registers a receive from a remote process that will happen by a following call of @ref setup(). The - /// receive will carry information about a registered memory on the remote process. + /// Receive memory. 
/// /// @param remoteRank The rank of the remote process. /// @param tag The tag to use for identifying the receive. - /// @return NonblockingFuture<RegisteredMemory> A non-blocking future of registered memory. - NonblockingFuture<RegisteredMemory> recvMemoryOnSetup(int remoteRank, int tag); + /// @return std::future<RegisteredMemory> A future of registered memory. + std::future<RegisteredMemory> recvMemory(int remoteRank, int tag); - /// Connect to a remote rank on setup. + /// Connect to a remote rank. /// - /// This function only prepares metadata for connection. The actual connection is made by a following call of - /// @ref setup(). Note that this function is two-way and a connection from rank `i` to remote rank `j` needs + /// Note that this function is two-way and a connection from rank `i` to remote rank `j` needs /// to have a counterpart from rank `j` to rank `i`. Note that with IB, buffers are registered at a page level and if /// a buffer is spread through multiple pages and do not fully utilize all of them, IB's QP has to register for all /// involved pages. This potentially has security risks if the connection's accesses are given to a malicious process. @@ -639,9 +587,8 @@ class Communicator { /// @param remoteRank The rank of the remote process. /// @param tag The tag of the connection for identifying it. /// @param config The configuration for the local endpoint. - /// @return NonblockingFuture<std::shared_ptr<Connection>> A non-blocking future of shared pointer - /// to the connection. - NonblockingFuture<std::shared_ptr<Connection>> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); + /// @return std::future<std::shared_ptr<Connection>> A future of shared pointer to the connection. + std::future<std::shared_ptr<Connection>> connect(int remoteRank, int tag, EndpointConfig localConfig); /// Get the remote rank a connection is connected to. /// @@ -655,18 +602,6 @@ class Communicator { /// @return The tag the connection was made with. int tagOf(const Connection& connection); - /// Add a custom Setuppable object to a list of objects to be setup later, when @ref setup() is called. 
- /// - /// @param setuppable A shared pointer to the Setuppable object. - void onSetup(std::shared_ptr setuppable); - - /// Setup all objects that have registered for setup. - /// - /// This includes previous calls of @ref sendMemoryOnSetup(), @ref recvMemoryOnSetup(), @ref connectOnSetup(), and - /// @ref onSetup(). It is allowed to call this function multiple times, where the n-th call will only setup objects - /// that have been registered after the (n-1)-th call. - void setup(); - private: // The interal implementation. struct Impl; diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp index 7ad3ec6b..bd42d84a 100644 --- a/include/mscclpp/semaphore.hpp +++ b/include/mscclpp/semaphore.hpp @@ -30,7 +30,7 @@ template