PR #21708: NUMA-pin host memory buffers for D2H/H2D transfers
Imported from GitHub PR #21708

This ensures that the pinned host buffers used for transfers between host and device are allocated on the NUMA node closest to the device. It had a previous life as #15216.

In a benchmark that triggers large, concurrent copies from all devices to the host, the achieved D2H throughput is around 33 GiB/s with NUMA pinning on a DGX H100 node (2x CPU, 8x H100). Without pinning, the same benchmark achieves around 13.5 GiB/s.

While it is already possible to achieve correct NUMA pinning in process-per-GPU and process-per-NUMA-node configurations using `numactl` or similar, achieving correct pinning in a process-per-node configuration requires logic inside XLA. A sketch of the allocation pattern follows below.
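
For orientation before the diff, here is a minimal sketch of the allocation pattern this change introduces, assuming a valid CUDA context is current and that the device's NUMA node has already been read from sysfs. The helper names below are illustrative only; the real logic lives in `HostAllocate`/`HostDeallocate` in `cuda_executor.cc`.

```cpp
#include <cuda.h>
#include "tsl/platform/numa.h"

// Illustrative sketch (not part of the change): allocate host memory on the
// device's NUMA node, then page-lock ("pin") it with the CUDA driver.
void* AllocateNumaPinnedHostBuffer(int numa_node, size_t size) {
  // 256-byte alignment matches what the CUDA driver guarantees for its own
  // host allocations.
  void* buffer =
      tsl::port::NUMAMalloc(numa_node, size, /*minimum_alignment=*/256);
  if (buffer == nullptr) return nullptr;
  // PORTABLE makes the registration visible to all CUDA contexts, enabling
  // fast async D2H/H2D copies from any device.
  if (cuMemHostRegister(buffer, size, CU_MEMHOSTREGISTER_PORTABLE) !=
      CUDA_SUCCESS) {
    tsl::port::NUMAFree(buffer, size);
    return nullptr;
  }
  return buffer;
}

void FreeNumaPinnedHostBuffer(void* buffer, size_t size) {
  cuMemHostUnregister(buffer);        // Undo the page-locking first...
  tsl::port::NUMAFree(buffer, size);  // ...then release the NUMA-bound pages.
}
```

When the NUMA node is unknown (`tsl::port::kNUMANoAffinity`), the change falls back to the existing `cuMemHostAlloc` path, as shown in the diff.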
Copybara import of the project:

--
0eab66c by Olli Lupton <[email protected]>:

NUMA-pin host memory buffers for D2H/H2D transfers

--
57a4664 by Olli Lupton <[email protected]>:

256 byte alignment for host allocations when NUMA is not enabled

--
ad2895a by Olli Lupton <[email protected]>:

Address review comments

--
629777e by Olli Lupton <[email protected]>:

std::string_view -> absl::string_view

--
21587a5 by Olli Lupton <[email protected]>:

Apply @beckerhe's suggested Bazel changes

--
175c5f6 by Olli Lupton <[email protected]>:

add missing dependency

Merging this change closes #21708

FUTURE_COPYBARA_INTEGRATE_REVIEW=#21708 from olupton:numa 175c5f6
PiperOrigin-RevId: 722688719
olupton authored and Google-ML-Automation committed Feb 5, 2025
1 parent 383d9bd commit d261874
Showing 8 changed files with 195 additions and 72 deletions.
142 changes: 95 additions & 47 deletions xla/stream_executor/cuda/cuda_executor.cc
@@ -461,7 +461,32 @@ std::string GetPCIBusID(CUdevice device) {
LOG(ERROR) << "PCI bus id is not null terminated.";
return "";
}
return std::string(raw_pci_bus_id.data());
// Lower the hex characters to match sysfs.
return absl::AsciiStrToLower(absl::string_view(raw_pci_bus_id.data()));
}

bool HostRegister(Context* context, void* location, uint64_t size) {
ScopedActivateContext activation(context);
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE));
if (!status.ok()) {
LOG(ERROR) << "error registering host memory at " << location << ": "
<< status;
return false;
}
return true;
}

bool HostUnregister(Context* context, void* location) {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemHostUnregister(location));
if (!status.ok()) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
<< status;
return false;
}
return true;
}

// Allocates memory on the GPU device.
@@ -502,34 +527,69 @@ void DeviceDeallocate(Context* context, void* location) {
}

// Allocates memory on the host.
absl::StatusOr<void*> HostAllocate(Context* context, uint64_t bytes) {
ScopedActivateContext activation(context);
void* host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE)));
return host_mem;
absl::StatusOr<void*> HostAllocate(Context* context, int numa_node,
uint64_t size) {
if (numa_node != tsl::port::kNUMANoAffinity) {
// CUDA programming guide: "Any address of a variable ... returned by one
// of the memory allocation routines from the driver ... API is always
// aligned to at least 256 bytes."
auto* buffer =
tsl::port::NUMAMalloc(numa_node, size, /* minimum_alignment=*/256);
if (buffer == nullptr && size > 0) {
return absl::InternalError(absl::StrFormat(
"Failed to allocate host memory of size %d pinned to NUMA node %d",
size, numa_node));
}
if (size > 0 && !HostRegister(context, buffer, size)) {
tsl::port::NUMAFree(buffer, size);
return absl::InternalError(
absl::StrFormat("Failed to register host memory of size %d pinned to "
"NUMA node %d with the GPU driver",
size, numa_node));
}
return buffer;
} else {
ScopedActivateContext activation(context);
void* buffer = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use
// model.
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemHostAlloc(&buffer, size, CU_MEMHOSTALLOC_PORTABLE)));
if (!buffer && size > 0) {
return absl::InternalError(absl::StrFormat(
"Failed to allocate pinned host memory of size %d", size));
}
return buffer;
}
}

// Deallocates memory allocated via HostAllocate.
void HostDeallocate(Context* context, void* location) {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemFreeHost(location));
if (!status.ok()) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< status;
void HostDeallocate(Context* context, int numa_node, void* location,
uint64_t size) {
if (numa_node != tsl::port::kNUMANoAffinity) {
if (size > 0) {
HostUnregister(context, location);
}
tsl::port::NUMAFree(location, size);
} else {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemFreeHost(location));
if (!status.ok()) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< status;
}
}
}

// Creates a MemoryAllocation wrapping the given host buffer.
absl::StatusOr<std::unique_ptr<MemoryAllocation>> AllocateHostMemory(
CudaContext* cuda_context, uint64_t size) {
TF_ASSIGN_OR_RETURN(void* ptr, HostAllocate(cuda_context, size));
CudaContext* cuda_context, int numa_node, uint64_t size) {
TF_ASSIGN_OR_RETURN(void* ptr, HostAllocate(cuda_context, numa_node, size));
VLOG(2) << "allocated " << ptr << " for context " << cuda_context << " of "
<< size << " bytes of host memory";
return std::make_unique<GenericMemoryAllocation>(
ptr, size, [cuda_context](void* location, uint64_t size) {
HostDeallocate(cuda_context, location);
ptr, size, [cuda_context, numa_node](void* location, uint64_t size) {
HostDeallocate(cuda_context, numa_node, location, size);
VLOG(2) << "deallocated collective memory at " << location
<< " for context " << cuda_context;
});
@@ -622,7 +682,7 @@ CudaExecutor::CreateMemoryAllocator(MemoryType type) {
});
} else if (type == MemoryType::kHost) {
return std::make_unique<GenericMemoryAllocator>([this](uint64_t size) {
return AllocateHostMemory(cuda_context_, size);
return AllocateHostMemory(cuda_context_, numa_node_, size);
});
}
return absl::UnimplementedError(
@@ -636,6 +696,12 @@ absl::Status CudaExecutor::Init() {
cuda_context_ = context;
TF_RETURN_IF_ERROR(GetComputeCapability(&cc_major_, &cc_minor_, device_));
TF_ASSIGN_OR_RETURN(delay_kernels_supported_, DelayKernelIsSupported());
numa_node_ = ReadNumaNode(GetPCIBusID(device_), device_ordinal())
.value_or(tsl::port::kNUMANoAffinity);
if (numa_node_ == tsl::port::kNUMANoAffinity) {
VLOG(2) << "Could not determine NUMA node of device ordinal "
<< device_ordinal();
}
return absl::OkStatus();
}

@@ -936,7 +1002,7 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
return DeviceMemoryBase(result.value(), size);
} else if (memory_space ==
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
auto result = HostAllocate(cuda_context_, size);
auto result = HostAllocate(cuda_context_, numa_node_, size);
if (!result.ok()) {
return DeviceMemoryBase(nullptr, 0);
}
@@ -948,7 +1014,7 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {

absl::StatusOr<std::unique_ptr<MemoryAllocation>>
CudaExecutor::HostMemoryAllocate(uint64_t size) {
return AllocateHostMemory(cuda_context_, size);
return AllocateHostMemory(cuda_context_, numa_node_, size);
}

void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
@@ -959,7 +1025,7 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
}
auto memory_space = status_or_memory_space.value();
if (memory_space == MemoryType::kHost) {
HostDeallocate(cuda_context_, mem->opaque());
HostDeallocate(cuda_context_, numa_node_, mem->opaque(), mem->size());
} else {
DeviceDeallocate(cuda_context_, mem->opaque());
}
@@ -972,30 +1038,12 @@ bool CudaExecutor::SynchronizeAllActivity() {
bool CudaExecutor::HostMemoryRegister(void* location, uint64_t size) {
VLOG(1) << "Called StreamExecutor::HostMemoryRegister(data=" << location
<< ")";

std::unique_ptr<ActivateContext> activation = Activate();
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE));
if (!status.ok()) {
LOG(ERROR) << "error registering host memory at " << location << ": "
<< status;
return false;
}
return true;
return HostRegister(cuda_context_, location, size);
}

bool CudaExecutor::HostMemoryUnregister(void* location) {
VLOG(1) << "Called StreamExecutor::HostUnregister(data=" << location << ")";

std::unique_ptr<ActivateContext> activation = Activate();
auto status = cuda::ToStatus(cuMemHostUnregister(location));
if (!status.ok()) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
<< status;
return false;
}
return true;
return HostUnregister(cuda_context_, location);
}

absl::Status CudaExecutor::SynchronousMemZero(DeviceMemoryBase* location,
@@ -1235,14 +1283,14 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {

{
std::string pci_bus_id = GetPCIBusID(device);

// Lower the hex characters to match sysfs.
pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
desc.set_pci_bus_id(pci_bus_id);

// Read the NUMA node corresponding to the PCI bus ID out of sysfs.
int numa_node = ReadNumaNode(pci_bus_id, device_ordinal);
desc.set_numa_node(numa_node);
std::optional<int> numa_node = ReadNumaNode(pci_bus_id, device_ordinal);
// If the kernel reports -1, adjust to 0; leave as -1 if no value could be
// obtained.
desc.set_numa_node(numa_node.has_value() ? std::max(0, *numa_node)
: tsl::port::kNUMANoAffinity);
}

{
3 changes: 3 additions & 0 deletions xla/stream_executor/cuda/cuda_executor.h
@@ -191,6 +191,9 @@ class CudaExecutor : public GpuExecutor {
// The minor version of the compute capability for device_.
int cc_minor_;

// The NUMA node of the CPU closest to device_
int numa_node_;

// Reader/writer lock for mutable data structures on this object.
absl::Mutex mu_;

1 change: 1 addition & 0 deletions xla/stream_executor/gpu/BUILD
@@ -733,6 +733,7 @@ xla_test(
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings",
"@com_google_googletest//:gtest_main",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:statusor",
"@tsl//tsl/platform:test",
] + if_cuda([
52 changes: 52 additions & 0 deletions xla/stream_executor/gpu/gpu_executor_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/tsl/platform/statusor.h"
#include "tsl/platform/numa.h"

namespace stream_executor {

@@ -45,6 +46,17 @@ TEST_F(GetPointerMemorySpaceTest, Host) {
EXPECT_EQ(memory_space, MemoryType::kHost);
}

TEST_F(GetPointerMemorySpaceTest, HostAllocatedWithMemoryKind) {
StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value();
DeviceMemoryBase host_ptr = executor->Allocate(
64, static_cast<int64_t>(stream_executor::MemoryType::kHost));
EXPECT_FALSE(host_ptr.is_null());
TF_ASSERT_OK_AND_ASSIGN(MemoryType memory_space,
executor->GetPointerMemorySpace(host_ptr.opaque()))
EXPECT_EQ(memory_space, MemoryType::kHost);
executor->Deallocate(&host_ptr);
}

TEST_F(GetPointerMemorySpaceTest, Device) {
StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value();
auto mem = executor->Allocate(64);
@@ -63,4 +75,44 @@ TEST_F(GetPointerMemorySpaceTest, Collective) {
executor->Deallocate(&mem);
}

using HostMemoryAllocateTest = GpuExecutorTest;

TEST_F(HostMemoryAllocateTest, Numa) {
Platform* platform = GetPlatform();
constexpr uint64_t kSize = 1024;
const int num_devices = platform->VisibleDeviceCount();
for (int device = 0; device < num_devices; ++device) {
TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
platform->ExecutorForDevice(device));
ASSERT_TRUE(executor);
const DeviceDescription& device_desc = executor->GetDeviceDescription();
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> host_ptr,
executor->HostMemoryAllocate(kSize));
ASSERT_TRUE(host_ptr);
EXPECT_NE(host_ptr->opaque(), nullptr);
const int numa_node = tsl::port::NUMAGetMemAffinity(host_ptr->opaque());
if (numa_node == tsl::port::kNUMANoAffinity) {
// Could be because `executor` could not determine its own NUMA node, in
// which case numa_node() will be -1 or 0, depending on the failure mode.
EXPECT_LE(device_desc.numa_node(), 0);
EXPECT_GE(device_desc.numa_node(), -1);
} else {
EXPECT_EQ(device_desc.numa_node(), numa_node);
}
}
}

TEST_F(HostMemoryAllocateTest, TooBig) {
Platform* platform = GetPlatform();
constexpr uint64_t kTooBig = 1125899906842624; // 1 PiB
const int num_devices = platform->VisibleDeviceCount();
for (int device = 0; device < num_devices; ++device) {
TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
platform->ExecutorForDevice(device));
ASSERT_TRUE(executor);
auto should_fail = executor->HostMemoryAllocate(kTooBig);
EXPECT_FALSE(should_fail.ok());
}
}

} // namespace stream_executor
26 changes: 12 additions & 14 deletions xla/stream_executor/gpu/read_numa_node.cc
@@ -26,17 +26,17 @@ limitations under the License.

namespace stream_executor::gpu {

int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) {
if (!tsl::port::NUMAEnabled()) {
// NUMA is not currently enabled. Return node 0.
return 0;
std::optional<int> ReadNumaNode(absl::string_view pci_bus_id,
int device_ordinal) {
if (tsl::port::NUMANumNodes() < 2) {
// NUMA support is not currently enabled, or there is only one node.
return tsl::port::kNUMANoAffinity;
}
VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
static const int kUnknownNumaNode = -1;

if (pci_bus_id.empty()) {
LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
return kUnknownNumaNode;
return std::nullopt;
}

std::string filename =
@@ -49,12 +49,13 @@ int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) {
if (file == nullptr) {
LOG(INFO) << "could not open file to read NUMA node: " << filename
<< "\nYour kernel may have been built without NUMA support.";
return kUnknownNumaNode;
return std::nullopt;
}

std::string content;
char buf[32];
size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
fclose(file);
buf[did_read] = '\0';
content = buf;

@@ -63,24 +64,21 @@
if (value < 0) { // See http://b/18228951 for details on this path.
LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
<< value
<< "), but there must be at least one NUMA node"
", so returning NUMA node zero."
<< "), but there must be at least one NUMA node so this will "
" be massaged to NUMA node zero in some places."
" See more at "
"https://github.com/torvalds/linux/blob/v6.0/Documentation/"
"ABI/testing/sysfs-bus-pci#L344-L355";
fclose(file);
return 0;
return tsl::port::kNUMANoAffinity;
}
fclose(file);
return value;
}

LOG(WARNING)
<< "could not convert SysFS file contents to integral NUMA node value: "
<< content;

fclose(file);
return kUnknownNumaNode;
return std::nullopt;
}

} // namespace stream_executor::gpu
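
For reference, the sysfs entry being read here is `/sys/bus/pci/devices/<pci_bus_id>/numa_node` (see the kernel ABI document linked in the log message above). The sketch below is illustrative only, since the actual filename construction is elided from the hunk; `ReadNumaNodeSketch` is a hypothetical name, not part of the change.

```cpp
#include <cstdio>
#include <optional>
#include <string>

// Illustrative sketch: look up the NUMA node of a PCI device via sysfs,
// given a lowercase PCI bus id such as "0000:3b:00.0".
std::optional<int> ReadNumaNodeSketch(const std::string& pci_bus_id) {
  const std::string filename =
      "/sys/bus/pci/devices/" + pci_bus_id + "/numa_node";
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) return std::nullopt;  // e.g. kernel built without NUMA
  int value = -1;
  const bool ok = fscanf(file, "%d", &value) == 1;
  fclose(file);
  if (!ok) return std::nullopt;
  // The kernel reports -1 when it does not know the node for this device.
  return value;
}
```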