PR #21708: NUMA-pin host memory buffers for D2H/H2D transfers
Imported from GitHub PR #21708

This ensures that the pinned host buffers used for transfers between host and device are allocated on the NUMA node closest to the device. It had a previous life as #15216.

In a benchmark that triggers large, concurrent copies from all devices to the host, the achieved D2H throughput is around 33 GiB/s with NUMA pinning on a DGX H100 node (2x CPU, 8x H100). Without pinning, the same benchmark achieves around 13.5 GiB/s.

While it is already possible to achieve correct NUMA pinning in process-per-GPU and process-per-NUMA-node configurations using `numactl` or similar, achieving correct pinning in a process-per-node configuration requires logic inside XLA. A sketch of the allocation pattern follows below.
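
For orientation before the diff, here is a minimal sketch of the allocation pattern this change introduces, assuming a valid CUDA context is current and that the device's NUMA node has already been read from sysfs. The helper names below are illustrative only; the real logic lives in `HostAllocate`/`HostDeallocate` in `cuda_executor.cc`.

```cpp
#include <cuda.h>
#include "tsl/platform/numa.h"

// Illustrative sketch (not part of the change): allocate host memory on the
// device's NUMA node, then page-lock ("pin") it with the CUDA driver.
void* AllocateNumaPinnedHostBuffer(int numa_node, size_t size) {
  // 256-byte alignment matches what the CUDA driver guarantees for its own
  // host allocations.
  void* buffer =
      tsl::port::NUMAMalloc(numa_node, size, /*minimum_alignment=*/256);
  if (buffer == nullptr) return nullptr;
  // PORTABLE makes the registration visible to all CUDA contexts, enabling
  // fast async D2H/H2D copies from any device.
  if (cuMemHostRegister(buffer, size, CU_MEMHOSTREGISTER_PORTABLE) !=
      CUDA_SUCCESS) {
    tsl::port::NUMAFree(buffer, size);
    return nullptr;
  }
  return buffer;
}

void FreeNumaPinnedHostBuffer(void* buffer, size_t size) {
  cuMemHostUnregister(buffer);        // Undo the page-locking first...
  tsl::port::NUMAFree(buffer, size);  // ...then release the NUMA-bound pages.
}
```

When the NUMA node is unknown (`tsl::port::kNUMANoAffinity`), the change falls back to the existing `cuMemHostAlloc` path, as shown in the diff.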
Copybara import of the project:

--
0eab66c by Olli Lupton <[email protected]>:

NUMA-pin host memory buffers for D2H/H2D transfers

--
57a4664 by Olli Lupton <[email protected]>:

256 byte alignment for host allocations when NUMA is not enabled

--
ad2895a by Olli Lupton <[email protected]>:

Address review comments

--
629777e by Olli Lupton <[email protected]>:

std::string_view -> absl::string_view

--
21587a5 by Olli Lupton <[email protected]>:

Apply @beckerhe's suggested Bazel changes

--
175c5f6 by Olli Lupton <[email protected]>:

add missing dependency

Merging this change closes #21708

FUTURE_COPYBARA_INTEGRATE_REVIEW=#21708 from olupton:numa 175c5f6
PiperOrigin-RevId: 722688719
olupton authored and Google-ML-Automation committed Feb 5, 2025
1 parent 383d9bd commit d261874
Showing 8 changed files with 195 additions and 72 deletions.
142 changes: 95 additions & 47 deletions xla/stream_executor/cuda/cuda_executor.cc
@@ -461,7 +461,32 @@ std::string GetPCIBusID(CUdevice device) {
LOG(ERROR) << "PCI bus id is not null terminated.";
return "";
}
return std::string(raw_pci_bus_id.data());
// Lower the hex characters to match sysfs.
return absl::AsciiStrToLower(absl::string_view(raw_pci_bus_id.data()));
}

bool HostRegister(Context* context, void* location, uint64_t size) {
ScopedActivateContext activation(context);
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE));
if (!status.ok()) {
LOG(ERROR) << "error registering host memory at " << location << ": "
<< status;
return false;
}
return true;
}

bool HostUnregister(Context* context, void* location) {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemHostUnregister(location));
if (!status.ok()) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
<< status;
return false;
}
return true;
}

// Allocates memory on the GPU device.
@@ -502,34 +527,69 @@ void DeviceDeallocate(Context* context, void* location) {
}

// Allocates memory on the host.
absl::StatusOr<void*> HostAllocate(Context* context, uint64_t bytes) {
ScopedActivateContext activation(context);
void* host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE)));
return host_mem;
absl::StatusOr<void*> HostAllocate(Context* context, int numa_node,
uint64_t size) {
if (numa_node != tsl::port::kNUMANoAffinity) {
// CUDA programming guide: "Any address of a variable ... returned by one
// of the memory allocation routines from the driver ... API is always
// aligned to at least 256 bytes."
auto* buffer =
tsl::port::NUMAMalloc(numa_node, size, /* minimum_alignment=*/256);
if (buffer == nullptr && size > 0) {
return absl::InternalError(absl::StrFormat(
"Failed to allocate host memory of size %d pinned to NUMA node %d",
size, numa_node));
}
if (size > 0 && !HostRegister(context, buffer, size)) {
tsl::port::NUMAFree(buffer, size);
return absl::InternalError(
absl::StrFormat("Failed to register host memory of size %d pinned to "
"NUMA node %d with the GPU driver",
size, numa_node));
}
return buffer;
} else {
ScopedActivateContext activation(context);
void* buffer = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use
// model.
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemHostAlloc(&buffer, size, CU_MEMHOSTALLOC_PORTABLE)));
if (!buffer && size > 0) {
return absl::InternalError(absl::StrFormat(
"Failed to allocate pinned host memory of size %d", size));
}
return buffer;
}
}

// Deallocates memory allocated via HostAllocate.
void HostDeallocate(Context* context, void* location) {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemFreeHost(location));
if (!status.ok()) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< status;
void HostDeallocate(Context* context, int numa_node, void* location,
uint64_t size) {
if (numa_node != tsl::port::kNUMANoAffinity) {
if (size > 0) {
HostUnregister(context, location);
}
tsl::port::NUMAFree(location, size);
} else {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemFreeHost(location));
if (!status.ok()) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< status;
}
}
}

// Creates a MemoryAllocation wrapping the given host buffer.
absl::StatusOr<std::unique_ptr<MemoryAllocation>> AllocateHostMemory(
CudaContext* cuda_context, uint64_t size) {
TF_ASSIGN_OR_RETURN(void* ptr, HostAllocate(cuda_context, size));
CudaContext* cuda_context, int numa_node, uint64_t size) {
TF_ASSIGN_OR_RETURN(void* ptr, HostAllocate(cuda_context, numa_node, size));
VLOG(2) << "allocated " << ptr << " for context " << cuda_context << " of "
<< size << " bytes of host memory";
return std::make_unique<GenericMemoryAllocation>(
ptr, size, [cuda_context](void* location, uint64_t size) {
HostDeallocate(cuda_context, location);
ptr, size, [cuda_context, numa_node](void* location, uint64_t size) {
HostDeallocate(cuda_context, numa_node, location, size);
VLOG(2) << "deallocated collective memory at " << location
<< " for context " << cuda_context;
});
@@ -622,7 +682,7 @@ CudaExecutor::CreateMemoryAllocator(MemoryType type) {
});
} else if (type == MemoryType::kHost) {
return std::make_unique<GenericMemoryAllocator>([this](uint64_t size) {
return AllocateHostMemory(cuda_context_, size);
return AllocateHostMemory(cuda_context_, numa_node_, size);
});
}
return absl::UnimplementedError(
@@ -636,6 +696,12 @@ absl::Status CudaExecutor::Init() {
cuda_context_ = context;
TF_RETURN_IF_ERROR(GetComputeCapability(&cc_major_, &cc_minor_, device_));
TF_ASSIGN_OR_RETURN(delay_kernels_supported_, DelayKernelIsSupported());
numa_node_ = ReadNumaNode(GetPCIBusID(device_), device_ordinal())
.value_or(tsl::port::kNUMANoAffinity);
if (numa_node_ == tsl::port::kNUMANoAffinity) {
VLOG(2) << "Could not determine NUMA node of device ordinal "
<< device_ordinal();
}
return absl::OkStatus();
}

@@ -936,7 +1002,7 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
return DeviceMemoryBase(result.value(), size);
} else if (memory_space ==
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
auto result = HostAllocate(cuda_context_, size);
auto result = HostAllocate(cuda_context_, numa_node_, size);
if (!result.ok()) {
return DeviceMemoryBase(nullptr, 0);
}
@@ -948,7 +1014,7 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {

absl::StatusOr<std::unique_ptr<MemoryAllocation>>
CudaExecutor::HostMemoryAllocate(uint64_t size) {
return AllocateHostMemory(cuda_context_, size);
return AllocateHostMemory(cuda_context_, numa_node_, size);
}

void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
@@ -959,7 +1025,7 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
}
auto memory_space = status_or_memory_space.value();
if (memory_space == MemoryType::kHost) {
HostDeallocate(cuda_context_, mem->opaque());
HostDeallocate(cuda_context_, numa_node_, mem->opaque(), mem->size());
} else {
DeviceDeallocate(cuda_context_, mem->opaque());
}
@@ -972,30 +1038,12 @@ bool CudaExecutor::SynchronizeAllActivity() {
bool CudaExecutor::HostMemoryRegister(void* location, uint64_t size) {
VLOG(1) << "Called StreamExecutor::HostMemoryRegister(data=" << location
<< ")";

std::unique_ptr<ActivateContext> activation = Activate();
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE));
if (!status.ok()) {
LOG(ERROR) << "error registering host memory at " << location << ": "
<< status;
return false;
}
return true;
return HostRegister(cuda_context_, location, size);
}

bool CudaExecutor::HostMemoryUnregister(void* location) {
VLOG(1) << "Called StreamExecutor::HostUnregister(data=" << location << ")";

std::unique_ptr<ActivateContext> activation = Activate();
auto status = cuda::ToStatus(cuMemHostUnregister(location));
if (!status.ok()) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
<< status;
return false;
}
return true;
return HostUnregister(cuda_context_, location);
}

absl::Status CudaExecutor::SynchronousMemZero(DeviceMemoryBase* location,
@@ -1235,14 +1283,14 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {

{
std::string pci_bus_id = GetPCIBusID(device);

// Lower the hex characters to match sysfs.
pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
desc.set_pci_bus_id(pci_bus_id);

// Read the NUMA node corresponding to the PCI bus ID out of sysfs.
int numa_node = ReadNumaNode(pci_bus_id, device_ordinal);
desc.set_numa_node(numa_node);
std::optional<int> numa_node = ReadNumaNode(pci_bus_id, device_ordinal);
// If the kernel reports -1, adjust to 0; leave as -1 if no value could be
// obtained.
desc.set_numa_node(numa_node.has_value() ? std::max(0, *numa_node)
: tsl::port::kNUMANoAffinity);
}

{
3 changes: 3 additions & 0 deletions xla/stream_executor/cuda/cuda_executor.h
@@ -191,6 +191,9 @@ class CudaExecutor : public GpuExecutor {
// The minor version of the compute capability for device_.
int cc_minor_;

// The NUMA node of the CPU closest to device_
int numa_node_;

// Reader/writer lock for mutable data structures on this object.
absl::Mutex mu_;

1 change: 1 addition & 0 deletions xla/stream_executor/gpu/BUILD
@@ -733,6 +733,7 @@ xla_test(
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings",
"@com_google_googletest//:gtest_main",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:statusor",
"@tsl//tsl/platform:test",
] + if_cuda([
52 changes: 52 additions & 0 deletions xla/stream_executor/gpu/gpu_executor_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/tsl/platform/statusor.h"
#include "tsl/platform/numa.h"

namespace stream_executor {

@@ -45,6 +46,17 @@ TEST_F(GetPointerMemorySpaceTest, Host) {
EXPECT_EQ(memory_space, MemoryType::kHost);
}

TEST_F(GetPointerMemorySpaceTest, HostAllocatedWithMemoryKind) {
StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value();
DeviceMemoryBase host_ptr = executor->Allocate(
64, static_cast<int64_t>(stream_executor::MemoryType::kHost));
EXPECT_FALSE(host_ptr.is_null());
TF_ASSERT_OK_AND_ASSIGN(MemoryType memory_space,
executor->GetPointerMemorySpace(host_ptr.opaque()))
EXPECT_EQ(memory_space, MemoryType::kHost);
executor->Deallocate(&host_ptr);
}

TEST_F(GetPointerMemorySpaceTest, Device) {
StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value();
auto mem = executor->Allocate(64);
@@ -63,4 +75,44 @@ TEST_F(GetPointerMemorySpaceTest, Collective) {
executor->Deallocate(&mem);
}

using HostMemoryAllocateTest = GpuExecutorTest;

TEST_F(HostMemoryAllocateTest, Numa) {
Platform* platform = GetPlatform();
constexpr uint64_t kSize = 1024;
const int num_devices = platform->VisibleDeviceCount();
for (int device = 0; device < num_devices; ++device) {
TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
platform->ExecutorForDevice(device));
ASSERT_TRUE(executor);
const DeviceDescription& device_desc = executor->GetDeviceDescription();
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> host_ptr,
executor->HostMemoryAllocate(kSize));
ASSERT_TRUE(host_ptr);
EXPECT_NE(host_ptr->opaque(), nullptr);
const int numa_node = tsl::port::NUMAGetMemAffinity(host_ptr->opaque());
if (numa_node == tsl::port::kNUMANoAffinity) {
// Could be because `executor` could not determine its own NUMA node, in
// which case numa_node() will be -1 or 0, depending on the failure mode.
EXPECT_LE(device_desc.numa_node(), 0);
EXPECT_GE(device_desc.numa_node(), -1);
} else {
EXPECT_EQ(device_desc.numa_node(), numa_node);
}
}
}

TEST_F(HostMemoryAllocateTest, TooBig) {
Platform* platform = GetPlatform();
constexpr uint64_t kTooBig = 1125899906842624; // 1 PiB
const int num_devices = platform->VisibleDeviceCount();
for (int device = 0; device < num_devices; ++device) {
TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
platform->ExecutorForDevice(device));
ASSERT_TRUE(executor);
auto should_fail = executor->HostMemoryAllocate(kTooBig);
EXPECT_FALSE(should_fail.ok());
}
}

} // namespace stream_executor
26 changes: 12 additions & 14 deletions xla/stream_executor/gpu/read_numa_node.cc
@@ -26,17 +26,17 @@ limitations under the License.

namespace stream_executor::gpu {

int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) {
if (!tsl::port::NUMAEnabled()) {
// NUMA is not currently enabled. Return node 0.
return 0;
std::optional<int> ReadNumaNode(absl::string_view pci_bus_id,
int device_ordinal) {
if (tsl::port::NUMANumNodes() < 2) {
// NUMA support is not currently enabled, or there is only one node.
return tsl::port::kNUMANoAffinity;
}
VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
static const int kUnknownNumaNode = -1;

if (pci_bus_id.empty()) {
LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
return kUnknownNumaNode;
return std::nullopt;
}

std::string filename =
@@ -49,12 +49,13 @@ int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) {
if (file == nullptr) {
LOG(INFO) << "could not open file to read NUMA node: " << filename
<< "\nYour kernel may have been built without NUMA support.";
return kUnknownNumaNode;
return std::nullopt;
}

std::string content;
char buf[32];
size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
fclose(file);
buf[did_read] = '\0';
content = buf;

@@ -63,24 +64,21 @@
if (value < 0) { // See http://b/18228951 for details on this path.
LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
<< value
<< "), but there must be at least one NUMA node"
", so returning NUMA node zero."
<< "), but there must be at least one NUMA node so this will "
" be massaged to NUMA node zero in some places."
" See more at "
"https://github.com/torvalds/linux/blob/v6.0/Documentation/"
"ABI/testing/sysfs-bus-pci#L344-L355";
fclose(file);
return 0;
return tsl::port::kNUMANoAffinity;
}
fclose(file);
return value;
}

LOG(WARNING)
<< "could not convert SysFS file contents to integral NUMA node value: "
<< content;

fclose(file);
return kUnknownNumaNode;
return std::nullopt;
}

} // namespace stream_executor::gpu
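
For reference, the sysfs entry being read here is `/sys/bus/pci/devices/<pci_bus_id>/numa_node` (see the kernel ABI document linked in the log message above). The sketch below is illustrative only, since the actual filename construction is elided from the hunk; `ReadNumaNodeSketch` is a hypothetical name, not part of the change.

```cpp
#include <cstdio>
#include <optional>
#include <string>

// Illustrative sketch: look up the NUMA node of a PCI device via sysfs,
// given a lowercase PCI bus id such as "0000:3b:00.0".
std::optional<int> ReadNumaNodeSketch(const std::string& pci_bus_id) {
  const std::string filename =
      "/sys/bus/pci/devices/" + pci_bus_id + "/numa_node";
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) return std::nullopt;  // e.g. kernel built without NUMA
  int value = -1;
  const bool ok = fscanf(file, "%d", &value) == 1;
  fclose(file);
  if (!ok) return std::nullopt;
  // The kernel reports -1 when it does not know the node for this device.
  return value;
}
```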