PR #21708: NUMA-pin host memory buffers for D2H/H2D transfers #22243

Open · wants to merge 1 commit into base: main
142 changes: 95 additions & 47 deletions xla/stream_executor/cuda/cuda_executor.cc
@@ -461,7 +461,32 @@ std::string GetPCIBusID(CUdevice device) {
LOG(ERROR) << "PCI bus id is not null terminated.";
return "";
}
return std::string(raw_pci_bus_id.data());
// Lower the hex characters to match sysfs.
return absl::AsciiStrToLower(absl::string_view(raw_pci_bus_id.data()));
}
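
The lowering matters because sysfs paths are case-sensitive: ReadNumaNode (further down in this diff) builds a /sys/bus/pci/devices/<bus id>/numa_node path from this string. A minimal sketch of the mapping, using a made-up bus ID:

// Sketch only, not part of this diff: the sysfs path consulted downstream,
// built from the lower-cased bus ID. "0000:3B:00.0" is a hypothetical value.
#include <string>
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"

std::string NumaNodeSysfsPath(absl::string_view raw_pci_bus_id) {
  // "0000:3B:00.0" -> "/sys/bus/pci/devices/0000:3b:00.0/numa_node"
  return absl::StrCat("/sys/bus/pci/devices/",
                      absl::AsciiStrToLower(raw_pci_bus_id), "/numa_node");
}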

bool HostRegister(Context* context, void* location, uint64_t size) {
ScopedActivateContext activation(context);
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE));
if (!status.ok()) {
LOG(ERROR) << "error registering host memory at " << location << ": "
<< status;
return false;
}
return true;
}

bool HostUnregister(Context* context, void* location) {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemHostUnregister(location));
if (!status.ok()) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
<< status;
return false;
}
return true;
}
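
These helpers centralize logic that CudaExecutor::HostMemoryRegister and CudaExecutor::HostMemoryUnregister (rewired further down in this diff) previously inlined. A hedged usage sketch through the public StreamExecutor API, with an assumed executor and buffer:

// Sketch only (not in this diff): pin an existing host buffer for DMA and
// unpin it before the buffer is destroyed. `executor` is an assumed valid
// StreamExecutor*; std::vector requires <vector>.
std::vector<char> staging(1 << 20);
if (executor->HostMemoryRegister(staging.data(), staging.size())) {
  // ... enqueue H2D/D2H transfers that touch staging.data() ...
  executor->HostMemoryUnregister(staging.data());
}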

// Allocates memory on the GPU device.
@@ -502,34 +527,69 @@ void DeviceDeallocate(Context* context, void* location) {
}

// Allocates memory on the host.
absl::StatusOr<void*> HostAllocate(Context* context, uint64_t bytes) {
ScopedActivateContext activation(context);
void* host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE)));
return host_mem;
absl::StatusOr<void*> HostAllocate(Context* context, int numa_node,
uint64_t size) {
if (numa_node != tsl::port::kNUMANoAffinity) {
// CUDA programming guide: "Any address of a variable ... returned by one
// of the memory allocation routines from the driver ... API is always
// aligned to at least 256 bytes."
auto* buffer =
tsl::port::NUMAMalloc(numa_node, size, /*minimum_alignment=*/256);
if (buffer == nullptr && size > 0) {
return absl::InternalError(absl::StrFormat(
"Failed to allocate host memory of size %d pinned to NUMA node %d",
size, numa_node));
}
if (size > 0 && !HostRegister(context, buffer, size)) {
tsl::port::NUMAFree(buffer, size);
return absl::InternalError(
absl::StrFormat("Failed to register host memory of size %d pinned to "
"NUMA node %d with the GPU driver",
size, numa_node));
}
return buffer;
} else {
ScopedActivateContext activation(context);
void* buffer = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use
// model.
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemHostAlloc(&buffer, size, CU_MEMHOSTALLOC_PORTABLE)));
if (!buffer && size > 0) {
return absl::InternalError(absl::StrFormat(
"Failed to allocate pinned host memory of size %d", size));
}
return buffer;
}
}
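
The 256-byte minimum_alignment passed to NUMAMalloc mirrors the CUDA guarantee quoted above, so callers see the same alignment contract from both branches. A hedged sketch of how that invariant could be checked (the helper is illustrative, not in the PR):

// Sketch only: both allocation paths should yield 256-byte-aligned pointers.
#include <cstdint>
#include "tsl/platform/numa.h"

bool IsAligned256(const void* p) {
  return reinterpret_cast<std::uintptr_t>(p) % 256 == 0;
}

// void* buf = tsl::port::NUMAMalloc(/*node=*/0, 1024, /*minimum_alignment=*/256);
// if (buf != nullptr) { assert(IsAligned256(buf)); tsl::port::NUMAFree(buf, 1024); }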

// Deallocates memory allocated via HostAllocate.
void HostDeallocate(Context* context, void* location) {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemFreeHost(location));
if (!status.ok()) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< status;
void HostDeallocate(Context* context, int numa_node, void* location,
uint64_t size) {
if (numa_node != tsl::port::kNUMANoAffinity) {
if (size > 0) {
HostUnregister(context, location);
}
tsl::port::NUMAFree(location, size);
} else {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemFreeHost(location));
if (!status.ok()) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< status;
}
}
}
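
Note the ordering on the NUMA path: the buffer is unregistered from the driver before the pages are returned to the OS, the reverse of the acquire order in HostAllocate. A minimal RAII sketch (not part of this change) that encodes the pairing using the helpers above:

// Sketch only: ties registration lifetime to allocation lifetime, mirroring
// what HostAllocate/HostDeallocate do for the NUMA-pinned path.
class ScopedNumaPinnedBuffer {
 public:
  ScopedNumaPinnedBuffer(Context* context, int numa_node, uint64_t size)
      : context_(context), size_(size) {
    ptr_ = tsl::port::NUMAMalloc(numa_node, size, /*minimum_alignment=*/256);
    if (ptr_ != nullptr && !HostRegister(context_, ptr_, size_)) {
      tsl::port::NUMAFree(ptr_, size_);
      ptr_ = nullptr;
    }
  }
  ~ScopedNumaPinnedBuffer() {
    if (ptr_ != nullptr) {
      HostUnregister(context_, ptr_);  // Unregister before freeing.
      tsl::port::NUMAFree(ptr_, size_);
    }
  }
  void* get() const { return ptr_; }

 private:
  Context* context_;
  uint64_t size_;
  void* ptr_ = nullptr;
};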

// Creates a MemoryAllocation wrapping the given host buffer.
absl::StatusOr<std::unique_ptr<MemoryAllocation>> AllocateHostMemory(
CudaContext* cuda_context, uint64_t size) {
TF_ASSIGN_OR_RETURN(void* ptr, HostAllocate(cuda_context, size));
CudaContext* cuda_context, int numa_node, uint64_t size) {
TF_ASSIGN_OR_RETURN(void* ptr, HostAllocate(cuda_context, numa_node, size));
VLOG(2) << "allocated " << ptr << " for context " << cuda_context << " of "
<< size << " bytes of host memory";
return std::make_unique<GenericMemoryAllocation>(
ptr, size, [cuda_context](void* location, uint64_t size) {
HostDeallocate(cuda_context, location);
ptr, size, [cuda_context, numa_node](void* location, uint64_t size) {
HostDeallocate(cuda_context, numa_node, location, size);
VLOG(2) << "deallocated collective memory at " << location
<< " for context " << cuda_context;
});
@@ -622,7 +682,7 @@ CudaExecutor::CreateMemoryAllocator(MemoryType type) {
});
} else if (type == MemoryType::kHost) {
return std::make_unique<GenericMemoryAllocator>([this](uint64_t size) {
return AllocateHostMemory(cuda_context_, size);
return AllocateHostMemory(cuda_context_, numa_node_, size);
});
}
return absl::UnimplementedError(
@@ -636,6 +696,12 @@ absl::Status CudaExecutor::Init() {
cuda_context_ = context;
TF_RETURN_IF_ERROR(GetComputeCapability(&cc_major_, &cc_minor_, device_));
TF_ASSIGN_OR_RETURN(delay_kernels_supported_, DelayKernelIsSupported());
numa_node_ = ReadNumaNode(GetPCIBusID(device_), device_ordinal())
.value_or(tsl::port::kNUMANoAffinity);
if (numa_node_ == tsl::port::kNUMANoAffinity) {
VLOG(2) << "Could not determine NUMA node of device ordinal "
<< device_ordinal();
}
return absl::OkStatus();
}

@@ -936,7 +1002,7 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
return DeviceMemoryBase(result.value(), size);
} else if (memory_space ==
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
auto result = HostAllocate(cuda_context_, size);
auto result = HostAllocate(cuda_context_, numa_node_, size);
if (!result.ok()) {
return DeviceMemoryBase(nullptr, 0);
}
@@ -948,7 +1014,7 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {

absl::StatusOr<std::unique_ptr<MemoryAllocation>>
CudaExecutor::HostMemoryAllocate(uint64_t size) {
return AllocateHostMemory(cuda_context_, size);
return AllocateHostMemory(cuda_context_, numa_node_, size);
}

void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
@@ -959,7 +1025,7 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
}
auto memory_space = status_or_memory_space.value();
if (memory_space == MemoryType::kHost) {
HostDeallocate(cuda_context_, mem->opaque());
HostDeallocate(cuda_context_, numa_node_, mem->opaque(), mem->size());
} else {
DeviceDeallocate(cuda_context_, mem->opaque());
}
@@ -972,30 +1038,12 @@ bool CudaExecutor::SynchronizeAllActivity() {
bool CudaExecutor::HostMemoryRegister(void* location, uint64_t size) {
VLOG(1) << "Called StreamExecutor::HostMemoryRegister(data=" << location
<< ")";

std::unique_ptr<ActivateContext> activation = Activate();
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE));
if (!status.ok()) {
LOG(ERROR) << "error registering host memory at " << location << ": "
<< status;
return false;
}
return true;
return HostRegister(cuda_context_, location, size);
}

bool CudaExecutor::HostMemoryUnregister(void* location) {
VLOG(1) << "Called StreamExecutor::HostUnregister(data=" << location << ")";

std::unique_ptr<ActivateContext> activation = Activate();
auto status = cuda::ToStatus(cuMemHostUnregister(location));
if (!status.ok()) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
<< status;
return false;
}
return true;
return HostUnregister(cuda_context_, location);
}

absl::Status CudaExecutor::SynchronousMemZero(DeviceMemoryBase* location,
@@ -1235,14 +1283,14 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {

{
std::string pci_bus_id = GetPCIBusID(device);

// Lower the hex characters to match sysfs.
pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
desc.set_pci_bus_id(pci_bus_id);

// Read the NUMA node corresponding to the PCI bus ID out of sysfs.
int numa_node = ReadNumaNode(pci_bus_id, device_ordinal);
desc.set_numa_node(numa_node);
std::optional<int> numa_node = ReadNumaNode(pci_bus_id, device_ordinal);
// If the kernel reports -1, adjust to 0; leave as -1 if no value could be
// obtained.
desc.set_numa_node(numa_node.has_value() ? std::max(0, *numa_node)
: tsl::port::kNUMANoAffinity);
}
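
The two consumers of ReadNumaNode thus diverge deliberately: CudaExecutor::Init() keeps kNUMANoAffinity (-1) so HostAllocate falls back to cuMemHostAlloc, while DeviceDescription clamps a reported -1 to node zero. A small sketch restating the branch above (the function name is illustrative):

// Sketch only: mirrors the desc.set_numa_node(...) expression above.
#include <algorithm>
#include <optional>

int DescriptionNumaNode(std::optional<int> numa_node) {
  return numa_node.has_value() ? std::max(0, *numa_node)
                               : tsl::port::kNUMANoAffinity;
}
// DescriptionNumaNode(2)            == 2    (sysfs reported node 2)
// DescriptionNumaNode(-1)           == 0    (sysfs reported -1, or NUMA off)
// DescriptionNumaNode(std::nullopt) == -1   (no PCI bus ID / unreadable sysfs)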

{
3 changes: 3 additions & 0 deletions xla/stream_executor/cuda/cuda_executor.h
@@ -191,6 +191,9 @@ class CudaExecutor : public GpuExecutor {
// The minor version of the compute capability for device_.
int cc_minor_;

// The NUMA node of the CPU closest to device_.
int numa_node_;

// Reader/writer lock for mutable data structures on this object.
absl::Mutex mu_;

1 change: 1 addition & 0 deletions xla/stream_executor/gpu/BUILD
@@ -733,6 +733,7 @@ xla_test(
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings",
"@com_google_googletest//:gtest_main",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:statusor",
"@tsl//tsl/platform:test",
] + if_cuda([
52 changes: 52 additions & 0 deletions xla/stream_executor/gpu/gpu_executor_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/tsl/platform/statusor.h"
#include "tsl/platform/numa.h"

namespace stream_executor {

@@ -45,6 +46,17 @@ TEST_F(GetPointerMemorySpaceTest, Host) {
EXPECT_EQ(memory_space, MemoryType::kHost);
}

TEST_F(GetPointerMemorySpaceTest, HostAllocatedWithMemoryKind) {
StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value();
DeviceMemoryBase host_ptr = executor->Allocate(
64, static_cast<int64_t>(stream_executor::MemoryType::kHost));
EXPECT_FALSE(host_ptr.is_null());
TF_ASSERT_OK_AND_ASSIGN(MemoryType memory_space,
executor->GetPointerMemorySpace(host_ptr.opaque()));
EXPECT_EQ(memory_space, MemoryType::kHost);
executor->Deallocate(&host_ptr);
}

TEST_F(GetPointerMemorySpaceTest, Device) {
StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value();
auto mem = executor->Allocate(64);
@@ -63,4 +75,44 @@ TEST_F(GetPointerMemorySpaceTest, Collective) {
executor->Deallocate(&mem);
}

using HostMemoryAllocateTest = GpuExecutorTest;

TEST_F(HostMemoryAllocateTest, Numa) {
Platform* platform = GetPlatform();
constexpr uint64_t kSize = 1024;
const int num_devices = platform->VisibleDeviceCount();
for (int device = 0; device < num_devices; ++device) {
TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
platform->ExecutorForDevice(device));
ASSERT_TRUE(executor);
const DeviceDescription& device_desc = executor->GetDeviceDescription();
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> host_ptr,
executor->HostMemoryAllocate(kSize));
ASSERT_TRUE(host_ptr);
EXPECT_NE(host_ptr->opaque(), nullptr);
const int numa_node = tsl::port::NUMAGetMemAffinity(host_ptr->opaque());
if (numa_node == tsl::port::kNUMANoAffinity) {
// Could be because `executor` could not determine its own NUMA node, in
// which case numa_node() will be -1 or 0, depending on the failure mode.
EXPECT_LE(device_desc.numa_node(), 0);
EXPECT_GE(device_desc.numa_node(), -1);
} else {
EXPECT_EQ(device_desc.numa_node(), numa_node);
}
}
}

TEST_F(HostMemoryAllocateTest, TooBig) {
Platform* platform = GetPlatform();
constexpr uint64_t kTooBig = 1125899906842624; // 1 PiB
const int num_devices = platform->VisibleDeviceCount();
for (int device = 0; device < num_devices; ++device) {
TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
platform->ExecutorForDevice(device));
ASSERT_TRUE(executor);
auto should_fail = executor->HostMemoryAllocate(kTooBig);
EXPECT_FALSE(should_fail.ok());
}
}

} // namespace stream_executor
26 changes: 12 additions & 14 deletions xla/stream_executor/gpu/read_numa_node.cc
@@ -26,17 +26,17 @@ limitations under the License.

namespace stream_executor::gpu {

int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) {
if (!tsl::port::NUMAEnabled()) {
// NUMA is not currently enabled. Return node 0.
return 0;
std::optional<int> ReadNumaNode(absl::string_view pci_bus_id,
int device_ordinal) {
if (tsl::port::NUMANumNodes() < 2) {
// NUMA support is not currently enabled, or there is only one node.
return tsl::port::kNUMANoAffinity;
}
VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
static const int kUnknownNumaNode = -1;

if (pci_bus_id.empty()) {
LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
return kUnknownNumaNode;
return std::nullopt;
}

std::string filename =
@@ -49,12 +49,13 @@ int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) {
if (file == nullptr) {
LOG(INFO) << "could not open file to read NUMA node: " << filename
<< "\nYour kernel may have been built without NUMA support.";
return kUnknownNumaNode;
return std::nullopt;
}

std::string content;
char buf[32];
size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
fclose(file);
buf[did_read] = '\0';
content = buf;

@@ -63,24 +64,21 @@ int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) {
if (value < 0) { // See http://b/18228951 for details on this path.
LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
<< value
<< "), but there must be at least one NUMA node"
", so returning NUMA node zero."
<< "), but there must be at least one NUMA node so this will "
" be massaged to NUMA node zero in some places."
" See more at "
"https://github.com/torvalds/linux/blob/v6.0/Documentation/"
"ABI/testing/sysfs-bus-pci#L344-L355";
fclose(file);
return 0;
return tsl::port::kNUMANoAffinity;
}
fclose(file);
return value;
}

LOG(WARNING)
<< "could not convert SysFS file contents to integral NUMA node value: "
<< content;

fclose(file);
return kUnknownNumaNode;
return std::nullopt;
}
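
For reference, a hedged usage sketch of how CudaExecutor::Init() consumes this helper (the header path is assumed from this file's location):

// Sketch only: resolve a device's NUMA node, falling back to
// kNUMANoAffinity when sysfs gives no usable answer.
#include <optional>
#include "absl/strings/string_view.h"
#include "tsl/platform/numa.h"
#include "xla/stream_executor/gpu/read_numa_node.h"  // assumed header

int ResolveNumaNode(absl::string_view pci_bus_id, int device_ordinal) {
  return stream_executor::gpu::ReadNumaNode(pci_bus_id, device_ordinal)
      .value_or(tsl::port::kNUMANoAffinity);
}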

} // namespace stream_executor::gpu