diff --git a/xla/stream_executor/cuda/cuda_executor.cc b/xla/stream_executor/cuda/cuda_executor.cc index ebae36c2cfe59..8fe906746d985 100644 --- a/xla/stream_executor/cuda/cuda_executor.cc +++ b/xla/stream_executor/cuda/cuda_executor.cc @@ -461,7 +461,32 @@ std::string GetPCIBusID(CUdevice device) { LOG(ERROR) << "PCI bus id is not null terminated."; return ""; } - return std::string(raw_pci_bus_id.data()); + // Lower the hex characters to match sysfs. + return absl::AsciiStrToLower(absl::string_view(raw_pci_bus_id.data())); +} + +bool HostRegister(Context* context, void* location, uint64_t size) { + ScopedActivateContext activation(context); + // "Portable" memory is visible to all CUDA contexts. Safe for our use model. + auto status = cuda::ToStatus( + cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE)); + if (!status.ok()) { + LOG(ERROR) << "error registering host memory at " << location << ": " + << status; + return false; + } + return true; +} + +bool HostUnregister(Context* context, void* location) { + ScopedActivateContext activation(context); + auto status = cuda::ToStatus(cuMemHostUnregister(location)); + if (!status.ok()) { + LOG(ERROR) << "error unregistering host memory at " << location << ": " + << status; + return false; + } + return true; } // Allocates memory on the GPU device. @@ -502,34 +527,69 @@ void DeviceDeallocate(Context* context, void* location) { } // Allocates memory on the host. -absl::StatusOr HostAllocate(Context* context, uint64_t bytes) { - ScopedActivateContext activation(context); - void* host_mem = nullptr; - // "Portable" memory is visible to all CUDA contexts. Safe for our use model. - TF_RETURN_IF_ERROR(cuda::ToStatus( - cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE))); - return host_mem; +absl::StatusOr HostAllocate(Context* context, int numa_node, + uint64_t size) { + if (numa_node != tsl::port::kNUMANoAffinity) { + // CUDA programming guide: "Any address of a variable ... returned by one + // of the memory allocation routines from the driver ... API is always + // aligned to at least 256 bytes." + auto* buffer = + tsl::port::NUMAMalloc(numa_node, size, /* minimum_alignment=*/256); + if (buffer == nullptr && size > 0) { + return absl::InternalError(absl::StrFormat( + "Failed to allocate host memory of size %d pinned to NUMA node %d", + size, numa_node)); + } + if (size > 0 && !HostRegister(context, buffer, size)) { + tsl::port::NUMAFree(buffer, size); + return absl::InternalError( + absl::StrFormat("Failed to register host memory of size %d pinned to " + "NUMA node %d with the GPU driver", + size, numa_node)); + } + return buffer; + } else { + ScopedActivateContext activation(context); + void* buffer = nullptr; + // "Portable" memory is visible to all CUDA contexts. Safe for our use + // model. + TF_RETURN_IF_ERROR(cuda::ToStatus( + cuMemHostAlloc(&buffer, size, CU_MEMHOSTALLOC_PORTABLE))); + if (!buffer && size > 0) { + return absl::InternalError(absl::StrFormat( + "Failed to allocate pinned host memory of size %d", size)); + } + return buffer; + } } // Deallocates memory allocated via HostAllocate. -void HostDeallocate(Context* context, void* location) { - ScopedActivateContext activation(context); - auto status = cuda::ToStatus(cuMemFreeHost(location)); - if (!status.ok()) { - LOG(ERROR) << "error deallocating host memory at " << location << ": " - << status; +void HostDeallocate(Context* context, int numa_node, void* location, + uint64_t size) { + if (numa_node != tsl::port::kNUMANoAffinity) { + if (size > 0) { + HostUnregister(context, location); + } + tsl::port::NUMAFree(location, size); + } else { + ScopedActivateContext activation(context); + auto status = cuda::ToStatus(cuMemFreeHost(location)); + if (!status.ok()) { + LOG(ERROR) << "error deallocating host memory at " << location << ": " + << status; + } } } // Creates a MemoryAllocation wrapping the given host buffer. absl::StatusOr> AllocateHostMemory( - CudaContext* cuda_context, uint64_t size) { - TF_ASSIGN_OR_RETURN(void* ptr, HostAllocate(cuda_context, size)); + CudaContext* cuda_context, int numa_node, uint64_t size) { + TF_ASSIGN_OR_RETURN(void* ptr, HostAllocate(cuda_context, numa_node, size)); VLOG(2) << "allocated " << ptr << " for context " << cuda_context << " of " << size << " bytes of host memory"; return std::make_unique( - ptr, size, [cuda_context](void* location, uint64_t size) { - HostDeallocate(cuda_context, location); + ptr, size, [cuda_context, numa_node](void* location, uint64_t size) { + HostDeallocate(cuda_context, numa_node, location, size); VLOG(2) << "deallocated collective memory at " << location << " for context " << cuda_context; }); @@ -622,7 +682,7 @@ CudaExecutor::CreateMemoryAllocator(MemoryType type) { }); } else if (type == MemoryType::kHost) { return std::make_unique([this](uint64_t size) { - return AllocateHostMemory(cuda_context_, size); + return AllocateHostMemory(cuda_context_, numa_node_, size); }); } return absl::UnimplementedError( @@ -636,6 +696,12 @@ absl::Status CudaExecutor::Init() { cuda_context_ = context; TF_RETURN_IF_ERROR(GetComputeCapability(&cc_major_, &cc_minor_, device_)); TF_ASSIGN_OR_RETURN(delay_kernels_supported_, DelayKernelIsSupported()); + numa_node_ = ReadNumaNode(GetPCIBusID(device_), device_ordinal()) + .value_or(tsl::port::kNUMANoAffinity); + if (numa_node_ == tsl::port::kNUMANoAffinity) { + VLOG(2) << "Could not determine NUMA node of device ordinal " + << device_ordinal(); + } return absl::OkStatus(); } @@ -936,7 +1002,7 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) { return DeviceMemoryBase(result.value(), size); } else if (memory_space == static_cast(stream_executor::MemoryType::kHost)) { - auto result = HostAllocate(cuda_context_, size); + auto result = HostAllocate(cuda_context_, numa_node_, size); if (!result.ok()) { return DeviceMemoryBase(nullptr, 0); } @@ -948,7 +1014,7 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) { absl::StatusOr> CudaExecutor::HostMemoryAllocate(uint64_t size) { - return AllocateHostMemory(cuda_context_, size); + return AllocateHostMemory(cuda_context_, numa_node_, size); } void CudaExecutor::Deallocate(DeviceMemoryBase* mem) { @@ -959,7 +1025,7 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) { } auto memory_space = status_or_memory_space.value(); if (memory_space == MemoryType::kHost) { - HostDeallocate(cuda_context_, mem->opaque()); + HostDeallocate(cuda_context_, numa_node_, mem->opaque(), mem->size()); } else { DeviceDeallocate(cuda_context_, mem->opaque()); } @@ -972,30 +1038,12 @@ bool CudaExecutor::SynchronizeAllActivity() { bool CudaExecutor::HostMemoryRegister(void* location, uint64_t size) { VLOG(1) << "Called StreamExecutor::HostMemoryRegister(data=" << location << ")"; - - std::unique_ptr activation = Activate(); - // "Portable" memory is visible to all CUDA contexts. Safe for our use model. - auto status = cuda::ToStatus( - cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE)); - if (!status.ok()) { - LOG(ERROR) << "error registering host memory at " << location << ": " - << status; - return false; - } - return true; + return HostRegister(cuda_context_, location, size); } bool CudaExecutor::HostMemoryUnregister(void* location) { VLOG(1) << "Called StreamExecutor::HostUnregister(data=" << location << ")"; - - std::unique_ptr activation = Activate(); - auto status = cuda::ToStatus(cuMemHostUnregister(location)); - if (!status.ok()) { - LOG(ERROR) << "error unregistering host memory at " << location << ": " - << status; - return false; - } - return true; + return HostUnregister(cuda_context_, location); } absl::Status CudaExecutor::SynchronousMemZero(DeviceMemoryBase* location, @@ -1235,14 +1283,14 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) { { std::string pci_bus_id = GetPCIBusID(device); - - // Lower the hex characters to match sysfs. - pci_bus_id = absl::AsciiStrToLower(pci_bus_id); desc.set_pci_bus_id(pci_bus_id); // Read the NUMA node corresponding to the PCI bus ID out of sysfs. - int numa_node = ReadNumaNode(pci_bus_id, device_ordinal); - desc.set_numa_node(numa_node); + std::optional numa_node = ReadNumaNode(pci_bus_id, device_ordinal); + // If the kernel reports -1, adjust to 0; leave as -1 if no value could be + // obtained. + desc.set_numa_node(numa_node.has_value() ? std::max(0, *numa_node) + : tsl::port::kNUMANoAffinity); } { diff --git a/xla/stream_executor/cuda/cuda_executor.h b/xla/stream_executor/cuda/cuda_executor.h index 3729998f3c388..99a150a956b11 100644 --- a/xla/stream_executor/cuda/cuda_executor.h +++ b/xla/stream_executor/cuda/cuda_executor.h @@ -191,6 +191,9 @@ class CudaExecutor : public GpuExecutor { // The minor version of the compute capability for device_. int cc_minor_; + // The NUMA node of the CPU closest to device_ + int numa_node_; + // Reader/writer lock for mutable data structures on this object. absl::Mutex mu_; diff --git a/xla/stream_executor/gpu/BUILD b/xla/stream_executor/gpu/BUILD index ff0efee5d653e..4ca459433d957 100644 --- a/xla/stream_executor/gpu/BUILD +++ b/xla/stream_executor/gpu/BUILD @@ -733,6 +733,7 @@ xla_test( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest_main", + "@tsl//tsl/platform:platform_port", "@tsl//tsl/platform:statusor", "@tsl//tsl/platform:test", ] + if_cuda([ diff --git a/xla/stream_executor/gpu/gpu_executor_test.cc b/xla/stream_executor/gpu/gpu_executor_test.cc index 3e24f0d59d6b1..97f83f534520b 100644 --- a/xla/stream_executor/gpu/gpu_executor_test.cc +++ b/xla/stream_executor/gpu/gpu_executor_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "xla/stream_executor/platform_manager.h" #include "xla/stream_executor/stream_executor.h" #include "xla/tsl/platform/statusor.h" +#include "tsl/platform/numa.h" namespace stream_executor { @@ -45,6 +46,17 @@ TEST_F(GetPointerMemorySpaceTest, Host) { EXPECT_EQ(memory_space, MemoryType::kHost); } +TEST_F(GetPointerMemorySpaceTest, HostAllocatedWithMemoryKind) { + StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value(); + DeviceMemoryBase host_ptr = executor->Allocate( + 64, static_cast(stream_executor::MemoryType::kHost)); + EXPECT_FALSE(host_ptr.is_null()); + TF_ASSERT_OK_AND_ASSIGN(MemoryType memory_space, + executor->GetPointerMemorySpace(host_ptr.opaque())) + EXPECT_EQ(memory_space, MemoryType::kHost); + executor->Deallocate(&host_ptr); +} + TEST_F(GetPointerMemorySpaceTest, Device) { StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value(); auto mem = executor->Allocate(64); @@ -63,4 +75,44 @@ TEST_F(GetPointerMemorySpaceTest, Collective) { executor->Deallocate(&mem); } +using HostMemoryAllocateTest = GpuExecutorTest; + +TEST_F(HostMemoryAllocateTest, Numa) { + Platform* platform = GetPlatform(); + constexpr uint64_t kSize = 1024; + const int num_devices = platform->VisibleDeviceCount(); + for (int device = 0; device < num_devices; ++device) { + TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor, + platform->ExecutorForDevice(device)); + ASSERT_TRUE(executor); + const DeviceDescription& device_desc = executor->GetDeviceDescription(); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr host_ptr, + executor->HostMemoryAllocate(kSize)); + ASSERT_TRUE(host_ptr); + EXPECT_NE(host_ptr->opaque(), nullptr); + const int numa_node = tsl::port::NUMAGetMemAffinity(host_ptr->opaque()); + if (numa_node == tsl::port::kNUMANoAffinity) { + // Could be because `executor` could not determine its own NUMA node, in + // which case numa_node() will be -1 or 0, depending on the failure mode. + EXPECT_LE(device_desc.numa_node(), 0); + EXPECT_GE(device_desc.numa_node(), -1); + } else { + EXPECT_EQ(device_desc.numa_node(), numa_node); + } + } +} + +TEST_F(HostMemoryAllocateTest, TooBig) { + Platform* platform = GetPlatform(); + constexpr uint64_t kTooBig = 1125899906842624; // 1 PiB + const int num_devices = platform->VisibleDeviceCount(); + for (int device = 0; device < num_devices; ++device) { + TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor, + platform->ExecutorForDevice(device)); + ASSERT_TRUE(executor); + auto should_fail = executor->HostMemoryAllocate(kTooBig); + EXPECT_FALSE(should_fail.ok()); + } +} + } // namespace stream_executor diff --git a/xla/stream_executor/gpu/read_numa_node.cc b/xla/stream_executor/gpu/read_numa_node.cc index 57e4cf208bf0b..9a9a5f1eea871 100644 --- a/xla/stream_executor/gpu/read_numa_node.cc +++ b/xla/stream_executor/gpu/read_numa_node.cc @@ -26,17 +26,17 @@ limitations under the License. namespace stream_executor::gpu { -int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) { - if (!tsl::port::NUMAEnabled()) { - // NUMA is not currently enabled. Return node 0. - return 0; +std::optional ReadNumaNode(absl::string_view pci_bus_id, + int device_ordinal) { + if (tsl::port::NUMANumNodes() < 2) { + // NUMA support is not currently enabled, or there is only one node. + return tsl::port::kNUMANoAffinity; } VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal; - static const int kUnknownNumaNode = -1; if (pci_bus_id.empty()) { LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal; - return kUnknownNumaNode; + return std::nullopt; } std::string filename = @@ -49,12 +49,13 @@ int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) { if (file == nullptr) { LOG(INFO) << "could not open file to read NUMA node: " << filename << "\nYour kernel may have been built without NUMA support."; - return kUnknownNumaNode; + return std::nullopt; } std::string content; char buf[32]; size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file); + fclose(file); buf[did_read] = '\0'; content = buf; @@ -63,15 +64,13 @@ int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) { if (value < 0) { // See http://b/18228951 for details on this path. LOG(INFO) << "successful NUMA node read from SysFS had negative value (" << value - << "), but there must be at least one NUMA node" - ", so returning NUMA node zero." + << "), but there must be at least one NUMA node so this will " + " be massaged to NUMA node zero in some places." " See more at " "https://github.com/torvalds/linux/blob/v6.0/Documentation/" "ABI/testing/sysfs-bus-pci#L344-L355"; - fclose(file); - return 0; + return tsl::port::kNUMANoAffinity; } - fclose(file); return value; } @@ -79,8 +78,7 @@ int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal) { << "could not convert SysFS file contents to integral NUMA node value: " << content; - fclose(file); - return kUnknownNumaNode; + return std::nullopt; } } // namespace stream_executor::gpu diff --git a/xla/stream_executor/gpu/read_numa_node.h b/xla/stream_executor/gpu/read_numa_node.h index 93245bfe524a8..7d339663e1d4d 100644 --- a/xla/stream_executor/gpu/read_numa_node.h +++ b/xla/stream_executor/gpu/read_numa_node.h @@ -16,13 +16,17 @@ limitations under the License. #ifndef XLA_STREAM_EXECUTOR_GPU_READ_NUMA_NODE_H_ #define XLA_STREAM_EXECUTOR_GPU_READ_NUMA_NODE_H_ -#include +#include + +#include "absl/strings/string_view.h" namespace stream_executor::gpu { // Attempts to read the NUMA node corresponding to the GPU device's PCI bus out -// of SysFS. Returns -1 if it cannot. -int ReadNumaNode(const std::string& pci_bus_id, int device_ordinal); +// of SysFS. Returns an empty optional if no value could be determined, returns +// tsl::port::kNUMANoAffinity if the kernel reports a negative value. +std::optional ReadNumaNode(absl::string_view pci_bus_id, + int device_ordinal); } // namespace stream_executor::gpu diff --git a/xla/stream_executor/rocm/rocm_executor.cc b/xla/stream_executor/rocm/rocm_executor.cc index 559d217ef5d97..9d582d8593382 100644 --- a/xla/stream_executor/rocm/rocm_executor.cc +++ b/xla/stream_executor/rocm/rocm_executor.cc @@ -1026,8 +1026,11 @@ RocmExecutor::CreateDeviceDescription(int device_ordinal) { desc.set_pci_bus_id(pci_bus_id); // Read the NUMA node corresponding to the PCI bus ID out of sysfs. - int numa_node = ReadNumaNode(pci_bus_id, device_ordinal); - desc.set_numa_node(numa_node); + std::optional numa_node = ReadNumaNode(pci_bus_id, device_ordinal); + // If the kernel reports -1, adjust to 0; leave as -1 if no value could be + // obtained. + desc.set_numa_node(numa_node.has_value() ? std::max(0, *numa_node) + : tsl::port::kNUMANoAffinity); } hipDeviceProp_t prop; diff --git a/xla/tsl/BUILD b/xla/tsl/BUILD index 6cb475260290e..6b0b4ac474c62 100644 --- a/xla/tsl/BUILD +++ b/xla/tsl/BUILD @@ -416,12 +416,6 @@ config_setting( visibility = ["//visibility:public"], ) -config_setting( - name = "with_numa_support", - define_values = {"with_numa_support": "true"}, - visibility = ["//visibility:public"], -) - config_setting( name = "fuchsia", constraint_values = if_google( @@ -491,6 +485,26 @@ selects.config_setting_group( visibility = ["//visibility:public"], ) +selects.config_setting_group( + name = "linux_any", + match_any = [ + ":linux_aarch64", + ":linux_armhf", + ":linux_ppc64le", + ":linux_s390x", + ":linux_x86_64", + ], + visibility = ["//visibility:public"], +) + +# Enable NUMA on Linux platforms; hwloc does not work on Android +selects.config_setting_group( + name = "with_numa_support", + match_any = [ + ":linux_any", + ], +) + # TODO(jakeharmon): Remove equivalent from tensorflow/BUILD config_setting( name = "fuchsia_x86_64",