diff --git a/.bazelrc b/.bazelrc index b94693e05efab..11783a8012ddb 100644 --- a/.bazelrc +++ b/.bazelrc @@ -351,6 +351,13 @@ build:windows --features=archive_param_file build:windows --copt=/d2ReducedOptimizeHugeFunctions build:windows --host_copt=/d2ReducedOptimizeHugeFunctions +# Before VS 2017 15.8, the member "type" of std::aligned_storage would +# non-conformingly have an alignment of only alignof(max_align_t). VS 2017 15.8 +# fixed this, but the fix inherently changes layout and breaks binary +# compatibility (*only* for uses of aligned_storage with extended alignments). +build:windows --copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE +build:windows --host_copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE + # Enable the runfiles symlink tree on Windows. This makes it possible to build # the pip package on Windows without an intermediate data-file archive, as the # build_pip_package script in its current form (as of Aug 2023) uses the diff --git a/third_party/tsl/.bazelrc b/third_party/tsl/.bazelrc index b94693e05efab..11783a8012ddb 100644 --- a/third_party/tsl/.bazelrc +++ b/third_party/tsl/.bazelrc @@ -351,6 +351,13 @@ build:windows --features=archive_param_file build:windows --copt=/d2ReducedOptimizeHugeFunctions build:windows --host_copt=/d2ReducedOptimizeHugeFunctions +# Before VS 2017 15.8, the member "type" of std::aligned_storage would +# non-conformingly have an alignment of only alignof(max_align_t). VS 2017 15.8 +# fixed this, but the fix inherently changes layout and breaks binary +# compatibility (*only* for uses of aligned_storage with extended alignments). +build:windows --copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE +build:windows --host_copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE + # Enable the runfiles symlink tree on Windows.
This makes it possible to build # the pip package on Windows without an intermediate data-file archive, as the # build_pip_package script in its current form (as of Aug 2023) uses the diff --git a/xla/service/cpu/runtime/thunk_executor.cc b/xla/service/cpu/runtime/thunk_executor.cc index f25fd6119a284..805840ad855e9 100644 --- a/xla/service/cpu/runtime/thunk_executor.cc +++ b/xla/service/cpu/runtime/thunk_executor.cc @@ -122,6 +122,9 @@ absl::StatusOr ThunkExecutor::Create( return ThunkExecutor(std::move(thunk_sequence), std::move(defs), options); } +ThunkExecutor::ExecuteState::Node::Node(const NodeDef& node_def) + : counter(node_def.in_edges.size()), out_edges(&node_def.out_edges) {} + ThunkExecutor::ExecuteState::ExecuteState(ThunkExecutor* executor, Thunk::TaskRunner* runner) : executor(executor), @@ -133,11 +136,9 @@ ThunkExecutor::ExecuteState::ExecuteState(ThunkExecutor* executor, DCHECK(runner == nullptr || static_cast(*runner)) << "`runner` must be nullptr or a valid TaskRunner"; - Node* node = nodes.data(); + NodeStorage* node = nodes.data(); for (const NodeDef& node_def : executor->nodes_defs()) { - node->counter.store(node_def.in_edges.size(), std::memory_order_release); - node->out_edges = &node_def.out_edges; - ++node; + new (node++) Node(node_def); } } @@ -271,7 +272,7 @@ void ThunkExecutor::Execute(ExecuteState* state, for (int64_t i = 0; i < ready_queue.size(); ++i) { NodeId id = ready_queue[i]; - ExecuteState::Node& node = state->nodes[id]; + ExecuteState::Node& node = state->node(id); int64_t cnt = node.counter.load(std::memory_order_acquire); DCHECK_EQ(cnt, 0) << "Node counter must be 0"; // Crash Ok @@ -375,7 +376,7 @@ void ThunkExecutor::ProcessOutEdges( // Append ready nodes to the back of the ready queue. 
for (NodeId out_edge : *node.out_edges) { - ExecuteState::Node& out_node = state->nodes[out_edge]; + ExecuteState::Node& out_node = state->node(out_edge); int64_t cnt = out_node.counter.fetch_sub(1, std::memory_order_release); DCHECK_GE(cnt, 1) << "Node counter can't drop below 0"; diff --git a/xla/service/cpu/runtime/thunk_executor.h b/xla/service/cpu/runtime/thunk_executor.h index 67a66c422bf5c..10df02c45a938 100644 --- a/xla/service/cpu/runtime/thunk_executor.h +++ b/xla/service/cpu/runtime/thunk_executor.h @@ -22,6 +22,7 @@ limitations under the License. #include #include #include +#include #include #include "absl/base/thread_annotations.h" @@ -113,16 +114,27 @@ class ThunkExecutor { // At run time NodeDef instantiated as a Node with an atomic counter that // drops to zero when all `in_edges` are ready. struct Node { + explicit Node(const NodeDef& node_def); + alignas(kAtomicAlignment) std::atomic counter; const std::vector* out_edges; }; + static_assert(std::is_trivially_destructible_v, + "Node must be trivially destructible"); + + // We use indirection via NodeStorage to be able to allocate uninitialized + // memory and avoid paying the cost of default-initializing all nodes. + using NodeStorage = std::aligned_storage_t; + ExecuteState(ThunkExecutor* executor, Thunk::TaskRunner* runner); + Node& node(NodeId id) { return *reinterpret_cast(&nodes[id]); } + ThunkExecutor* executor; Thunk::TaskRunner* runner; - absl::FixedArray nodes; + absl::FixedArray nodes; tsl::AsyncValueRef execute_event; // Once the number of pending sink nodes drops to zero, the execution is