Skip to content

Commit

Permalink
Use a smart pointer to store profiler and use DumpProtoToDirectory
Browse files Browse the repository at this point in the history
…to store XSpace

to create directories automatically when the `xla_gpu_dump_xspace_to` directory doesn't exist

PiperOrigin-RevId: 722694175
  • Loading branch information
juliagmt-google authored and Google-ML-Automation committed Feb 3, 2025
1 parent a42a623 commit 1c951d2
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 12 deletions.
3 changes: 3 additions & 0 deletions xla/tools/multihost_hlo_runner/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ cc_library(
"//xla/pjrt/distributed:key_value_store_interface",
"//xla/service:computation_layout",
"//xla/service:computation_placer_hdr",
"//xla/service:dump",
"//xla/service:hlo_module_config",
"//xla/service:hlo_proto_cc",
"//xla/tests:test_utils",
Expand All @@ -180,6 +181,7 @@ cc_library(
"@tsl//tsl/platform:env",
"@tsl//tsl/platform:errors",
"@tsl//tsl/platform:logging",
"@tsl//tsl/platform:path",
"@tsl//tsl/platform:status",
"@tsl//tsl/platform:statusor",
"@tsl//tsl/profiler/lib:profiler_factory_impl",
Expand Down Expand Up @@ -242,5 +244,6 @@ xla_test(
"@com_google_absl//absl/time",
"@com_google_googletest//:gtest",
"@tsl//tsl/platform:path",
"@tsl//tsl/platform:test",
],
)
15 changes: 10 additions & 5 deletions xla/tools/multihost_hlo_runner/functional_hlo_runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ limitations under the License.
#include "xla/primitive_util.h"
#include "xla/service/computation_layout.h"
#include "xla/service/computation_placer.h"
#include "xla/service/dump.h"
#include "xla/service/hlo.pb.h"
#include "xla/service/hlo_module_config.h"
#include "xla/shape_util.h"
Expand All @@ -73,6 +74,7 @@ limitations under the License.
#include "xla/util.h"
#include "xla/xla.pb.h"
#include "xla/xla_data.pb.h"
#include "tsl/platform/path.h"
#include "tsl/profiler/lib/profiler_session.h"

namespace xla {
Expand Down Expand Up @@ -1147,7 +1149,7 @@ FunctionalHloRunner::RunInternal(
}
if (repeat == running_options.num_repeats - 1) {
execute_options.untuple_result = default_untuple_result;
if (running_options.profiler != nullptr) {
if (running_options.profiler) {
running_options.profiler->CreateSession();
}
}
Expand Down Expand Up @@ -1190,7 +1192,7 @@ FunctionalHloRunner::RunInternal(
FetchAndLogOutput(client, output_buffers,
running_options.module_output_mode,
running_options.log_input_output()));
if (running_options.profiler != nullptr) {
if (running_options.profiler) {
running_options.profiler->UploadSession();
}
return results;
Expand Down Expand Up @@ -1624,9 +1626,12 @@ void GPURunnerProfiler::UploadSession() {

CHECK(!dump_path_.empty());

LOG(INFO) << "Saving xspace result to " << dump_path_;
// Save in binary format to create xprof sessions and extract device stats.
CHECK_OK(WriteBinaryProto(tsl::Env::Default(), dump_path_, *xspace_.get()));
std::string file_path =
tsl::io::JoinPath(dump_path_, SanitizeFileName("xspace")) + ".pb";
LOG(INFO) << "Dumped HLO text to " << file_path;
std::string path;
TF_CHECK_OK(
DumpProtoToDirectory(*xspace_.get(), dump_path_, file_path, &path));
if (!keep_xspace_) {
xspace_ = nullptr;
}
Expand Down
2 changes: 1 addition & 1 deletion xla/tools/multihost_hlo_runner/functional_hlo_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ class FunctionalHloRunner {
// This indicates whether we log the inputs and outputs to stderr.
LogOutputMode log_input_output_mode = LogOutputMode::kNotLogOutput;
const MultiSliceConfig* multi_slice_config = nullptr;
ProfilerInterface* profiler = nullptr;
std::unique_ptr<ProfilerInterface> profiler;
// Whether to untuple the result of running HLO module into a vector of
// arrays. If unprovided, use the default in ExecuteOptions.
std::optional<bool> untuple_result = std::nullopt;
Expand Down
50 changes: 46 additions & 4 deletions xla/tools/multihost_hlo_runner/functional_hlo_runner_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ limitations under the License.
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <gmock/gmock.h>
Expand Down Expand Up @@ -151,9 +152,9 @@ TEST_F(FunctionalHloRunnerTest, GPUProfilerKeepXSpaceReturnsNonNullXSpace) {

FunctionalHloRunner::RunningOptions running_options;
TF_ASSERT_OK_AND_ASSIGN(
auto profiler,
std::unique_ptr<GPURunnerProfiler> profiler,
GPURunnerProfiler::Create(profile_dump_path, /*keep_xspace=*/true));
running_options.profiler = profiler.get();
running_options.profiler = std::move(profiler);

profiler->CreateSession();
profiler->UploadSession();
Expand Down Expand Up @@ -187,9 +188,9 @@ TEST_F(FunctionalHloRunnerTest,
GetPjRtEnvironmentForGpu("", gpu_options, absl::Seconds(120)));
FunctionalHloRunner::RunningOptions running_options;
TF_ASSERT_OK_AND_ASSIGN(
auto profiler,
std::unique_ptr<GPURunnerProfiler> profiler,
GPURunnerProfiler::Create(profile_dump_path, /*keep_xspace=*/false));
running_options.profiler = profiler.get();
running_options.profiler = std::move(profiler);

TF_EXPECT_OK(FunctionalHloRunner::LoadAndRunAndDump(
*pjrt_env.client,
Expand All @@ -199,6 +200,47 @@ TEST_F(FunctionalHloRunnerTest,
TF_EXPECT_OK(env->FileExists(profile_dump_path));
}

// Runs a single-device HLO module with a GPU profiler whose dump path points
// at a directory that is absent before the run, and checks that the directory
// is present once the run has finished.
TEST_F(FunctionalHloRunnerTest,
       SingleDeviceHloWithNonExistingDirectorySavesXSpaceToDisk) {
  if (IsTestingCpu()) {
    GTEST_SKIP() << "GPU-only test";
  }

  // Dump target under the test temp dir; it must not exist up front.
  const std::string xspace_dir =
      tsl::io::JoinPath(tsl::testing::TmpDir(), "non_existing_directory");

  tsl::FileSystem* file_system = nullptr;
  TF_ASSERT_OK(
      tsl::Env::Default()->GetFileSystemForFile(xspace_dir, &file_system));
  EXPECT_THAT(file_system->IsDirectory(xspace_dir),
              tsl::testing::StatusIs(tsl::error::NOT_FOUND));

  // Client configuration: node 0 of 16, with mock NCCL enabled.
  GpuClientOptions client_options;
  client_options.node_id = 0;
  client_options.num_nodes = 16;
  client_options.enable_mock_nccl = true;

  FunctionalHloRunner::RawCompileOptions compile_options;
  compile_options.xla_gpu_dump_xspace_to = xspace_dir;

  TF_ASSERT_OK_AND_ASSIGN(
      xla::PjRtEnvironment gpu_env,
      GetPjRtEnvironmentForGpu("", client_options, absl::Seconds(120)));

  // Ownership of the profiler is handed over to the running options.
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<GPURunnerProfiler> xspace_profiler,
      GPURunnerProfiler::Create(xspace_dir, /*keep_xspace=*/false));
  FunctionalHloRunner::RunningOptions run_options;
  run_options.profiler = std::move(xspace_profiler);

  TF_EXPECT_OK(FunctionalHloRunner::LoadAndRunAndDump(
      *gpu_env.client,
      /* debug_options= */ {}, /* preproc_options= */ {}, compile_options,
      run_options, {GetHloPath("single_device.hlo")}, InputFormat::kText));

  // After the run, the dump directory is expected to exist.
  EXPECT_THAT(file_system->IsDirectory(xspace_dir),
              tsl::testing::StatusIs(tsl::error::OK));
}

TEST_F(FunctionalHloRunnerTest, Sharded2Devices) {
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::PjRtClient> client,
GetPjRtClient());
Expand Down
5 changes: 3 additions & 2 deletions xla/tools/multihost_hlo_runner/hlo_runner_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ limitations under the License.
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "absl/log/check.h"
Expand Down Expand Up @@ -248,10 +249,10 @@ static absl::Status RunMultihostHloRunner(int argc, char** argv,
// Create a GPURunnerProfiler to profile GPU executions to save xspace data
// to disk.
if (env.client != nullptr && !opts.xla_gpu_dump_xspace_to.empty()) {
TF_ASSIGN_OR_RETURN(auto profiler,
TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::GPURunnerProfiler> profiler,
GPURunnerProfiler::Create(opts.xla_gpu_dump_xspace_to,
/*keep_xspace=*/false));
running_options.profiler = profiler.get();
running_options.profiler = std::move(profiler);
}
} else if (opts.device_type_str == "host") {
TF_ASSIGN_OR_RETURN(env, xla::GetPjRtEnvironmentForHostCpu());
Expand Down

0 comments on commit 1c951d2

Please sign in to comment.