Skip to content

Commit

Permalink
update to fill XSpace
Browse files — browse the repository at this point in the history
  • Loading branch information
cj401-ai committed Nov 20, 2024
1 parent 91535fb commit d4ca46c
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 102 deletions.
61 changes: 59 additions & 2 deletions xla/backends/profiler/gpu/device_tracer_rocm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,14 @@ class GpuTracer : public profiler::ProfilerInterface {
absl::Status Start() override;
absl::Status Stop() override;
absl::Status CollectData(XSpace* space) override;

private:
absl::Status DoStart();
absl::Status DoStop();

RocmTracerOptions GetRocmTracerOptions();
RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus);

enum State {
kNotStarted,
kStartedOk,
Expand All @@ -95,15 +98,46 @@ class GpuTracer : public profiler::ProfilerInterface {
State profiling_state_ = State::kNotStarted;

RocmTracer* rocm_tracer_;
std::unique_ptr<RocmTraceCollector> rocm_trace_collector_;
};

// Produces the tracer options for a profiling session. No field is customized
// here: the options object is default-initialized and returned unchanged.
RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
  // TODO(rocm-profiler): Decide whether context support similar to CUDA is
  // needed here.
  RocmTracerOptions tracer_options;
  return tracer_options;
}

// Builds the collector options for a session covering `num_gpus` devices.
// The event and annotation-string limits are fixed values; only the GPU count
// comes from the caller.
RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions(
    uint32_t num_gpus) {
  constexpr int kMebi = 1024 * 1024;

  RocmTraceCollectorOptions collector_options;
  collector_options.num_gpus = num_gpus;
  collector_options.max_annotation_strings = kMebi;
  collector_options.max_activity_api_events = 2 * kMebi;
  collector_options.max_callback_api_events = 2 * kMebi;
  return collector_options;
}

absl::Status GpuTracer::DoStart() {
/*
if (!rocm_tracer_->IsAvailable()) {
return tsl::errors::Unavailable("Another profile session running.");
}

*/
// AnnotationStack::Enable(true);

RocmTraceCollectorOptions trace_collector_options =
GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus());
uint64_t start_gputime_ns = rocm_tracer_->GetTimestamp();
uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();
rocm_trace_collector_ = CreateRocmCollector(
trace_collector_options, start_walltime_ns, start_gputime_ns);

RocmTracerOptions tracer_options = GetRocmTracerOptions();
rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get());
LOG(ERROR) << "cj rocm_tracer_collector = " << rocm_trace_collector_.get();
LOG(ERROR) << "cj rocm_tracer_ collector = " << rocm_tracer_->get_collector();
LOG(ERROR) << "DO START ...";

rocm_tracer_->setup();
rocm_tracer_->start();
return absl::OkStatus();
Expand Down Expand Up @@ -134,6 +168,28 @@ absl::Status GpuTracer::Stop() {
return absl::OkStatus();
}

// Exports the collected trace into `space` when the session was stopped
// cleanly. The not-started and error states yield no data and still return
// OK; asking for data while the session is running is a FailedPrecondition
// error.
absl::Status GpuTracer::CollectData(XSpace* space) {
  const State state = profiling_state_;

  if (state == State::kStoppedOk) {
    // Without a collector there is nothing to export; that is not an error.
    if (rocm_trace_collector_ != nullptr) {
      rocm_trace_collector_->Export(space);
    }
    return absl::OkStatus();
  }
  if (state == State::kStartedOk) {
    return tsl::errors::FailedPrecondition(
        "Cannot collect trace before stopping");
  }
  if (state == State::kNotStarted) {
    VLOG(3) << "No trace data collected, session wasn't started";
    return absl::OkStatus();
  }
  if (state == State::kStartedError) {
    LOG(ERROR) << "Cannot collect, roctracer failed to start";
    return absl::OkStatus();
  }
  if (state == State::kStoppedError) {
    VLOG(3) << "No trace data collected";
    return absl::OkStatus();
  }

  // Reached only when the state matches none of the cases handled above.
  return tsl::errors::Internal("Invalid profiling state: ", profiling_state_);
}

// Not in anonymous namespace for testing purposes.
std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
const ProfileOptions& options) {
Expand All @@ -144,6 +200,7 @@ std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(

profiler::RocmTracer* rocm_tracer =
profiler::RocmTracer::GetRocmTracerSingleton();
LOG(ERROR) << "cj rocm_tracer is available = " << rocm_tracer->IsAvailable();
if (!rocm_tracer->IsAvailable()) {
return nullptr;
}
Expand Down
6 changes: 5 additions & 1 deletion xla/backends/profiler/gpu/rocm_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,6 @@ class RocmTraceCollectorImpl : public profiler::RocmTraceCollector {
// std::vector<RocmTracerEvent>
RocmTracerEvent_t events_ TF_GUARDED_BY(event_maps_mutex_);
absl::flat_hash_map<uint32_t, PerDeviceCollector> per_device_collector_;

};

void RocmTraceCollectorImpl::AddEvent(RocmTracerEvent& event) {
Expand All @@ -466,13 +465,18 @@ void RocmTraceCollectorImpl::Flush() {
auto device_id = event.device_id;
per_device_collector_[device_id].AddEvent(std::move(event));
}
LOG(ERROR) << "Complete RocmTraceCollectorImpl::Flush";
events_.clear();
LOG(ERROR) << "Complete RocmTraceCollectorImpl events_.clear";
}

void RocmTraceCollectorImpl::Export(XSpace* space) {
uint64_t end_gputime_ns = get_timestamp();
LOG(ERROR) << "Starting RocmTraceCollectorImpl::Export";
LOG(ERROR) << "test space = " << space;
XPlaneBuilder host_plane(FindOrAddMutablePlaneWithName(
space, tsl::profiler::kRoctracerApiPlaneName));
LOG(ERROR) << "Starting RocmTraceCollectorImpl, host_plane";

for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) {
std::string name = GpuPlaneName(device_ordinal);
Expand Down
156 changes: 76 additions & 80 deletions xla/backends/profiler/gpu/rocm_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,6 @@ limitations under the License.
#include <thread>
#include <unordered_set>
#include <vector>
// #include <execution>
using tsl::profiler::XSpace;
XSpace* space;

extern "C" rocprofiler_tool_configure_result_t* rocprofiler_configure(
uint32_t version, const char* runtime_version, uint32_t priority,
Expand Down Expand Up @@ -169,6 +166,8 @@ tool_tracing_callback(rocprofiler_context_id_t context,
assert(user_data != nullptr);
assert(drop_count == 0 && "drop count should be zero for lossless policy");

auto rocmtracer_singleton = xla::profiler::RocmTracer::GetRocmTracerSingleton();

/*
if(num_headers == 0)
throw std::runtime_error{
Expand All @@ -178,6 +177,8 @@ tool_tracing_callback(rocprofiler_context_id_t context,
"array of headers. this should never happen"};
*/

// auto rocm_trace_collector_ = reinterpret_cast<RocmTraceCollector*>(tool_data);

LOG(INFO) << "Number of heads = " << num_headers;
LOG(INFO) << "Tracing category = " << ROCPROFILER_BUFFER_CATEGORY_TRACING;
for(size_t i = 0; i < num_headers; ++i)
Expand Down Expand Up @@ -241,17 +242,18 @@ tool_tracing_callback(rocprofiler_context_id_t context,
// throw std::runtime_error{msg.str()};
}

/*
auto tmp_str = client_name_info[record->kind][record->operation].data();

static_cast<RocmTracerEvent_t*>(user_data)->emplace_back(
RocmTracerEvent{RocmTracerEventType::HIP_RUNTIME_API,
tmp_str,
record->start_timestamp,
record->end_timestamp,
0, // how to access device id,
record->correlation_id.internal,
record->thread_id,
0});
auto tmp = RocmTracerEvent{RocmTracerEventType::HIP_RUNTIME_API,
tmp_str,
record->start_timestamp,
record->end_timestamp,
0, // how to access device id,
record->correlation_id.internal,
record->thread_id,
0};
rocmtracer_singleton->get_collector()->AddEvent(tmp);
*/
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH)
Expand Down Expand Up @@ -296,16 +298,22 @@ tool_tracing_callback(rocprofiler_context_id_t context,
if(record->start_timestamp > record->end_timestamp)
printf("kernel dispatch: start > end");
// throw std::runtime_error("kernel dispatch: start > end");
LOG(ERROR) << "CJ kernel dispatch: " << info.str();

static_cast<RocmTracerEvent_t*>(user_data)->emplace_back(
RocmTracerEvent{RocmTracerEventType::KERNEL_DISPATCH,
auto tmp = RocmTracerEvent{RocmTracerEventType::KERNEL_DISPATCH,
client_kernels.at(record->dispatch_info.kernel_id).kernel_name,
record->start_timestamp,
record->end_timestamp,
0, // how to access device id,
record->correlation_id.internal,
record->thread_id,
0});
0};

LOG(ERROR) << "CJ after tmp : " << info.str();
LOG(ERROR) << "CJ number of GPU = " << rocmtracer_singleton->NumGpus();
LOG(ERROR) << "cj collector = " << rocmtracer_singleton->get_collector();

rocmtracer_singleton->get_collector()->AddEvent(tmp);
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)
Expand All @@ -329,16 +337,17 @@ tool_tracing_callback(rocprofiler_context_id_t context,
if(record->start_timestamp > record->end_timestamp)
printf("memory copy: start > end \n");
// throw std::runtime_error("memory copy: start > end");

static_cast<RocmTracerEvent_t*>(user_data)->emplace_back(
RocmTracerEvent{RocmTracerEventType::MEMORY_COPY,
/*
auto tmp = RocmTracerEvent{RocmTracerEventType::MEMORY_COPY,
client_name_info[record->kind][record->operation].data(),
record->start_timestamp,
record->end_timestamp,
0, // how to access device id,
record->correlation_id.internal,
record->thread_id,
0});
0};
rocmtracer_singleton->get_collector()->AddEvent(tmp);
*/
}
else
{
Expand Down Expand Up @@ -415,78 +424,31 @@ int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
return -1;
}

/*
auto rocm_trace_collector_ = reinterpret_cast<RocmTraceCollector*>(tool_data);
RocmTraceCollectorOptions trace_collector_options = GetRocmTraceCollectorOptions(NumGpus());
uint64_t start_gputime_ns = GetTimestamp();
uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();
*rocm_trace_collector_ = CreateRocmCollector(trace_collector_options, start_walltime_ns, start_gputime_ns);
*/
ROCPROFILER_CALL(se::wrap::rocprofiler_start_context(client_ctx), "rocprofiler context start");

// no errors
return 0;
}

// Returns collector options for `num_gpus` devices, using fixed limits for
// callback events, activity events and annotation strings.
RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus) {
  constexpr int kAnnotationStringLimit = 1024 * 1024;
  constexpr int kApiEventLimit = 2 * 1024 * 1024;

  RocmTraceCollectorOptions result;
  result.num_gpus = num_gpus;
  result.max_annotation_strings = kAnnotationStringLimit;
  result.max_activity_api_events = kApiEventLimit;
  result.max_callback_api_events = kApiEventLimit;
  return result;
}

int NumGpus() {
static int num_gpus = []() -> int {
if (hipInit(0) != hipSuccess) {
return 0;
}
int gpu_count;
if (hipGetDeviceCount(&gpu_count) != hipSuccess) {
return 0;
}
LOG(ERROR) << "Profiler found " << gpu_count << " GPUs.";
return gpu_count;
}();
return num_gpus;
}

/*static*/ uint64_t GetTimestamp() {
uint64_t ts;
rocprofiler_status_t CHECKSTATUS = se::wrap::rocprofiler_get_timestamp(&ts);
if (CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) {
const char* errstr = se::wrap::rocprofiler_get_status_string(CHECKSTATUS);
LOG(ERROR) << "function rocprofiler_get_timestamp failed with error "
<< errstr;
return 0;
}
return ts;
}

// Finalization hook. `tool_data` must be non-null (asserted) and is
// interpreted as a RocmTracerEvent_t* holding buffered events.
// NOTE(review): this span appears to show removed and added diff lines
// together: a locally created collector and the RocmTracer singleton's
// collector are both flushed. Confirm against the actual file which
// statements are current.
void tool_fini(void* tool_data){
assert(tool_data != nullptr);

RocmTraceCollectorOptions trace_collector_options = GetRocmTraceCollectorOptions(NumGpus());

uint64_t start_gputime_ns = GetTimestamp();
uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();
// Local collector; despite the trailing underscore this is not a member.
auto rocm_trace_collector_ = CreateRocmCollector(trace_collector_options, start_walltime_ns, start_gputime_ns);
LOG(ERROR) << "tool_fini interrupted ...";

auto* tmp_events = static_cast<RocmTracerEvent_t*>(tool_data);

if (tmp_events && rocm_trace_collector_) {
// NOTE(review): failed_additions is never incremented in the visible code,
// so the message below always reports 0.
size_t failed_additions = 0;
for (auto& itr : *tmp_events) {
rocm_trace_collector_->AddEvent(itr);
}
LOG(ERROR) << "Failed to add " << failed_additions << " events.";
// std::for_each(std::execution::par, tmp_events->begin(), tmp_events->end(),
// [&](const auto& itr) { rocm_trace_collector_->AddEvent(itr); });

}
// Flush the collector currently registered on the RocmTracer singleton.
auto rocmtracer_singleton = xla::profiler::RocmTracer::GetRocmTracerSingleton();
rocmtracer_singleton->get_collector()->Flush();

// NOTE(review): `space` is not declared in this function; presumably it is a
// file-scope XSpace* — verify that the declaration still exists.
rocm_trace_collector_->Flush();
rocm_trace_collector_->Export(space);
}
} // end of namespace



void RocmTracer::setup(){
if(int status = 0;
se::wrap::rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0){
Expand All @@ -510,15 +472,49 @@ void RocmTracer::stop(){
ROCPROFILER_CALL(se::wrap::rocprofiler_stop_context(client_ctx), "context stop");
}


// Returns a pointer to the RocmTracer instance kept in a function-local
// static.
// NOTE(review): two declarations named `singleton` appear below (a static
// object and a pointer to a heap-allocated instance). This looks like removed
// and added diff lines shown together — confirm which pair is current.
/* static */ RocmTracer* RocmTracer::GetRocmTracerSingleton() {
static RocmTracer singleton; // Changed to a direct static instance
return &singleton;
static auto* singleton = new RocmTracer();
return singleton;
}

// Returns the stored `is_available_` flag. Declared const, so it does not
// modify the tracer.
bool RocmTracer::IsAvailable() const {
return is_available_;
}

int RocmTracer::NumGpus() {
static int num_gpus = []() -> int {
if (hipInit(0) != hipSuccess) {
return 0;
}
int gpu_count;
if (hipGetDeviceCount(&gpu_count) != hipSuccess) {
return 0;
}
LOG(ERROR) << "Profiler found " << gpu_count << " GPUs.";
return gpu_count;
}();
return num_gpus;
}

/*static*/ uint64_t RocmTracer::GetTimestamp() {
uint64_t ts;
rocprofiler_status_t CHECKSTATUS = se::wrap::rocprofiler_get_timestamp(&ts);
if (CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) {
const char* errstr = se::wrap::rocprofiler_get_status_string(CHECKSTATUS);
LOG(ERROR) << "function rocprofiler_get_timestamp failed with error "
<< errstr;
return 0;
}
return ts;
}

// Records the session configuration on the tracer: copies `options` into
// `options_` and stores the `collector` pointer exactly as passed in.
void RocmTracer::Enable(const RocmTracerOptions& options,
                        RocmTraceCollector* collector) {
  collector_ = collector;
  options_ = options;
}


} // namespace profiler
} // namespace xla

Expand Down
Loading

0 comments on commit d4ca46c

Please sign in to comment.