forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
profiler_kineto.h
186 lines (162 loc) · 6.46 KB
/
profiler_kineto.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#pragma once
#include <string>
#include <vector>
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {
namespace profiler {
namespace impl {
struct Result;
namespace kineto {
struct ActivityTraceWrapper;
} // namespace kineto
} // namespace impl
} // namespace profiler
namespace autograd {
namespace profiler {
using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>;
struct TORCH_API KinetoEvent {
KinetoEvent(
std::shared_ptr<const torch::profiler::impl::Result>,
const bool verbose);
uint64_t startThreadId() const;
uint64_t endThreadId() const;
uint8_t activityType() const;
uint64_t fwdThreadId() const;
bool hasShapes() const;
const c10::ArrayRef<std::vector<int64_t>> shapes() const;
bool hasTypes() const;
const c10::ArrayRef<std::string> dtypes() const;
uint64_t flops() const;
int64_t sequenceNr() const;
bool hasStack() const;
const c10::ArrayRef<std::string> stack() const;
uint8_t scope() const;
bool hasModuleHierarchy() const;
const c10::ArrayRef<std::string> moduleHierarchy() const;
int64_t debugHandle() const;
std::string name() const;
c10::DeviceType deviceType() const;
uint8_t deviceIndex() const;
int64_t nBytes() const;
uint64_t startUs() const;
uint64_t durationUs() const;
bool isAsync() const;
uint64_t correlationId() const;
uint64_t linkedCorrelationId() const;
int64_t deviceResourceId() const;
std::string backend() const;
bool isPythonFunction() const;
int64_t cudaElapsedUs() const;
void getPerfEventCounters(torch::profiler::perf_counters_t&) const;
private:
torch::profiler::impl::ProfilerEventStub fallbackStart() const;
torch::profiler::impl::ProfilerEventStub fallbackEnd() const;
std::shared_ptr<const torch::profiler::impl::Result> result_;
std::vector<std::string> python_stack_;
// Copy fields from result so we can return ArrayRefs.
std::vector<std::vector<int64_t>> shapes_;
std::vector<std::string> dtypes_;
};
// Consolidating events returned directly from Kineto
// with events manually created by us (e.g. start/stop marks,
// memory allocation events)
struct TORCH_API ProfilerResult {
ProfilerResult();
ProfilerResult(
uint64_t start_time,
std::vector<KinetoEvent> events,
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
trace,
std::vector<experimental_event_t>&& event_tree);
~ProfilerResult();
uint64_t trace_start_us() const {
return trace_start_us_;
}
const std::vector<KinetoEvent>& events() const {
return events_;
}
const std::vector<experimental_event_t>& event_tree() const {
return event_tree_;
}
void save(const std::string& path);
private:
uint64_t trace_start_us_ = 0;
std::vector<KinetoEvent> events_;
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_;
std::vector<experimental_event_t> event_tree_;
};
/*
* This API is used by backends to record latency of events that
* happened in the backend but were not visible to pytorch runtime.
* For example, if part of the model is lowered to a dsp backend, then
* the execution of that part of the model is delegated to the backend.
* When backend finishes execution it has an option to provide profiling
* information (latency only at the moment) corresponding to different operators
* that were executed in the backend.
* When such events are recorded by backend using this API, the event
* records will be collected by active kineto profiler. If no kineto profiler
* is active then the event is ignored.
* This provides us with a way to generate all the profiling information
* for a model regardless of where model (or part of it) executed.
* @param start_time_us: start time in us of the event
* @param end_time_us: end time in us of the event
* @param debug_handle: debug handle to correlate this event/op with
* model level module/source information
* @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc.
* @param event_name: name of the event, e.g. op name
* @param backend_name: name of the backend where the event took place.
*/
TORCH_API void reportBackendEventToActiveKinetoProfiler(
const int64_t start_time_us,
const int64_t end_time_us,
const int64_t debug_handle,
const at::RecordScope scope,
const std::string& event_name,
const std::string& backend_name);
TORCH_API void enableProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities,
const std::unordered_set<at::RecordScope>& scopes = {});
/*
* Same as enableProfiler but with callback to do post-processing of
* KinetoEvents.
* enableProfilerWithEventPostProcess enables profiler to capture
* specified activities, with specified RecordFunction scope, if any.
* Additionally, it takes a functor that does in-place post processing of
* events, e.g. populate stack trace or module hierarchy information lazily
* using debug_handle.
* Example usage is with lite interpreter that has recording scope of
* LITE_INTERPRETER. In this case lite interpreter runtime, records debug
* handles in RecordFunction, along with other information. Debug handles are
* eventually passed down to KinetoEvent and recorded as part of the event.
* KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
* profiler using post-processing callback, via
* enableProfilerWithEventPostProcess, that takes these debug handles and
* generates stack trace and module hierarchy information, once profiling is
* done.
*/
using post_process_t = std::function<void(
/*debug_handle */ int64_t,
/*jit_stack */ std::vector<std::string>&,
/*jit_modules */ std::vector<std::string>&)>;
TORCH_API void enableProfilerWithEventPostProcess(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities,
post_process_t&& cb,
const std::unordered_set<at::RecordScope>& scopes = {});
TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
TORCH_API void prepareProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities);
} // namespace profiler
} // namespace autograd
namespace profiler {
namespace impl {
// Experimental.
TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id);
} // namespace impl
} // namespace profiler
} // namespace torch