提交 d2bbbb30 编写于 作者: M Megvii Engine Team

feat(mge/profiler): add overall model performance metrics

GitOrigin-RevId: 1394dedc265f898a79cca236475eb7d680c58950
上级 61f3b2e6
......@@ -18,6 +18,7 @@ from ..core._imperative_rt.core2 import (
set_python_backtrace_enabled,
start_profile,
stop_profile,
stop_step,
sync,
)
from ..logger import get_logger
......@@ -134,6 +135,13 @@ class Profiler(ContextDecorator):
_living_profilers.add(self)
set_python_backtrace_enabled(self._origin_enable_bt)
    def step(self):
        """Mark the end of one training step while profiling is active.

        Emits a StopStep marker into the profile stream (via the C++
        ``stop_step`` binding) so per-step metrics such as GPU usage ratio
        can be computed when the profile is dumped.

        Returns:
            the profiler itself, so calls can be chained.
        """
        # `global` is only needed for the read of the module-level state;
        # stepping outside an active profile is a programming error.
        global _running_profiler
        assert _running_profiler is not None
        stop_step()
        return self
def dump(self):
if self._dump_callback is not None:
if not os.path.exists(self._path):
......
......@@ -966,6 +966,10 @@ void init_tensor(py::module m) {
results = nullptr;
};
});
m.def("stop_step", [channel]() {
imperative::Profiler::stop_step();
channel->stop_step();
});
m.def("enable_cupti", &cupti::enable);
m.def("disable_cupti", &cupti::disable);
m.def("cupti_available", &cupti::available);
......
......@@ -116,6 +116,13 @@ struct StopProfile {
const char* get_name() const { return "StopProfile"; }
};
struct StopStep {
    // Marker command: tells the worker thread that one training step has
    // finished, so a StopStepEvent can be recorded in command order.
    const char* get_name() const { return "StopStep"; }
    // No properties to expose; the functor is intentionally never invoked.
    template <typename TFunctor>
    void get_props(TFunctor&&) const {}
};
struct PushScope {
std::string scope_name;
......@@ -162,7 +169,7 @@ struct StopRegen {
using CommandData = std::variant<
Put, ApplyOp, Del, GetValue, Drop, SetOption, StartProfile, StopProfile,
PushScope, PopScope, StartRegen, StopRegen>;
StopStep, PushScope, PopScope, StartRegen, StopRegen>;
struct Command {
uint64_t id;
......
......@@ -1339,6 +1339,8 @@ void ChannelImpl::process_one_task(Command& icmd) {
CompNode::foreach (
[&](CompNode device) { sample_on_device(device, true); });
MGB_RECORD_EVENT(StopProfileFinishEvent);
} else if constexpr (std::is_same_v<T, StopStep>) {
MGB_RECORD_EVENT(StopStepEvent);
} else if constexpr (std::is_same_v<T, PushScope>) {
MGB_RECORD_EVENT(ScopeEvent, cmd.scope_name);
} else if constexpr (std::is_same_v<T, PopScope>) {
......@@ -1436,6 +1438,15 @@ void ChannelImpl::stop_profile() {
}
}
void ChannelImpl::stop_step() {
    // Enqueue a StopStep marker on the worker queue so the step boundary is
    // recorded in order with the other interpreter commands. Only legal while
    // the profiler is running.
    MGB_LOCK_GUARD(m_spin);
    assert_available();
    // NOTE: `== true` dropped — mgb_assert on the bool directly.
    mgb_assert(Profiler::is_profiling(), "Profiler isn't profiling!");
    m_worker.add_task(
            {Profiler::next_id(), StopStep{},
             get_channel_state().stack_manager.dump()});
}
void ChannelImpl::push_scope(std::string name) {
MGB_LOCK_GUARD(m_spin);
assert_available();
......
......@@ -63,6 +63,7 @@ struct ChannelImpl : Interpreter::Channel, NonCopyableObj, NonMoveableObj {
void start_profile() override;
void stop_profile() override;
void stop_step() override;
void push_scope(std::string) override;
void pop_scope(std::string) override;
......
......@@ -14,6 +14,7 @@
#include "./op_trait.h"
#include "./profiler/formats.h"
#include "./profiler/states.h"
namespace mgb {
namespace imperative {
......@@ -54,6 +55,11 @@ void Profiler::stop_profile() {
sm_profiling = false;
}
void Profiler::stop_step() {
    // Record a step-boundary event on the calling thread's profile stream.
    // Must only be called while profiling is active (sm_profiling set by
    // start_profile and cleared by stop_profile).
    mgb_assert(sm_profiling);
    MGB_RECORD_EVENT(profiler::StopStepEvent);
}
auto Profiler::get_thread_dict() -> thread_dict_t {
thread_dict_t thread_dict;
for (auto&& [tid, profiler] : sm_profilers) {
......
......@@ -151,6 +151,12 @@ public:
return m_content.back();
}
ChromeTraceEvent& new_event_front() {
ChromeTraceEvent new_event;
m_content.insert(m_content.begin(), new_event);
return m_content.front();
}
std::string& metadata(std::string key) { return m_metadata[key]; }
nlohmann::json to_json() const {
......@@ -196,6 +202,12 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
since_start(time));
}
ChromeTraceEvent& new_event_front(
std::string name, char ph, size_t tid, profiler::HostTime time) {
return trace_events.new_event_front().name(name).ph(ph).pid(pid).tid(tid).ts(
since_start(time));
}
ChromeTraceEvent& new_host_event(std::string name, char ph) {
return trace_events.new_event()
.name(name)
......@@ -205,6 +217,17 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
.ts(since_start(current->time));
}
ChromeTraceEvent& new_host_event(
std::string name, char ph, int tid, profiler::Duration time) {
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(time);
}
ChromeTraceEvent& new_host_event_front(
std::string name, char ph, int tid, profiler::Duration time) {
return trace_events.new_event_front().name(name).ph(ph).pid(pid).tid(tid).ts(
time);
}
ChromeTraceEvent& new_cupti_event(
std::string name, char ph, cupti::stream_t stream,
cupti::time_point timestamp) {
......@@ -363,6 +386,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
new_host_event("StopProfile", 'B');
} else if constexpr (std::is_same_v<TEvent, StopProfileFinishEvent>) {
new_host_event("StopProfile", 'E');
} else if constexpr (std::is_same_v<TEvent, StopStepEvent>) {
new_host_event("StopStep", 'i');
} else if constexpr (std::is_same_v<TEvent, CustomEvent>) {
new_host_event(event.title, 'B');
if (event.device.valid()) {
......@@ -498,6 +523,23 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) {
ChromeTimelineEventVisitor visitor{};
visitor.process_events(result);
auto gpu_usage_ratio = (double)(std::chrono::duration_cast<std::chrono::microseconds>(visitor.gpu_usage_time()).count() * 100) /
(double)(visitor.get_total_train_time().count());
visitor.new_host_event("gpu_usgae_ratio", 'C', 0, visitor.last_kernel_finish())
.arg("value", gpu_usage_ratio);
visitor.new_host_event_front(
"gpu_usgae_ratio", 'C', 0, std::chrono::microseconds(0))
.arg("value", gpu_usage_ratio);
auto train_time_ratio = (double)(std::chrono::duration_cast<std::chrono::microseconds>(visitor.gpu_usage_time_with_gap()).count() * 100) /
(double)(visitor.get_total_train_time().count());
visitor.new_host_event("train_time_ratio", 'C', 0, visitor.last_kernel_finish())
.arg("value", train_time_ratio);
visitor.new_host_event_front(
"train_time_ratio", 'C', 0, std::chrono::microseconds(0))
.arg("value", train_time_ratio);
visitor.name_threads(result.thread_dict);
auto trace_events = std::move(visitor.trace_events);
trace_events.metadata("localTime") =
......
......@@ -172,6 +172,8 @@ DEF_DUR_EVENT(HostToDevice, {
void* device_ptr;
});
DEF_EVENT(StopStep, { CompNode device; });
// cupti events
DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; });
......
#pragma once
#include <algorithm>
#include <chrono>
#include <ctime>
#include <any>
#include <set>
#include <sstream>
......@@ -187,6 +191,18 @@ private:
std::unordered_map<std::string, int64_t> m_counter_table;
std::optional<std::pair<profiler::HostTime, cupti::time_point>> m_cupti_timestamp =
{};
// Record the start and end time of all kernels
std::vector<std::vector<profiler::HostTime>> m_kernel_start_finish_time;
// The sum of all kernel execution times
profiler::Duration m_gpu_usage_time = std::chrono::microseconds(0);
// The sum of all kernel execution times, including the gap time between kernels
// within a step
profiler::Duration m_gpu_usage_time_with_gap = std::chrono::microseconds(0);
// Record the end time of each step
std::vector<profiler::HostTime> m_step_finish_time;
// Record the start time of the first kernel and the end time of the last kernel in
// each step
std::vector<std::vector<profiler::HostTime>> m_step_first_last_kernel;
protected:
Profiler::Record* current;
......@@ -288,9 +304,9 @@ public:
SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent,
ShapeInferEvent, SyncEvent, SyncFinishEvent, StartProfileEvent,
StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent,
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent,
AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent,
ScopeEvent, ScopeFinishEvent, HostToDeviceEvent,
StopStepEvent, TensorCommandEvent, TensorCommandFinishEvent,
AutoEvictEvent, AutoEvictFinishEvent, CustomEvent, CustomFinishEvent,
RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, HostToDeviceEvent,
HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent,
CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent,
CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent,
......@@ -335,6 +351,15 @@ public:
}
});
// record step end time
for_each_entry([&](auto&& event) {
using T = std::decay_t<decltype(event)>;
if constexpr (std::is_same_v<T, StopStepEvent>) {
auto step_time = current->time;
m_step_finish_time.push_back(to_device_time(step_time, event.device));
}
});
// register host threads
for_each_entry([&](auto&& event) {
if (!m_host_tid_table.count(current->tid)) {
......@@ -443,9 +468,17 @@ public:
inc_counter("nr_exception", 1);
} else if constexpr (std::is_same_v<T, KernelLaunchFinishEvent>) {
auto& execution = current_op->executions.back();
auto overhead = to_device_time(current->time, event.device) -
to_device_time(execution.begin, event.device);
std::vector<profiler::HostTime> current_kernel_start_finish;
current_kernel_start_finish.emplace_back(
to_device_time(execution.begin, event.device));
current_kernel_start_finish.emplace_back(
to_device_time(current->time, event.device));
m_kernel_start_finish_time.emplace_back(current_kernel_start_finish);
if (execution.reason == "dtr") {
auto overhead = to_device_time(current->time, event.device) -
to_device_time(execution.begin, event.device);
inc_counter(
"dtr_overhead_us",
std::chrono::duration_cast<std::chrono::microseconds>(
......@@ -463,6 +496,109 @@ public:
}
});
}
profiler::Duration last_kernel_finish() {
if (m_kernel_start_finish_time.size() == 0) {
return profiler::Duration::zero();
}
return m_kernel_start_finish_time.back()[1] - m_start_time;
}
// get GPU busy time (union of calculation time and communication time)
profiler::Duration gpu_usage_time() {
if (m_kernel_start_finish_time.size() == 0) {
return profiler::Duration::zero();
}
std::sort(
m_kernel_start_finish_time.begin(), m_kernel_start_finish_time.end(),
[&](std::vector<profiler::HostTime> kernel1,
std::vector<profiler::HostTime> kernel2) {
if (kernel1[0] != kernel2[0]) {
return kernel1[0] < kernel2[0];
}
return kernel1[1] < kernel2[1];
});
HostTime current_start = profiler::HostTime::min();
HostTime current_end = profiler::HostTime::min();
for (size_t i = 0; i < m_kernel_start_finish_time.size(); ++i) {
if (current_start == profiler::HostTime::min()) {
current_start = m_kernel_start_finish_time[i][0];
current_end = m_kernel_start_finish_time[i][1];
} else if (current_end < m_kernel_start_finish_time[i][0]) {
m_gpu_usage_time += profiler::Duration(current_end - current_start);
current_start = m_kernel_start_finish_time[i][0];
current_end = m_kernel_start_finish_time[i][1];
} else if (current_end > m_kernel_start_finish_time[i][0]) {
current_end = max(current_end, m_kernel_start_finish_time[i][1]);
}
}
m_gpu_usage_time += profiler::Duration(current_end - current_start);
return m_gpu_usage_time;
}
// compared to gpu_usage_time, this method adds gap time between kernels in the same
// step
profiler::Duration gpu_usage_time_with_gap() {
if (m_step_finish_time.empty()) {
return profiler::Duration::zero();
}
std::sort(
m_step_finish_time.begin(), m_step_finish_time.end(),
[&](HostTime time1, HostTime time2) { return time1 < time2; });
std::sort(
m_kernel_start_finish_time.begin(), m_kernel_start_finish_time.end(),
[&](std::vector<profiler::HostTime> kernel1,
std::vector<profiler::HostTime> kernel2) {
if (kernel1[0] != kernel2[0]) {
return kernel1[0] < kernel2[0];
}
return kernel1[1] < kernel2[1];
});
int cur_step = 0;
auto kernel_num = m_kernel_start_finish_time.size();
for (size_t i = 0; i < kernel_num; ++i) {
// Record the start time of the first kernel and the end time of the last
// kernel of the current step
std::vector<profiler::HostTime> step_begin_end_time;
step_begin_end_time.emplace_back(m_kernel_start_finish_time[i][0]);
size_t j = i;
while (j < kernel_num && m_kernel_start_finish_time[j][0] <
m_step_finish_time[cur_step + 1]) {
++j;
}
step_begin_end_time.emplace_back(m_kernel_start_finish_time[j - 1][1]);
mgb_assert(
step_begin_end_time.size() == 2,
"step_begin_end_time.size() should be exactly 2!");
m_step_first_last_kernel.emplace_back(step_begin_end_time);
i = j - 1;
++cur_step;
}
for (size_t i = 0; i < m_step_first_last_kernel.size(); ++i) {
m_gpu_usage_time_with_gap += profiler::Duration(
m_step_first_last_kernel[i][1] - m_step_first_last_kernel[i][0]);
}
return m_gpu_usage_time_with_gap;
}
// from the start time of the first kernel of the first step to the end time of the
// last kernel of the last step
std::chrono::microseconds get_total_train_time() {
if (m_kernel_start_finish_time.size() == 0) {
return std::chrono::microseconds(1);
}
return std::chrono::duration_cast<std::chrono::microseconds>(
m_kernel_start_finish_time.back()[1] -
m_kernel_start_finish_time.front()[0]);
}
};
} // namespace mgb::imperative::profiler
......@@ -61,6 +61,7 @@ struct Interpreter {
virtual void start_profile() = 0;
virtual void stop_profile() = 0;
virtual void stop_step() = 0;
virtual void push_scope(std::string name) = 0;
virtual void pop_scope(std::string name) = 0;
......
......@@ -37,6 +37,7 @@ class Timer {
public:
using Time = profiler::Time;
static profiler::Time record_host();
static profiler::Time record_device_time(CompNode device);
static std::shared_ptr<CompNode::Event> record_device(CompNode device);
};
......@@ -77,6 +78,7 @@ private:
std::atomic<Status> m_status = Running;
static std::vector<entry_t> sm_records;
static std::vector<profiler::HostTime> sm_step_time;
static options_t sm_profile_options;
static std::mutex sm_mutex;
// assume std::thread::id is unique
......@@ -187,6 +189,8 @@ public:
static void stop_profile();
static void stop_step();
static thread_dict_t get_thread_dict();
static void dump_profile(std::string basename, std::string format, bundle_t result);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册