diff --git a/imperative/python/megengine/utils/profiler.py b/imperative/python/megengine/utils/profiler.py index db3735e81cd5cc751e398278f9236a6b27b74ad2..00935adb5d2f936d0e08b78044231c7b793d39b0 100644 --- a/imperative/python/megengine/utils/profiler.py +++ b/imperative/python/megengine/utils/profiler.py @@ -18,6 +18,7 @@ from ..core._imperative_rt.core2 import ( set_python_backtrace_enabled, start_profile, stop_profile, + stop_step, sync, ) from ..logger import get_logger @@ -134,6 +135,13 @@ class Profiler(ContextDecorator): _living_profilers.add(self) set_python_backtrace_enabled(self._origin_enable_bt) + def step(self): + global _running_profiler + + assert _running_profiler is not None + stop_step() + return self + def dump(self): if self._dump_callback is not None: if not os.path.exists(self._path): diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp index 97785fec36070873e65e6d1530bc515f1c0c5e4a..36c2e9eb06dcaf8c1719e8eb75cc7771a3edefdf 100644 --- a/imperative/python/src/tensor.cpp +++ b/imperative/python/src/tensor.cpp @@ -966,6 +966,10 @@ void init_tensor(py::module m) { results = nullptr; }; }); + m.def("stop_step", [channel]() { + imperative::Profiler::stop_step(); + channel->stop_step(); + }); m.def("enable_cupti", &cupti::enable); m.def("disable_cupti", &cupti::disable); m.def("cupti_available", &cupti::available); diff --git a/imperative/src/impl/interpreter/commands.h b/imperative/src/impl/interpreter/commands.h index b3ce387ce5337b0b20c462528cc0d8ab5c92ddf6..f7729a42bb19c0b56b7d88f2d935a3ec74810759 100644 --- a/imperative/src/impl/interpreter/commands.h +++ b/imperative/src/impl/interpreter/commands.h @@ -116,6 +116,13 @@ struct StopProfile { const char* get_name() const { return "StopProfile"; } }; +struct StopStep { + template + void get_props(TFunctor&& functor) const {} + + const char* get_name() const { return "StopStep"; } +}; + struct PushScope { std::string scope_name; @@ -162,7 +169,7 @@ struct StopRegen { using CommandData = std::variant< Put, ApplyOp, Del, GetValue, Drop, SetOption, StartProfile, StopProfile, - PushScope, PopScope, StartRegen, StopRegen>; + StopStep, PushScope, PopScope, StartRegen, StopRegen>; struct Command { uint64_t id; diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp index 458ccfbfcae2b4728fbafadaca3af3d42696c686..f376cd98641e2031ef53c97346d56630fb68ca92 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.cpp +++ b/imperative/src/impl/interpreter/interpreter_impl.cpp @@ -1339,6 +1339,8 @@ void ChannelImpl::process_one_task(Command& icmd) { CompNode::foreach ( [&](CompNode device) { sample_on_device(device, true); }); MGB_RECORD_EVENT(StopProfileFinishEvent); + } else if constexpr (std::is_same_v) { + MGB_RECORD_EVENT(StopStepEvent); } else if constexpr (std::is_same_v) { MGB_RECORD_EVENT(ScopeEvent, cmd.scope_name); } else if constexpr (std::is_same_v) { @@ -1436,6 +1438,15 @@ void ChannelImpl::stop_profile() { } } +void ChannelImpl::stop_step() { + MGB_LOCK_GUARD(m_spin); + assert_available(); + mgb_assert(Profiler::is_profiling() == true, "Profiler isn't profiling!"); + m_worker.add_task( + {Profiler::next_id(), StopStep{}, + get_channel_state().stack_manager.dump()}); +} + void ChannelImpl::push_scope(std::string name) { MGB_LOCK_GUARD(m_spin); assert_available(); diff --git a/imperative/src/impl/interpreter/interpreter_impl.h b/imperative/src/impl/interpreter/interpreter_impl.h index 1379fa65fa492e143f68450b84b42be749fb6cca..68f0088a09dcd913f6fd041eb44ee6237e9b4f0c 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.h +++ b/imperative/src/impl/interpreter/interpreter_impl.h @@ -63,6 +63,7 @@ struct ChannelImpl : Interpreter::Channel, NonCopyableObj, NonMoveableObj { void start_profile() override; void stop_profile() override; + void stop_step() override; void push_scope(std::string) override; void pop_scope(std::string) override; diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp index a1850e62e33fa7e98a7153e1fc1a0c55710b0b5f..e531c5832b6c8241b96604b21da3dbe8e6debc83 100644 --- a/imperative/src/impl/profiler.cpp +++ b/imperative/src/impl/profiler.cpp @@ -14,6 +14,7 @@ #include "./op_trait.h" #include "./profiler/formats.h" +#include "./profiler/states.h" namespace mgb { namespace imperative { @@ -54,6 +55,11 @@ void Profiler::stop_profile() { sm_profiling = false; } +void Profiler::stop_step() { + mgb_assert(sm_profiling); + MGB_RECORD_EVENT(profiler::StopStepEvent); +} + auto Profiler::get_thread_dict() -> thread_dict_t { thread_dict_t thread_dict; for (auto&& [tid, profiler] : sm_profilers) { diff --git a/imperative/src/impl/profiler/chrome_timeline.cpp b/imperative/src/impl/profiler/chrome_timeline.cpp index c646facb922695183d072f96902e6073ab56a1cc..de0cb25ffc9cac57ab9ecd48b3ac2a266a6ce3bd 100644 --- a/imperative/src/impl/profiler/chrome_timeline.cpp +++ b/imperative/src/impl/profiler/chrome_timeline.cpp @@ -151,6 +151,12 @@ public: return m_content.back(); } + ChromeTraceEvent& new_event_front() { + ChromeTraceEvent new_event; + m_content.insert(m_content.begin(), new_event); + return m_content.front(); + } + std::string& metadata(std::string key) { return m_metadata[key]; } nlohmann::json to_json() const { @@ -196,6 +202,12 @@ struct ChromeTimelineEventVisitor : EventVisitor { since_start(time)); } + ChromeTraceEvent& new_event_front( + std::string name, char ph, size_t tid, profiler::HostTime time) { + return trace_events.new_event_front().name(name).ph(ph).pid(pid).tid(tid).ts( + since_start(time)); + } + ChromeTraceEvent& new_host_event(std::string name, char ph) { return trace_events.new_event() .name(name) @@ -205,6 +217,17 @@ struct ChromeTimelineEventVisitor : EventVisitor { .ts(since_start(current->time)); } + ChromeTraceEvent& new_host_event( + std::string name, char ph, int tid, profiler::Duration time) { + return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(time); + } + + ChromeTraceEvent& new_host_event_front( + std::string name, char ph, int tid, profiler::Duration time) { + return trace_events.new_event_front().name(name).ph(ph).pid(pid).tid(tid).ts( + time); + } + ChromeTraceEvent& new_cupti_event( std::string name, char ph, cupti::stream_t stream, cupti::time_point timestamp) { @@ -363,6 +386,8 @@ struct ChromeTimelineEventVisitor : EventVisitor { new_host_event("StopProfile", 'B'); } else if constexpr (std::is_same_v) { new_host_event("StopProfile", 'E'); + } else if constexpr (std::is_same_v) { + new_host_event("StopStep", 'i'); } else if constexpr (std::is_same_v) { new_host_event(event.title, 'B'); if (event.device.valid()) { @@ -498,6 +523,23 @@ struct ChromeTimelineEventVisitor : EventVisitor { void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) { ChromeTimelineEventVisitor visitor{}; visitor.process_events(result); + + auto gpu_usage_ratio = (double)(std::chrono::duration_cast(visitor.gpu_usage_time()).count() * 100) / + (double)(visitor.get_total_train_time().count()); + visitor.new_host_event("gpu_usgae_ratio", 'C', 0, visitor.last_kernel_finish()) + .arg("value", gpu_usage_ratio); + visitor.new_host_event_front( + "gpu_usgae_ratio", 'C', 0, std::chrono::microseconds(0)) + .arg("value", gpu_usage_ratio); + + auto train_time_ratio = (double)(std::chrono::duration_cast(visitor.gpu_usage_time_with_gap()).count() * 100) / + (double)(visitor.get_total_train_time().count()); + visitor.new_host_event("train_time_ratio", 'C', 0, visitor.last_kernel_finish()) + .arg("value", train_time_ratio); + visitor.new_host_event_front( + "train_time_ratio", 'C', 0, std::chrono::microseconds(0)) + .arg("value", train_time_ratio); + visitor.name_threads(result.thread_dict); auto trace_events = std::move(visitor.trace_events); trace_events.metadata("localTime") = diff --git a/imperative/src/impl/profiler/events.h b/imperative/src/impl/profiler/events.h index e265eef997f8a80c523a8ae7fcd2dab131918c8b..a8e4d511ecd355dfc2f386352ebf7da026eb6b1f 100644 --- a/imperative/src/impl/profiler/events.h +++ b/imperative/src/impl/profiler/events.h @@ -172,6 +172,8 @@ DEF_DUR_EVENT(HostToDevice, { void* device_ptr; }); +DEF_EVENT(StopStep, { CompNode device; }); + // cupti events DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; }); diff --git a/imperative/src/impl/profiler/states.h b/imperative/src/impl/profiler/states.h index 33cebd070835635c5f98eac3bb5539456c7f0f33..1f5e5bfb26c7f87c55270a0fecda4276afc819b1 100644 --- a/imperative/src/impl/profiler/states.h +++ b/imperative/src/impl/profiler/states.h @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + #include #include #include @@ -187,6 +191,18 @@ private: std::unordered_map m_counter_table; std::optional> m_cupti_timestamp = {}; + // Record the start and end time of all kernels + std::vector> m_kernel_start_finish_time; + // The sum of all kernel execution times + profiler::Duration m_gpu_usage_time = std::chrono::microseconds(0); + // The sum of all kernel execution times, including the gap time between kernels + // within a step + profiler::Duration m_gpu_usage_time_with_gap = std::chrono::microseconds(0); + // Record the end time of each step + std::vector m_step_finish_time; + // Record the start time of the first kernel and the end time of the last kernel in + // each step + std::vector> m_step_first_last_kernel; protected: Profiler::Record* current; @@ -288,9 +304,9 @@ public: SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, - TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, - AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, - ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, + StopStepEvent, TensorCommandEvent, TensorCommandFinishEvent, + AutoEvictEvent, AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, + RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent, CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent, CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent, @@ -335,6 +351,15 @@ public: } }); + // record step end time + for_each_entry([&](auto&& event) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + auto step_time = current->time; + m_step_finish_time.push_back(to_device_time(step_time, event.device)); + } + }); + // register host threads for_each_entry([&](auto&& event) { if (!m_host_tid_table.count(current->tid)) { @@ -443,9 +468,17 @@ public: inc_counter("nr_exception", 1); } else if constexpr (std::is_same_v) { auto& execution = current_op->executions.back(); + auto overhead = to_device_time(current->time, event.device) - + to_device_time(execution.begin, event.device); + + std::vector current_kernel_start_finish; + current_kernel_start_finish.emplace_back( + to_device_time(execution.begin, event.device)); + current_kernel_start_finish.emplace_back( + to_device_time(current->time, event.device)); + m_kernel_start_finish_time.emplace_back(current_kernel_start_finish); + if (execution.reason == "dtr") { - auto overhead = to_device_time(current->time, event.device) - - to_device_time(execution.begin, event.device); inc_counter( "dtr_overhead_us", std::chrono::duration_cast( @@ -463,6 +496,109 @@ public: } }); } + + profiler::Duration last_kernel_finish() { + if (m_kernel_start_finish_time.size() == 0) { + return profiler::Duration::zero(); + } + return m_kernel_start_finish_time.back()[1] - m_start_time; + } + + // get GPU busy time (union of calculation time and communication time) + profiler::Duration gpu_usage_time() { + if (m_kernel_start_finish_time.size() == 0) { + return profiler::Duration::zero(); + } + + std::sort( + m_kernel_start_finish_time.begin(), m_kernel_start_finish_time.end(), + [&](std::vector kernel1, + std::vector kernel2) { + if (kernel1[0] != kernel2[0]) { + return kernel1[0] < kernel2[0]; + } + return kernel1[1] < kernel2[1]; + }); + + HostTime current_start = profiler::HostTime::min(); + HostTime current_end = profiler::HostTime::min(); + for (size_t i = 0; i < m_kernel_start_finish_time.size(); ++i) { + if (current_start == profiler::HostTime::min()) { + current_start = m_kernel_start_finish_time[i][0]; + current_end = m_kernel_start_finish_time[i][1]; + } else if (current_end < m_kernel_start_finish_time[i][0]) { + m_gpu_usage_time += profiler::Duration(current_end - current_start); + current_start = m_kernel_start_finish_time[i][0]; + current_end = m_kernel_start_finish_time[i][1]; + } else if (current_end > m_kernel_start_finish_time[i][0]) { + current_end = max(current_end, m_kernel_start_finish_time[i][1]); + } + } + m_gpu_usage_time += profiler::Duration(current_end - current_start); + + return m_gpu_usage_time; + } + + // compared to gpu_usage_time, this method adds gap time between kernels in the same + // step + profiler::Duration gpu_usage_time_with_gap() { + if (m_step_finish_time.empty()) { + return profiler::Duration::zero(); + } + + std::sort( + m_step_finish_time.begin(), m_step_finish_time.end(), + [&](HostTime time1, HostTime time2) { return time1 < time2; }); + + std::sort( + m_kernel_start_finish_time.begin(), m_kernel_start_finish_time.end(), + [&](std::vector kernel1, + std::vector kernel2) { + if (kernel1[0] != kernel2[0]) { + return kernel1[0] < kernel2[0]; + } + return kernel1[1] < kernel2[1]; + }); + + int cur_step = 0; + auto kernel_num = m_kernel_start_finish_time.size(); + for (size_t i = 0; i < kernel_num; ++i) { + // Record the start time of the first kernel and the end time of the last + // kernel of the current step + std::vector step_begin_end_time; + step_begin_end_time.emplace_back(m_kernel_start_finish_time[i][0]); + size_t j = i; + while (j < kernel_num && m_kernel_start_finish_time[j][0] < + m_step_finish_time[cur_step + 1]) { + ++j; + } + step_begin_end_time.emplace_back(m_kernel_start_finish_time[j - 1][1]); + mgb_assert( + step_begin_end_time.size() == 2, + "step_begin_end_time.size() should be exactly 2!"); + m_step_first_last_kernel.emplace_back(step_begin_end_time); + i = j - 1; + ++cur_step; + } + + for (size_t i = 0; i < m_step_first_last_kernel.size(); ++i) { + m_gpu_usage_time_with_gap += profiler::Duration( + m_step_first_last_kernel[i][1] - m_step_first_last_kernel[i][0]); + } + + return m_gpu_usage_time_with_gap; + } + + // from the start time of the first kernel of the first step to the end time of the + // last kernel of the last step + std::chrono::microseconds get_total_train_time() { + if (m_kernel_start_finish_time.size() == 0) { + return std::chrono::microseconds(1); + } + return std::chrono::duration_cast( + m_kernel_start_finish_time.back()[1] - + m_kernel_start_finish_time.front()[0]); + } }; } // namespace mgb::imperative::profiler diff --git a/imperative/src/include/megbrain/imperative/interpreter.h b/imperative/src/include/megbrain/imperative/interpreter.h index 28f410ccd88982b5179f2116312dcaccd30e943a..d2a0a503a5e6dce555d665ba68123896987b1370 100644 --- a/imperative/src/include/megbrain/imperative/interpreter.h +++ b/imperative/src/include/megbrain/imperative/interpreter.h @@ -61,6 +61,7 @@ struct Interpreter { virtual void start_profile() = 0; virtual void stop_profile() = 0; + virtual void stop_step() = 0; virtual void push_scope(std::string name) = 0; virtual void pop_scope(std::string name) = 0; diff --git a/imperative/src/include/megbrain/imperative/profiler.h b/imperative/src/include/megbrain/imperative/profiler.h index 04f104301696b43ae6dfb4653978fb507491d3bf..f51b7adc8e45fa58ea649e6f7ffc18bdefda2a87 100644 --- a/imperative/src/include/megbrain/imperative/profiler.h +++ b/imperative/src/include/megbrain/imperative/profiler.h @@ -37,6 +37,7 @@ class Timer { public: using Time = profiler::Time; static profiler::Time record_host(); + static profiler::Time record_device_time(CompNode device); static std::shared_ptr record_device(CompNode device); }; @@ -77,6 +78,7 @@ private: std::atomic m_status = Running; static std::vector sm_records; + static std::vector sm_step_time; static options_t sm_profile_options; static std::mutex sm_mutex; // assume std::thread::id is unique @@ -187,6 +189,8 @@ public: static void stop_profile(); + static void stop_step(); + static thread_dict_t get_thread_dict(); static void dump_profile(std::string basename, std::string format, bundle_t result);