diff --git a/imperative/python/megengine/utils/profiler.py b/imperative/python/megengine/utils/profiler.py
index db3735e81cd5cc751e398278f9236a6b27b74ad2..00935adb5d2f936d0e08b78044231c7b793d39b0 100644
--- a/imperative/python/megengine/utils/profiler.py
+++ b/imperative/python/megengine/utils/profiler.py
@@ -18,6 +18,7 @@ from ..core._imperative_rt.core2 import (
     set_python_backtrace_enabled,
     start_profile,
     stop_profile,
+    stop_step,
     sync,
 )
 from ..logger import get_logger
@@ -134,6 +135,13 @@ class Profiler(ContextDecorator):
         _living_profilers.add(self)
         set_python_backtrace_enabled(self._origin_enable_bt)
 
+    def step(self):
+        global _running_profiler
+
+        assert _running_profiler is not None
+        stop_step()
+        return self
+
     def dump(self):
         if self._dump_callback is not None:
             if not os.path.exists(self._path):
diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp
index 97785fec36070873e65e6d1530bc515f1c0c5e4a..36c2e9eb06dcaf8c1719e8eb75cc7771a3edefdf 100644
--- a/imperative/python/src/tensor.cpp
+++ b/imperative/python/src/tensor.cpp
@@ -966,6 +966,10 @@ void init_tensor(py::module m) {
             results = nullptr;
         };
     });
+    m.def("stop_step", [channel]() {
+        imperative::Profiler::stop_step();
+        channel->stop_step();
+    });
     m.def("enable_cupti", &cupti::enable);
     m.def("disable_cupti", &cupti::disable);
     m.def("cupti_available", &cupti::available);
diff --git a/imperative/src/impl/interpreter/commands.h b/imperative/src/impl/interpreter/commands.h
index b3ce387ce5337b0b20c462528cc0d8ab5c92ddf6..f7729a42bb19c0b56b7d88f2d935a3ec74810759 100644
--- a/imperative/src/impl/interpreter/commands.h
+++ b/imperative/src/impl/interpreter/commands.h
@@ -116,6 +116,13 @@ struct StopProfile {
     const char* get_name() const { return "StopProfile"; }
 };
 
+struct StopStep {
+    template <typename TFunctor>
+    void get_props(TFunctor&& functor) const {}
+
+    const char* get_name() const { return "StopStep"; }
+};
+
 struct PushScope {
     std::string scope_name;
 
@@ -162,7 +169,7 @@ struct StopRegen {
 
 using CommandData = std::variant<
         Put, ApplyOp, Del, GetValue, Drop, SetOption, StartProfile, StopProfile,
-        PushScope, PopScope, StartRegen, StopRegen>;
+        StopStep, PushScope, PopScope, StartRegen, StopRegen>;
 
 struct Command {
     uint64_t id;
diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp
index 458ccfbfcae2b4728fbafadaca3af3d42696c686..f376cd98641e2031ef53c97346d56630fb68ca92 100644
--- a/imperative/src/impl/interpreter/interpreter_impl.cpp
+++ b/imperative/src/impl/interpreter/interpreter_impl.cpp
@@ -1339,6 +1339,8 @@ void ChannelImpl::process_one_task(Command& icmd) {
             CompNode::foreach (
                     [&](CompNode device) { sample_on_device(device, true); });
             MGB_RECORD_EVENT(StopProfileFinishEvent);
+        } else if constexpr (std::is_same_v<T, StopStep>) {
+            MGB_RECORD_EVENT(StopStepEvent);
         } else if constexpr (std::is_same_v<T, PushScope>) {
             MGB_RECORD_EVENT(ScopeEvent, cmd.scope_name);
         } else if constexpr (std::is_same_v<T, PopScope>) {
@@ -1436,6 +1438,15 @@ void ChannelImpl::stop_profile() {
     }
 }
 
+void ChannelImpl::stop_step() {
+    MGB_LOCK_GUARD(m_spin);
+    assert_available();
+    mgb_assert(Profiler::is_profiling() == true, "Profiler isn't profiling!");
+    m_worker.add_task(
+            {Profiler::next_id(), StopStep{},
+             get_channel_state().stack_manager.dump()});
+}
+
 void ChannelImpl::push_scope(std::string name) {
     MGB_LOCK_GUARD(m_spin);
     assert_available();
diff --git a/imperative/src/impl/interpreter/interpreter_impl.h b/imperative/src/impl/interpreter/interpreter_impl.h
index 1379fa65fa492e143f68450b84b42be749fb6cca..68f0088a09dcd913f6fd041eb44ee6237e9b4f0c 100644
--- a/imperative/src/impl/interpreter/interpreter_impl.h
+++ b/imperative/src/impl/interpreter/interpreter_impl.h
@@ -63,6 +63,7 @@ struct ChannelImpl : Interpreter::Channel, NonCopyableObj, NonMoveableObj {
 
     void start_profile() override;
     void stop_profile() override;
+    void stop_step() override;
 
     void push_scope(std::string) override;
     void pop_scope(std::string) override;
diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp
index a1850e62e33fa7e98a7153e1fc1a0c55710b0b5f..e531c5832b6c8241b96604b21da3dbe8e6debc83 100644
--- a/imperative/src/impl/profiler.cpp
+++ b/imperative/src/impl/profiler.cpp
@@ -14,6 +14,7 @@
 #include "./op_trait.h"
 
 #include "./profiler/formats.h"
+#include "./profiler/states.h"
 
 namespace mgb {
 namespace imperative {
@@ -54,6 +55,11 @@ void Profiler::stop_profile() {
     sm_profiling = false;
 }
 
+void Profiler::stop_step() {
+    mgb_assert(sm_profiling);
+    MGB_RECORD_EVENT(profiler::StopStepEvent);
+}
+
 auto Profiler::get_thread_dict() -> thread_dict_t {
     thread_dict_t thread_dict;
     for (auto&& [tid, profiler] : sm_profilers) {
diff --git a/imperative/src/impl/profiler/chrome_timeline.cpp b/imperative/src/impl/profiler/chrome_timeline.cpp
index c646facb922695183d072f96902e6073ab56a1cc..de0cb25ffc9cac57ab9ecd48b3ac2a266a6ce3bd 100644
--- a/imperative/src/impl/profiler/chrome_timeline.cpp
+++ b/imperative/src/impl/profiler/chrome_timeline.cpp
@@ -151,6 +151,12 @@ public:
         return m_content.back();
     }
 
+    ChromeTraceEvent& new_event_front() {
+        ChromeTraceEvent new_event;
+        m_content.insert(m_content.begin(), new_event);
+        return m_content.front();
+    }
+
     std::string& metadata(std::string key) { return m_metadata[key]; }
 
     nlohmann::json to_json() const {
@@ -196,6 +202,12 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
                 since_start(time));
     }
 
+    ChromeTraceEvent& new_event_front(
+            std::string name, char ph, size_t tid, profiler::HostTime time) {
+        return trace_events.new_event_front().name(name).ph(ph).pid(pid).tid(tid).ts(
+                since_start(time));
+    }
+
     ChromeTraceEvent& new_host_event(std::string name, char ph) {
         return trace_events.new_event()
                 .name(name)
@@ -205,6 +217,17 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
                 .ts(since_start(current->time));
     }
 
+    ChromeTraceEvent& new_host_event(
+            std::string name, char ph, int tid, profiler::Duration time) {
+        return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(time);
+    }
+
+    ChromeTraceEvent& new_host_event_front(
+            std::string name, char ph, int tid, profiler::Duration time) {
+        return trace_events.new_event_front().name(name).ph(ph).pid(pid).tid(tid).ts(
+                time);
+    }
+
     ChromeTraceEvent& new_cupti_event(
             std::string name, char ph, cupti::stream_t stream,
             cupti::time_point timestamp) {
@@ -363,6 +386,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
             new_host_event("StopProfile", 'B');
         } else if constexpr (std::is_same_v<TEvent, StopProfileFinishEvent>) {
             new_host_event("StopProfile", 'E');
+        } else if constexpr (std::is_same_v<TEvent, StopStepEvent>) {
+            new_host_event("StopStep", 'i');
         } else if constexpr (std::is_same_v<TEvent, CustomEvent>) {
             new_host_event(event.title, 'B');
             if (event.device.valid()) {
@@ -498,6 +523,23 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
 void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) {
     ChromeTimelineEventVisitor visitor{};
     visitor.process_events(result);
+
+    auto gpu_usage_ratio = (double)(std::chrono::duration_cast<std::chrono::microseconds>(visitor.gpu_usage_time()).count() * 100) /
+                           (double)(visitor.get_total_train_time().count());
+    visitor.new_host_event("gpu_usgae_ratio", 'C', 0, visitor.last_kernel_finish())
+            .arg("value", gpu_usage_ratio);
+    visitor.new_host_event_front(
+                   "gpu_usgae_ratio", 'C', 0, std::chrono::microseconds(0))
+            .arg("value", gpu_usage_ratio);
+
+    auto train_time_ratio = (double)(std::chrono::duration_cast<std::chrono::microseconds>(visitor.gpu_usage_time_with_gap()).count() * 100) /
+                            (double)(visitor.get_total_train_time().count());
+    visitor.new_host_event("train_time_ratio", 'C', 0, visitor.last_kernel_finish())
+            .arg("value", train_time_ratio);
+    visitor.new_host_event_front(
+                   "train_time_ratio", 'C', 0, std::chrono::microseconds(0))
+            .arg("value", train_time_ratio);
+
     visitor.name_threads(result.thread_dict);
     auto trace_events = std::move(visitor.trace_events);
     trace_events.metadata("localTime") =
diff --git a/imperative/src/impl/profiler/events.h b/imperative/src/impl/profiler/events.h
index e265eef997f8a80c523a8ae7fcd2dab131918c8b..a8e4d511ecd355dfc2f386352ebf7da026eb6b1f 100644
--- a/imperative/src/impl/profiler/events.h
+++ b/imperative/src/impl/profiler/events.h
@@ -172,6 +172,8 @@ DEF_DUR_EVENT(HostToDevice, {
     void* device_ptr;
 });
 
+DEF_EVENT(StopStep, { CompNode device; });
+
 // cupti events
 DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; });
 
diff --git a/imperative/src/impl/profiler/states.h b/imperative/src/impl/profiler/states.h
index 33cebd070835635c5f98eac3bb5539456c7f0f33..1f5e5bfb26c7f87c55270a0fecda4276afc819b1 100644
--- a/imperative/src/impl/profiler/states.h
+++ b/imperative/src/impl/profiler/states.h
@@ -1,5 +1,9 @@
 #pragma once
 
+#include <algorithm>
+#include <chrono>
+#include <ctime>
+
 #include <any>
 #include <set>
 #include <sstream>
@@ -187,6 +191,18 @@ private:
     std::unordered_map<std::string, int64_t> m_counter_table;
     std::optional<std::pair<profiler::HostTime, cupti::time_point>> m_cupti_timestamp =
             {};
+    // Record the start and end time of all kernels
+    std::vector<std::vector<profiler::HostTime>> m_kernel_start_finish_time;
+    // The sum of all kernel execution times
+    profiler::Duration m_gpu_usage_time = std::chrono::microseconds(0);
+    // The sum of all kernel execution times, including the gap time between kernels
+    // within a step
+    profiler::Duration m_gpu_usage_time_with_gap = std::chrono::microseconds(0);
+    // Record the end time of each step
+    std::vector<profiler::HostTime> m_step_finish_time;
+    // Record the start time of the first kernel and the end time of the last kernel in
+    // each step
+    std::vector<std::vector<profiler::HostTime>> m_step_first_last_kernel;
 
 protected:
     Profiler::Record* current;
@@ -288,9 +304,9 @@ public:
                 SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent,
                 ShapeInferEvent, SyncEvent, SyncFinishEvent, StartProfileEvent,
                 StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent,
-                TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent,
-                AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent,
-                ScopeEvent, ScopeFinishEvent, HostToDeviceEvent,
+                StopStepEvent, TensorCommandEvent, TensorCommandFinishEvent,
+                AutoEvictEvent, AutoEvictFinishEvent, CustomEvent, CustomFinishEvent,
+                RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, HostToDeviceEvent,
                 HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent,
                 CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent,
                 CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent,
@@ -335,6 +351,15 @@ public:
             }
         });
 
+        // record step end time
+        for_each_entry([&](auto&& event) {
+            using T = std::decay_t<decltype(event)>;
+            if constexpr (std::is_same_v<T, StopStepEvent>) {
+                auto step_time = current->time;
+                m_step_finish_time.push_back(to_device_time(step_time, event.device));
+            }
+        });
+
         // register host threads
         for_each_entry([&](auto&& event) {
             if (!m_host_tid_table.count(current->tid)) {
@@ -443,9 +468,17 @@ public:
                 inc_counter("nr_exception", 1);
             } else if constexpr (std::is_same_v<T, KernelLaunchFinishEvent>) {
                 auto& execution = current_op->executions.back();
+                auto overhead = to_device_time(current->time, event.device) -
+                                to_device_time(execution.begin, event.device);
+
+                std::vector<profiler::HostTime> current_kernel_start_finish;
+                current_kernel_start_finish.emplace_back(
+                        to_device_time(execution.begin, event.device));
+                current_kernel_start_finish.emplace_back(
+                        to_device_time(current->time, event.device));
+                m_kernel_start_finish_time.emplace_back(current_kernel_start_finish);
+
                 if (execution.reason == "dtr") {
-                    auto overhead = to_device_time(current->time, event.device) -
-                                    to_device_time(execution.begin, event.device);
                     inc_counter(
                             "dtr_overhead_us",
                             std::chrono::duration_cast<std::chrono::microseconds>(
@@ -463,6 +496,109 @@ public:
             }
         });
     }
+
+    profiler::Duration last_kernel_finish() {
+        if (m_kernel_start_finish_time.size() == 0) {
+            return profiler::Duration::zero();
+        }
+        return m_kernel_start_finish_time.back()[1] - m_start_time;
+    }
+
+    // get GPU busy time (union of calculation time and communication time)
+    profiler::Duration gpu_usage_time() {
+        if (m_kernel_start_finish_time.size() == 0) {
+            return profiler::Duration::zero();
+        }
+
+        std::sort(
+                m_kernel_start_finish_time.begin(), m_kernel_start_finish_time.end(),
+                [&](std::vector<profiler::HostTime> kernel1,
+                    std::vector<profiler::HostTime> kernel2) {
+                    if (kernel1[0] != kernel2[0]) {
+                        return kernel1[0] < kernel2[0];
+                    }
+                    return kernel1[1] < kernel2[1];
+                });
+
+        HostTime current_start = profiler::HostTime::min();
+        HostTime current_end = profiler::HostTime::min();
+        for (size_t i = 0; i < m_kernel_start_finish_time.size(); ++i) {
+            if (current_start == profiler::HostTime::min()) {
+                current_start = m_kernel_start_finish_time[i][0];
+                current_end = m_kernel_start_finish_time[i][1];
+            } else if (current_end < m_kernel_start_finish_time[i][0]) {
+                m_gpu_usage_time += profiler::Duration(current_end - current_start);
+                current_start = m_kernel_start_finish_time[i][0];
+                current_end = m_kernel_start_finish_time[i][1];
+            } else if (current_end > m_kernel_start_finish_time[i][0]) {
+                current_end = max(current_end, m_kernel_start_finish_time[i][1]);
+            }
+        }
+        m_gpu_usage_time += profiler::Duration(current_end - current_start);
+
+        return m_gpu_usage_time;
+    }
+
+    // compared to gpu_usage_time, this method adds gap time between kernels in the same
+    // step
+    profiler::Duration gpu_usage_time_with_gap() {
+        if (m_step_finish_time.empty()) {
+            return profiler::Duration::zero();
+        }
+
+        std::sort(
+                m_step_finish_time.begin(), m_step_finish_time.end(),
+                [&](HostTime time1, HostTime time2) { return time1 < time2; });
+
+        std::sort(
+                m_kernel_start_finish_time.begin(), m_kernel_start_finish_time.end(),
+                [&](std::vector<profiler::HostTime> kernel1,
+                    std::vector<profiler::HostTime> kernel2) {
+                    if (kernel1[0] != kernel2[0]) {
+                        return kernel1[0] < kernel2[0];
+                    }
+                    return kernel1[1] < kernel2[1];
+                });
+
+        int cur_step = 0;
+        auto kernel_num = m_kernel_start_finish_time.size();
+        for (size_t i = 0; i < kernel_num; ++i) {
+            // Record the start time of the first kernel and the end time of the last
+            // kernel of the current step
+            std::vector<profiler::HostTime> step_begin_end_time;
+            step_begin_end_time.emplace_back(m_kernel_start_finish_time[i][0]);
+            size_t j = i;
+            while (j < kernel_num && m_kernel_start_finish_time[j][0] <
+                                             m_step_finish_time[cur_step + 1]) {
+                ++j;
+            }
+            step_begin_end_time.emplace_back(m_kernel_start_finish_time[j - 1][1]);
+            mgb_assert(
+                    step_begin_end_time.size() == 2,
+                    "step_begin_end_time.size() should be exactly 2!");
+            m_step_first_last_kernel.emplace_back(step_begin_end_time);
+            i = j - 1;
+            ++cur_step;
+        }
+
+        for (size_t i = 0; i < m_step_first_last_kernel.size(); ++i) {
+            m_gpu_usage_time_with_gap += profiler::Duration(
+                    m_step_first_last_kernel[i][1] - m_step_first_last_kernel[i][0]);
+        }
+
+        return m_gpu_usage_time_with_gap;
+    }
+
+    // from the start time of the first kernel of the first step to the end time of the
+    // last kernel of the last step
+    std::chrono::microseconds get_total_train_time() {
+        if (m_kernel_start_finish_time.size() == 0) {
+            return std::chrono::microseconds(1);
+        }
+        return std::chrono::duration_cast<std::chrono::microseconds>(
+                m_kernel_start_finish_time.back()[1] -
+                m_kernel_start_finish_time.front()[0]);
+    }
 };
 
 }  // namespace mgb::imperative::profiler
diff --git a/imperative/src/include/megbrain/imperative/interpreter.h b/imperative/src/include/megbrain/imperative/interpreter.h
index 28f410ccd88982b5179f2116312dcaccd30e943a..d2a0a503a5e6dce555d665ba68123896987b1370 100644
--- a/imperative/src/include/megbrain/imperative/interpreter.h
+++ b/imperative/src/include/megbrain/imperative/interpreter.h
@@ -61,6 +61,7 @@ struct Interpreter {
 
         virtual void start_profile() = 0;
         virtual void stop_profile() = 0;
+        virtual void stop_step() = 0;
 
         virtual void push_scope(std::string name) = 0;
         virtual void pop_scope(std::string name) = 0;
diff --git a/imperative/src/include/megbrain/imperative/profiler.h b/imperative/src/include/megbrain/imperative/profiler.h
index 04f104301696b43ae6dfb4653978fb507491d3bf..f51b7adc8e45fa58ea649e6f7ffc18bdefda2a87 100644
--- a/imperative/src/include/megbrain/imperative/profiler.h
+++ b/imperative/src/include/megbrain/imperative/profiler.h
@@ -37,6 +37,7 @@ class Timer {
 public:
     using Time = profiler::Time;
     static profiler::Time record_host();
+    static profiler::Time record_device_time(CompNode device);
     static std::shared_ptr<CompNode::Event> record_device(CompNode device);
 };
 
@@ -77,6 +78,7 @@ private:
     std::atomic<Status> m_status = Running;
 
     static std::vector<entry_t> sm_records;
+    static std::vector<profiler::HostTime> sm_step_time;
     static options_t sm_profile_options;
     static std::mutex sm_mutex;
     // assume std::thread::id is unique
@@ -187,6 +189,8 @@ public:
 
     static void stop_profile();
 
+    static void stop_step();
+
     static thread_dict_t get_thread_dict();
 
     static void dump_profile(std::string basename, std::string format, bundle_t result);