Commit a841d8e7 authored by Megvii Engine Team

refactor(profiler): refactor event processing

GitOrigin-RevId: 26f0c5a6e1e9af669d809776995a380e8dadf5f0
Parent dff7719e
......@@ -39,6 +39,9 @@ endif()
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3)
target_link_libraries(${MODULE_NAME} PRIVATE range-v3)
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json ${PROJECT_BINARY_DIR}/third_party/Json)
target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json)
target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR})
target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME})
target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter)
......
......@@ -1013,9 +1013,8 @@ void init_tensor(py::module m) {
interpreter_for_py->sync();
imperative::Profiler::stop_profile();
auto results = imperative::Profiler::collect();
auto options = imperative::Profiler::get_options();
return [results=std::move(results), options=std::move(options)](std::string basename, std::string format){
imperative::Profiler::dump_profile(basename, format, results, options);
return [results=std::move(results)](std::string basename, std::string format){
imperative::Profiler::dump_profile(basename, format, results);
};
}, py::call_guard<py::gil_scoped_release>());
m.def("sync",
......
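The lambda above moves the collected results into the returned callable, so the dump can run after `stop_profile` returns without keeping any profiler state alive. A minimal standalone sketch of the same capture-by-move idiom (the `Results` alias, `dump`, and `make_dumper` below are hypothetical stand-ins, not MegEngine APIs):

    #include <cstdio>
    #include <functional>
    #include <string>
    #include <utility>
    #include <vector>

    using Results = std::vector<int>;  // stand-in for Profiler::collect()'s bundle

    void dump(const std::string& basename, const std::string& format, const Results& r) {
        std::printf("dump %zu records to %s.%s\n", r.size(), basename.c_str(), format.c_str());
    }

    // The results are moved into the closure, so they outlive the profiler itself.
    std::function<void(std::string, std::string)> make_dumper(Results results) {
        return [results = std::move(results)](std::string basename, std::string format) {
            dump(basename, format, results);
        };
    }

    int main() {
        auto dumper = make_dumper({1, 2, 3});
        dumper("profile", "chrome_timeline.json");  // prints: dump 3 records to profile.chrome_timeline.json
    }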
......@@ -19,6 +19,7 @@
#include "megbrain/imperative/op_def.h"
#include "megbrain/imperative/utils/to_string.h"
#include "./stack_manager.h"
#include "./tensor_info.h"
namespace mgb::imperative {
......@@ -193,7 +194,7 @@ struct PopScope {
}
};
using Command = std::variant<Put,
using CommandData = std::variant<Put,
ApplyOp,
Del,
GetValue,
......@@ -206,14 +207,20 @@ using Command = std::variant<Put,
PushScope,
PopScope>;
using IdentifiedCommand = std::pair<uint64_t, Command>;
struct Command {
uint64_t id;
CommandData data;
StackManager::Trace trace;
};
// using IdentifiedCommand = std::pair<uint64_t, Command>;
}
template <>
struct ToStringTrait<interpreter::intl::Command>{
std::string operator()(const interpreter::intl::Command& cmd) const {
return std::visit([](const auto& cmd){
std::string content = std::visit([](const auto& cmd){
std::string result = cmd.get_name();
result += "{";
cmd.get_props([&](const char* key, auto&& value) {
......@@ -224,7 +231,8 @@ struct ToStringTrait<interpreter::intl::Command>{
});
result += "}";
return result;
}, cmd);
}, cmd.data);
return content;
}
};
......
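The refactor replaces `IdentifiedCommand` (a `std::pair<uint64_t, Command>`) with a `Command` struct that carries its id and captured stack trace alongside the payload variant. A compilable sketch of the new shape, with simplified stand-in payloads and a `std::string` in place of `StackManager::Trace`:

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <type_traits>
    #include <variant>

    struct Put { int dest; };  // simplified stand-ins for the real payloads
    struct Del { int dest; };
    using CommandData = std::variant<Put, Del>;

    struct Command {
        uint64_t id;        // assigned once, at enqueue time
        CommandData data;   // handlers dispatch on this via std::visit
        std::string trace;  // stand-in for StackManager::Trace
    };

    int main() {
        Command cmd{42, Put{7}, "Put"};
        std::visit([&](const auto& c) {  // mirrors process_one_task / ToStringTrait above
            using T = std::decay_t<decltype(c)>;
            std::cout << (std::is_same_v<T, Put> ? "Put" : "Del")
                      << " #" << cmd.id << " dest=" << c.dest << "\n";
        }, cmd.data);
    }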
......@@ -127,9 +127,8 @@ Handle ChannelImpl::put(const HostTensorND& value, bool no_cache) {
MGB_LOCK_GUARD(m_spin);
mgb_assert(check_available(), "Channel already closed");
auto& state = get_channel_state();
state.scopes.push("Put");
auto _ = StackManager::Guard{"Put", &state.stack_manager};
auto info = put_impl(value, no_cache);
state.scopes.pop("Put");
return info;
}
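`StackManager::Guard` replaces the manual `push`/`pop` pairs, so the scope is popped on every exit path, including early returns and exceptions. A minimal sketch of the idiom, with a hypothetical `Manager` in place of `StackManager`:

    #include <cassert>
    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    struct Manager {  // hypothetical; mirrors StackManager's enter/exit contract
        std::vector<std::string> stack;
        void enter(std::string name) { stack.push_back(std::move(name)); }
        void exit(const std::string& name) {
            assert(!stack.empty() && stack.back() == name);
            stack.pop_back();
        }
    };

    struct Guard {
        std::string name;
        Manager* mgr;
        Guard(std::string n, Manager* m) : name(std::move(n)), mgr(m) { mgr->enter(name); }
        ~Guard() { mgr->exit(name); }  // runs even if the guarded code throws
    };

    int put(Manager& mgr, bool fail) {
        auto _ = Guard{"Put", &mgr};
        if (fail) throw std::runtime_error("put failed");  // "Put" is still popped
        return 1;  // ... and popped here too
    }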
......@@ -158,16 +157,15 @@ Handle ChannelImpl::put(const DeviceTensorND& data, const HostTensorND& hvalue)
}
TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND& hvalue) {
auto& state = get_channel_state();
state.scopes.push("Put");
auto _ = StackManager::Guard{"Put", &state.stack_manager};
auto info = alloc();
RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandEvent::Put);
RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandKind::Put);
init(info, {data.layout(), data.comp_node()});
info->mem_desc.id = StorageIdentifier::make(++m_storage_id);
info->ptr = Tensor::make(data, hvalue);
RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr());
info->status = TensorInfo::Produced;
RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandFinishEvent::Put);
state.scopes.pop("Put");
RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandKind::Put);
return info;
}
......@@ -230,7 +228,7 @@ void ChannelImpl::dispatch_default_cpu(
auto& state = get_channel_state();
auto name = op->trait()->make_name(*op);
state.scopes.push(name);
auto _ = StackManager::Guard(name, &state.stack_manager);
auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs);
RECORD_EVENT(ShapeInferEvent, validated);
......@@ -291,9 +289,9 @@ void ChannelImpl::dispatch_default_cpu(
}
return op_info;
};
RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, tinfo_to_tid(input_infos), tinfo_to_tid(output_infos));
state.scopes.pop(name);
RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter,
tinfo_to_tid(input_infos), tinfo_to_tid(output_infos),
state.stack_manager.dump());
}
void ChannelImpl::dispatch_kernel(
......@@ -305,7 +303,7 @@ void ChannelImpl::dispatch_kernel(
auto& options = state.options;
auto name = op->trait()->make_name(*op);
state.scopes.push(name);
auto _ = StackManager::Guard{name, &state.stack_manager};
auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs);
RECORD_EVENT(ShapeInferEvent, validated);
......@@ -334,7 +332,9 @@ void ChannelImpl::dispatch_kernel(
}
return op_info;
};
RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs));
RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter,
tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs),
state.stack_manager.dump());
m_buffer.enqueue(std::move(cmd));
if (!validated && options.async_level == 1) {
sync_impl();
......@@ -346,7 +346,6 @@ void ChannelImpl::dispatch_kernel(
info->ptr->comp_node().sync();
}
}
state.scopes.pop(name);
}
SmallVector<Handle> ChannelImpl::apply_op(
......@@ -505,7 +504,8 @@ TensorInfo* ChannelImpl::alloc() {
}();
info->id = Profiler::next_id();
if (Profiler::is_profiling()) {
info->name = state.scopes.next_tensor_name();
size_t tensor_id = state.stack_manager.current()->next_id("tensor");
info->name = state.stack_manager.dump().to_string() + ssprintf(":%zu", tensor_id);
}
return info;
}
......@@ -554,7 +554,7 @@ void ChannelImpl::free(TensorInfo* ptr) {
}
void ChannelImpl::recursive_free(TensorInfo* ptr) {
RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandEvent::RecFree);
RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandKind::RecFree);
SmallVector<TensorInfo*> inps;
if (ptr->producer) {
for (auto i : ptr->producer->inputs) {
......@@ -569,7 +569,7 @@ void ChannelImpl::recursive_free(TensorInfo* ptr) {
recursive_free(i);
}
}
RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandFinishEvent::RecFree);
RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandKind::RecFree);
}
void ChannelImpl::real_free(TensorInfo* ptr) {
......@@ -625,9 +625,9 @@ void ChannelImpl::regenerate(TensorInfo* dest) {
m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest});
if (!m_applying) flush_apply_stack();
} else if (dest->evict_type == EvictType::SWAP) {
RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandEvent::ReGen);
RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen);
produce_tensor(dest, Tensor::make(dest->h_value));
RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandFinishEvent::ReGen);
RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandKind::ReGen);
}
}
......@@ -721,22 +721,24 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
// refcnt --, owners: [tensor_inputs]
// if it's decreased to 1, would be detected at @see: proxy_graph_detail::apply_on_physical_tensor
uint64_t del_id = del->id;
RECORD_EVENT(OpDelEvent, del_id);
RECORD_EVENT(TensorCommandEvent, del_id, TensorCommandKind::Del);
free(del);
RECORD_EVENT(OpDelFinishEvent, del_id);
RECORD_EVENT(TensorCommandFinishEvent, del_id, TensorCommandKind::Del);
}
// Before wait
// TODO: split operator wait and execute so that OpWait can be correctly recorded.
// Before execute
for (auto&& [device, kernel_id]: kernels) {
RECORD_EVENT(KernelExecuteEvent, apply_id, kernel_id, Timer::record_event(device));
RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device);
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device));
}
// Apply op
// Here std::move is REQUIRED for removing duplicated references.
auto outputs = apply_on_physical_tensor(*cmd.op, std::move(inputs));
// After execute
for (auto&& [device, kernel_id]: kernels) {
RECORD_EVENT(KernelExecuteFinishEvent, apply_id, kernel_id, Timer::record_event(device));
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device));
RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device);
}
// End profiling operator
mgb_assert(outputs.size() == cmd.outputs.size());
......@@ -787,7 +789,7 @@ void ChannelImpl::flush_apply_stack() {
m_dtr.pin(cmd.inputs);
}
if (recomp) {
RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandEvent::ReGen);
RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandKind::ReGen);
}
}
bool regen = false;
......@@ -810,7 +812,7 @@ void ChannelImpl::flush_apply_stack() {
m_apply_stack.pop();
do_apply_op(cmd_backup);
if (recomp_backup) {
RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandFinishEvent::ReGen);
RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen);
for (auto o : cmd_backup.outputs) {
if (o) {
m_dtr.update_dsu_after_recompute(o);
......@@ -902,7 +904,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
check_worker_exc_unsafe();
return require_host ? host_available() : static_cast<bool>(info->ptr);
});
RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop, m_waitee == nullptr);
RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop);
m_waitee = nullptr;
return info->ptr;
}
......@@ -1003,7 +1005,7 @@ std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPt
return {outputs_desc, alloc_storage(outputs_desc), alloc_storage(workspaces_desc)};
}
void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
void ChannelImpl::process_one_task(Command& icmd) {
using namespace ranges;
using namespace ranges::views;
auto& state = get_worker_state();
......@@ -1012,10 +1014,12 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
auto cmd_visitor = [&](const auto& cmd) {
using T = std::decay_t<decltype(cmd)>;
if constexpr (std::is_same_v<T, Put>) {
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Put);
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Put);
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node()));
auto value = cmd.no_cache ? std::make_shared<Tensor>(cmd.value) : Tensor::make(cmd.value);
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node()));
produce_tensor(cmd.dest, std::move(value));
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Put);
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Put);
sample_on_device(cmd.dest->desc.comp_node, false);
} else if constexpr (std::is_same_v<T, ApplyOp>) {
for (auto& i : cmd.inputs) {
......@@ -1084,11 +1088,11 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
}
}
} else if constexpr (std::is_same_v<T, Del>) {
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Del);
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Del);
CompNode device = cmd.dest->desc.comp_node;
uint64_t tensor_id = cmd.dest->id;
free(cmd.dest);
RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandFinishEvent::Del);
RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandKind::Del);
sample_on_device(device, false);
} else if constexpr (std::is_same_v<T, GetValue>) {
if (cmd.dest->invalid) return;
......@@ -1102,26 +1106,26 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
imperative_log_profile_end("GetValue");
} else if constexpr (std::is_same_v<T, SwapIn>) {
if (cmd.dest->invalid) return;
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapIn);
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapIn);
produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value));
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapIn);
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapIn);
sample_on_device(cmd.dest->desc.comp_node, false);
} else if constexpr (std::is_same_v<T, SwapOut>) {
if (cmd.dest->invalid) return;
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapOut);
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapOut);
cmd.dest->h_value = cmd.dest->ptr->get_value();
if (cmd.dest->evict_type == EvictType::NONE) {
cmd.dest->evict_type = EvictType::SWAP;
cmd.dest->status = TensorInfo::Swapped;
release_tensor(cmd.dest);
}
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapOut);
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapOut);
sample_on_device(cmd.dest->desc.comp_node, false);
} else if constexpr (std::is_same_v<T, Drop>) {
if (cmd.dest->invalid) return;
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Drop);
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Drop);
do_drop(cmd.dest, true);
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Drop);
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Drop);
} else if constexpr (std::is_same_v<T, SetOption>) {
options.set_option(cmd.key, cmd.value);
} else if constexpr (std::is_same_v<T, StartProfile>) {
......@@ -1138,6 +1142,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
if (Profiler::get_option("sample_rate", 0)) {
sample_on_device(device, true);
}
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device));
});
RECORD_EVENT(StartProfileFinishEvent);
} else if constexpr (std::is_same_v<T, StopProfile>) {
......@@ -1186,7 +1191,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
notify_tensor_unsafe(m_waitee);
}
}
}, icmd.second);
}, icmd.data);
}
void ChannelImpl::check_worker_exc_unsafe() {
......@@ -1203,12 +1208,13 @@ void ChannelImpl::check_worker_exc_unsafe() {
}
}
void ChannelImpl::CommandBuffer::enqueue(Command cmd) {
void ChannelImpl::CommandBuffer::enqueue(CommandData cmd) {
auto& state = m_owner->get_channel_state();
if (std::get_if<Del>(&cmd) && fuse_del(std::get<Del>(cmd))) {
return;
}
// mgb_log_debug("%s Enqueued", to_string(cmd).c_str());
m_commands.push_back(std::move(cmd));
m_commands.push_back({Profiler::next_id(), std::move(cmd), state.stack_manager.dump()});
auto flush_pos = flush_pos_for(m_commands.back());
flush(flush_pos);
}
......@@ -1222,7 +1228,7 @@ void ChannelImpl::CommandBuffer::flush(Handle pos) {
if (Profiler::is_profiling()) {
mgb_log_debug("%s Flushed", to_string(*iter).c_str());
}
m_owner->m_worker.add_task(IdentifiedCommand{Profiler::next_id(), std::move(*iter)});
m_owner->m_worker.add_task(std::move(*iter));
}
m_commands.erase(m_commands.begin(), pos);
}
......@@ -1248,7 +1254,7 @@ auto ChannelImpl::CommandBuffer::flush_pos_for(const Command& cmd) -> Handle {
return m_commands.begin() + (m_commands.size() - buffer_length);
}
return m_commands.begin();
}, cmd);
}, cmd.data);
}
/**
......@@ -1261,7 +1267,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) {
// TODO: eliminate Puts
auto begin = m_commands.begin(), end = m_commands.end();
auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd){
if (auto* apply = std::get_if<ApplyOp>(&cmd)) {
if (auto* apply = std::get_if<ApplyOp>(&cmd.data)) {
return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0;
}
return false;
......@@ -1270,7 +1276,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) {
return false;
}
// mgb_log_debug("%s Fused", to_string(Command{cmd}).c_str());
std::get<ApplyOp>(*apply_iter).dels.push_back(dest);
std::get<ApplyOp>(apply_iter->data).dels.push_back(dest);
return true;
}
......@@ -1297,7 +1303,7 @@ auto ChannelImpl::CommandBuffer::find_last_usage(TensorInfo* dest, Range range)
found = iter;
}
}
}, *iter);
}, iter->data);
};
return found;
}
......@@ -1313,7 +1319,7 @@ auto ChannelImpl::CommandBuffer::find_produce(TensorInfo* dest, Range range)
return cmd.dest == dest;
}
return false;
}, cmd);
}, cmd.data);
});
}
......@@ -1340,7 +1346,7 @@ void ChannelImpl::push_scope(std::string name) {
MGB_LOCK_GUARD(m_spin);
mgb_assert(check_available(), "Channel already closed");
auto& state = get_channel_state();
state.scopes.push(name);
state.stack_manager.enter(name);
RECORD_EVENT(ScopeEvent, name);
m_buffer.enqueue(PushScope{name});
}
......@@ -1349,7 +1355,7 @@ void ChannelImpl::pop_scope(std::string name) {
MGB_LOCK_GUARD(m_spin);
mgb_assert(check_available(), "Channel already closed");
auto& state = get_channel_state();
state.scopes.pop(name);
state.stack_manager.exit(name);
RECORD_EVENT(ScopeFinishEvent, name);
m_buffer.enqueue(PopScope{name});
}
......
......@@ -26,6 +26,7 @@
#include "./commands.h"
#include "./tensor_info.h"
#include "./option_manager.h"
#include "./stack_manager.h"
#include "../profiler/events.h"
......@@ -94,7 +95,7 @@ private:
TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop);
void notify_tensor_unsafe(TensorInfo* info);
void process_one_task(IdentifiedCommand&);
void process_one_task(Command&);
void check_worker_exc_unsafe();
......@@ -129,10 +130,10 @@ private:
void assert_in_worker();
std::thread::id get_worker_tid();
template <typename TCommand>
void enqueue_command(TCommand&& cmd) {
m_buffer.enqueue(Command{std::forward<TCommand>(cmd)});
}
// template <typename TCommand>
// void enqueue_command(TCommand&& cmd) {
// m_buffer.enqueue(Command{std::forward<TCommand>(cmd)});
// }
void sample_on_device(CompNode device, bool force);
......@@ -153,13 +154,13 @@ private:
bool m_applying = false;
bool m_closed = false;
struct WorkQueue : AsyncQueueSC<IdentifiedCommand, WorkQueue> {
struct WorkQueue : AsyncQueueSC<Command, WorkQueue> {
// set max_spin=0 to prevent the queue from fetching tasks in a busy-wait manner.
// this won't affect throughput when the python interpreter is sending enough tasks,
// but will significantly save CPU time when waiting for tasks, e.g. waiting for data input
// limit pending tasks to 10000
WorkQueue(ChannelImpl* owner)
: AsyncQueueSC<IdentifiedCommand, WorkQueue>(0, 10000), m_owner(owner) {
: AsyncQueueSC<Command, WorkQueue>(0, 10000), m_owner(owner) {
sys::set_thread_name("interpreter");
if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) {
int len = strlen(env_val);
......@@ -171,7 +172,7 @@ private:
update_max_items(val);
}
}
void process_one_task(IdentifiedCommand& icmd) {
void process_one_task(Command& icmd) {
m_owner->process_one_task(icmd);
}
void on_async_queue_worker_thread_start() override;
......@@ -193,7 +194,7 @@ private:
*/
struct CommandBuffer {
CommandBuffer(ChannelImpl* owner) : m_owner(owner) {}
void enqueue(Command cmd);
void enqueue(CommandData cmd);
bool empty() const {
return m_commands.empty();
}
......@@ -224,91 +225,13 @@ private:
//! level 0: both sync.
int m_async_level = 2;
struct Scope {
std::string name;
std::unordered_map<std::string, std::unique_ptr<Scope>> children;
size_t version = 0;
size_t parent_version = 0;
size_t tensor_count = 0;
Scope* active_child = nullptr;
Scope* parent = nullptr;
Scope* enter(std::string name) {
auto& child = children[name];
if (!child) {
child = std::make_unique<Scope>();
child->name = name;
child->parent = this;
}
if (version != child->parent_version) {
child->version = 0;
child->parent_version = version;
} else {
child->version++;
}
child->tensor_count = 0;
return active_child = child.get();
}
Scope* exit(std::string name) {
mgb_assert(this->name == name, "scope name mismatch");
parent->active_child = nullptr;
return parent;
}
};
class ScopeManager {
private:
Scope m_root;
Scope* m_current_scope = &m_root;
public:
class ScopeGuard{
private:
ScopeManager* m_manager;
std::string m_name;
public:
ScopeGuard(ScopeManager* manager, std::string name): m_manager{manager}, m_name{name} {
m_manager->push(m_name);
}
~ScopeGuard() {
m_manager->pop(m_name);
}
};
void push(std::string name) {
m_current_scope = m_current_scope->enter(name);
}
void pop(std::string name) {
m_current_scope = m_current_scope->exit(name);
}
std::string next_tensor_name() {
std::string builder;
Scope* scope = &m_root;
while (true) {
builder.append(scope->name);
if (scope->version != 0) {
builder.append(ssprintf("(%ld)", scope->version));
}
if (scope != &m_root) {
builder.append(".");
}
if (scope->active_child == nullptr) {
builder.append(ssprintf(":%%%ld", scope->tensor_count++));
break;
} else {
scope = scope->active_child;
}
}
return builder;
}
};
struct State {
std::thread::id tid;
OptionManager options;
};
struct ChannelState: State {
ScopeManager scopes;
StackManager stack_manager;
};
struct WorkerState: State {};
......
/**
* \file imperative/src/impl/interpreter/stack_manager.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include <string>
#include <memory>
#include <unordered_map>
#include "megbrain/utils/metahelper.h"
#include "megbrain/utils/small_vector.h"
namespace mgb::imperative::interpreter::intl {
class StackSnapshot;
class StackManager: public NonCopyableObj {
public:
class Node;
class Guard;
struct Frame;
class Trace;
private:
std::unique_ptr<Node> m_root = nullptr;
Node* m_current = nullptr;
SmallVector<uint64_t> m_trace_id_stack;
uint64_t m_last_trace_id = 0;
public:
StackManager();
std::pair<Node*, uint64_t> enter(std::string name);
void exit(std::string name);
Trace dump();
Node* current();
};
class StackManager::Node: public NonCopyableObj {
private:
std::string m_name;
std::unordered_map<std::string, std::unique_ptr<Node>> m_children;
std::unordered_map<std::string, size_t> m_id_table;
Node* m_parent = nullptr;
int64_t m_depth = -1;
uint64_t m_version = 0;
explicit Node(std::string name, Node* parent): m_name{name}, m_parent{parent} {
if (parent) {
m_depth = parent->m_depth + 1;
}
}
public:
const std::string& name() const {
return m_name;
}
Node* operator[](const std::string& name) {
auto& child = m_children[name];
if (child == nullptr) {
child.reset(new Node(name, this));
}
return child.get();
}
Node* parent() {
return m_parent;
}
bool is_root() {
return m_parent == nullptr;
}
uint64_t version() const {
return m_version;
}
void update_version() {
++m_version;
for (auto&& [key, child]: m_children) {
child->reset_version();
}
m_id_table.clear();
}
void reset_version() {
m_version = 0;
m_id_table.clear();
}
int64_t depth() const {
return m_depth;
}
uint64_t next_id(std::string key) {
return m_id_table[key]++;
}
static std::unique_ptr<Node> make() {
return std::unique_ptr<Node>(new Node("", nullptr));
}
};
class StackManager::Guard {
private:
std::string m_name;
StackManager* m_manager;
public:
Guard(std::string name, StackManager* manager): m_name{name}, m_manager{manager}{
if (m_manager) {
m_manager->enter(name);
}
}
~Guard() {
release();
}
void release() {
if (m_manager) {
m_manager->exit(m_name);
m_manager = nullptr;
}
}
};
struct StackManager::Frame {
StackManager::Node* node;
uint64_t version;
};
class StackManager::Trace {
private:
SmallVector<StackManager::Frame> m_frames;
uint64_t m_id = 0;
public:
explicit Trace(StackManager::Node* top, uint64_t id): m_id{id} {
int64_t nr_frames = top->depth() + 1;
m_frames = SmallVector<StackManager::Frame>(nr_frames);
StackManager::Node* node = top;
for (int64_t i = 0; i < nr_frames; ++i) {
m_frames[m_frames.size()-1-i] = {node, node->version()};
node = node->parent();
}
mgb_assert(node->is_root() , "");
}
Trace() = default;
std::string to_string() const {
std::string buffer;
for (auto&& [node, version]: m_frames) {
if (!buffer.empty()) {
buffer.append(".");
}
buffer.append(node->name());
if (version != 0) {
buffer.append(ssprintf("[%zu]", version));
}
}
return buffer;
}
const SmallVector<StackManager::Frame>& frames() const {
return m_frames;
}
uint64_t id() const {
return m_id;
}
};
inline StackManager::StackManager() {
m_root = Node::make();
m_current = m_root.get();
}
inline std::pair<StackManager::Node*, uint64_t> StackManager::enter(std::string name) {
m_current = (*m_current)[name];
m_trace_id_stack.push_back(++m_last_trace_id);
return {m_current, m_current->version()};
}
inline void StackManager::exit(std::string name) {
mgb_assert(m_current->name() == name, "scope name mismatch");
m_current = m_current->parent();
m_trace_id_stack.pop_back();
m_current->update_version();
}
inline StackManager::Trace StackManager::dump() {
return Trace(m_current, m_trace_id_stack.empty() ? 0 : m_trace_id_stack.back());
}
inline StackManager::Node* StackManager::current() {
return m_current;
}
}
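Given the header above (and the MegEngine utility headers it pulls in), a short usage sketch; the expected strings follow from `Trace::to_string`, which appends `[version]` only for non-zero versions, and from `exit`, which bumps the parent's version after each completed child scope:

    #include "stack_manager.h"
    #include <iostream>

    using mgb::imperative::interpreter::intl::StackManager;

    int main() {
        StackManager sm;
        sm.enter("train");
        sm.enter("step");
        std::cout << sm.dump().to_string() << "\n";  // train.step
        sm.exit("step");  // exiting bumps "train"'s version to 1
        sm.enter("step");
        std::cout << sm.dump().to_string() << "\n";  // train[1].step
        // Per-node counters drive tensor naming in ChannelImpl::alloc():
        std::cout << sm.current()->next_id("tensor") << "\n";  // 0
        std::cout << sm.current()->next_id("tensor") << "\n";  // 1
        sm.exit("step");
        sm.exit("train");
    }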
......@@ -102,7 +102,8 @@ std::vector<std::pair<const char*, std::string>> props(const OpDef& def) {
}
std::string make_name(const OpDef& def) {
return "OprAttr";
auto&& attr = def.cast_final_safe<OprAttr>();
return attr.type;
}
OP_TRAIT_REG(OprAttr, OprAttr)
......
......@@ -27,25 +27,11 @@
namespace mgb {
namespace imperative {
uint64_t Timer::get_nsecs() {
using namespace std::chrono;
auto finish = steady_clock::now();
auto duration = duration_cast<nanoseconds>(finish - m_start);
return duration.count();
profiler::Time Timer::record_host() {
return std::chrono::high_resolution_clock::now();
}
uint64_t Timer::get_started_at() {
return m_started_at;
}
void Timer::reset() {
using namespace std::chrono;
m_start = steady_clock::now();
auto now_ns = duration_cast<nanoseconds>(std::chrono::system_clock::now().time_since_epoch());
m_started_at = now_ns.count();
}
std::shared_ptr<CompNode::Event> Timer::record_event(CompNode device) {
std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) {
auto event = EventPool::with_timer().alloc_shared(device);
event->record();
return event;
......@@ -55,13 +41,13 @@ Profiler::options_t Profiler::sm_profile_options;
std::mutex Profiler::sm_mutex;
std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers;
Timer Profiler::sm_timer;
profiler::HostTime Profiler::sm_start_at;
std::atomic_uint64_t Profiler::sm_last_id = 0;
bool Profiler::sm_profiling = false;
thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>();
std::atomic_size_t Profiler::sm_preferred_capacity;
auto Profiler::get_thread_dict() -> thread_dict_t {
MGB_LOCK_GUARD(sm_mutex);
thread_dict_t thread_dict;
for (auto&& [tid, profiler]: sm_profilers) {
thread_dict[tid] = profiler->m_thread_name;
......@@ -69,15 +55,13 @@ auto Profiler::get_thread_dict() -> thread_dict_t {
return thread_dict;
}
void Profiler::dump_profile(std::string basename, std::string format, results_t results, options_t options) {
auto thread_dict = get_thread_dict();
if (format == "chrome_timeline.json") {
profiler::dump_chrome_timeline(basename, options, thread_dict, results);
} else if (format == "memory_flow.svg") {
profiler::dump_memory_flow(basename, options, thread_dict, results);
} else {
void Profiler::dump_profile(std::string basename, std::string format, bundle_t result) {
static std::unordered_map<std::string, void(*)(std::string, bundle_t)> format_table = {
{"chrome_timeline.json", profiler::dump_chrome_timeline},
{"memory_flow.svg", profiler::dump_memory_flow},
};
auto iter = format_table.find(format);
if (iter == format_table.end()) {
mgb_log_error("unsupported profiling format %s", format.c_str());
return;
}
return (iter->second)(basename, std::move(result));
}
} // namespace imperative
......
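With the dispatch table, adding an output format is a matter of registering one more function pointer. A self-contained sketch of the pattern, using hypothetical `Bundle` and `dump_text` names:

    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <utility>

    struct Bundle {};  // stand-in for Profiler::bundle_t

    void dump_text(std::string basename, Bundle) {
        std::printf("writing %s.txt\n", basename.c_str());
    }

    using dump_fn = void (*)(std::string, Bundle);

    void dump(std::string basename, std::string format, Bundle result) {
        static const std::unordered_map<std::string, dump_fn> table = {
                {"txt", dump_text},  // one entry per supported format
        };
        if (auto it = table.find(format); it != table.end()) {
            it->second(std::move(basename), std::move(result));
        } else {
            std::printf("unsupported profiling format %s\n", format.c_str());
        }
    }

    int main() { dump("profile", "txt", {}); }  // writing profile.txt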
......@@ -17,6 +17,10 @@
#error Unsupported platform
#endif
#include "nlohmann/json.hpp"
#include "megbrain/utils/debug.h"
#include "./formats.h"
#include "./states.h"
......@@ -53,8 +57,9 @@ public:
m_idx = idx;
return *this;
}
ChromeTraceEvent& ts(uint64_t ts) {
m_ts = ts;
template <typename TDuration>
ChromeTraceEvent& ts(TDuration ts) {
m_ts = std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(ts).count();
return *this;
}
ChromeTraceEvent& dur(uint64_t dur) {
......@@ -69,51 +74,46 @@ public:
m_bp = bp;
return *this;
}
ChromeTraceEvent& args(std::shared_ptr<json::Object> args) {
ChromeTraceEvent& args(nlohmann::json args) {
m_args = std::move(args);
return *this;
}
ChromeTraceEvent& arg(std::string key, std::string value) {
if (!m_args) {
m_args = json::Object::make();
}
(*m_args)[key] = json::String::make(value);
m_args[key] = value;
return *this;
}
ChromeTraceEvent& arg(std::string key, double value) {
if (!m_args) {
m_args = json::Object::make();
}
(*m_args)[key] = json::Number::make(value);
m_args[key] = value;
return *this;
}
ChromeTraceEvent& arg(std::string key, std::shared_ptr<json::Value> value) {
if (!m_args) {
m_args = json::Object::make();
}
(*m_args)[key] = value;
ChromeTraceEvent& arg(std::string key, nlohmann::json value) {
m_args[key] = value;
return *this;
}
ChromeTraceEvent& stack(Trace trace) {
m_stack = std::move(trace);
return *this;
}
std::shared_ptr<json::Object> to_json() const {
auto result = json::Object::make();
nlohmann::json to_json() const {
nlohmann::json result;
auto prop_str = [&](auto key, auto value) {
if (value.empty()) {
return;
}
(*result)[key] = json::String::make(value);
result[key] = value;
};
auto prop_num = [&](auto key, auto value) {
if (!value) {
return;
}
(*result)[key] = json::Number::make(value.value());
result[key] = value.value();
};
auto prop_char = [&](auto key, auto value) {
if (!value) {
return;
}
(*result)[key] = json::String::make(std::string{} + value.value());
result[key] = std::string{} + value.value();
};
prop_str("name", m_name);
prop_str("cat", m_cat);
......@@ -126,8 +126,16 @@ public:
prop_num("dur", m_dur);
prop_char("ph", m_ph);
prop_char("bp", m_bp);
if (m_args) {
(*result)["args"] = m_args;
if (!m_args.empty()) {
result["args"] = m_args;
}
if (m_stack) {
nlohmann::json stack;
for (auto&& frame: m_stack->frames()) {
stack.push_back(ssprintf("%s%ld", frame.node->name().c_str(), frame.version));
}
std::reverse(stack.begin(), stack.end());
result["stack"] = stack;
}
return result;
}
......@@ -140,11 +148,12 @@ private:
std::optional<uint64_t> m_pid;
std::optional<uint64_t> m_id;
std::optional<uint64_t> m_idx;
std::optional<uint64_t> m_ts;
std::optional<double> m_ts;
std::optional<uint64_t> m_dur;
std::optional<char> m_ph;
std::optional<char> m_bp;
std::shared_ptr<json::Object> m_args;
nlohmann::json m_args;
std::optional<Trace> m_stack;
};
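For reference, a standalone sketch of what the builder above emits, built directly with nlohmann::json; field names follow the Chrome Trace Event format (`ph` 'B'/'E' bracket a duration, and `ts` is in microseconds, hence the `duration_cast` in `ts()`):

    #include <chrono>
    #include <iostream>
    #include "nlohmann/json.hpp"

    int main() {
        using namespace std::chrono;
        nlohmann::json event{
                {"name", "OpDispatch"},
                {"ph", "B"},
                {"pid", 1234},
                {"tid", 0},
                // ts is in microseconds, matching ts() above
                {"ts", duration_cast<duration<double, std::micro>>(1500ns).count()},
        };
        std::cout << event.dump() << "\n";
        // {"name":"OpDispatch","ph":"B","pid":1234,"tid":0,"ts":1.5}
    }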
class ChromeTraceEvents {
......@@ -154,368 +163,218 @@ public:
return m_content.back();
}
std::shared_ptr<json::Value> to_json() const {
auto result = json::Object::make();
auto event_list = json::Array::make();
std::string& metadata(std::string key) {
return m_metadata[key];
}
nlohmann::json to_json() const {
nlohmann::json result;
nlohmann::json event_list;
nlohmann::json metadata;
for (auto&& event: m_content) {
event_list->add(event.to_json());
event_list.push_back(event.to_json());
}
for (auto&& [key, value]: m_metadata) {
metadata[key] = value;
}
(*result)["traceEvents"] = event_list;
result["traceEvents"] = event_list;
result["metadata"] = metadata;
return result;
}
std::string to_string() const {
auto json = to_json();
return "{" "traceEvents:" + nlohmann::to_string(json["traceEvents"]) + ","
"metadata:" + nlohmann::to_string(json["metadata"]) + "}";
}
private:
std::vector<ChromeTraceEvent> m_content;
std::unordered_map<std::string, std::string> m_metadata;
};
void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results){
auto pid = getpid();
ProfileDataCollector collector;
ProfileState state;
#define HANDLE_EVENT(type, ...) \
collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ );
struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
ChromeTraceEvents trace_events;
decltype(getpid()) pid = getpid();
std::string pid_str = std::to_string(pid);
#define NEW_HOST(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(state[tid].index).ph(PH).ts((double)time/1e3)
#define NEW_DEVICE(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(256+state[event.event->comp_node()].index).ph(PH).ts((double)get_device_time(event.event, time)/1e3)
#define OP_NAME op_state.name
#define OP_KERNEL_NAME (op_state.name + "")
#define OP_PROPS get_op_args(op_state)
#define OP_ID event.op_id
#define TENSOR_PROPS get_tensor_args(tensor_state, time)
#define TENSOR_INFO get_tensor_info(tensor_state, time)
#define TENSOR_COMMAND_KIND print_tensor_command_kind(event.kind)
#define HANDLE_PLAIN_EVENT(START, FINISH, NAME_EXPR)\
HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\
HANDLE_EVENT(FINISH, { NEW_HOST(NAME_EXPR, 'E'); })
#define HANDLE_TENSOR_EVENT(START, FINISH, NAME_EXPR)\
HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\
HANDLE_EVENT(FINISH, { auto& tensor_state = state.tensors[event.tensor_id]; NEW_HOST(NAME_EXPR, 'E').args(TENSOR_PROPS); })
#define INC_COUNTER(NAME, DELTA)\
{ state.statics.NAME += DELTA; NEW_HOST(#NAME, 'C').arg(#NAME, state.statics.NAME); }
auto get_tensor_args = [](const ProfileTensorState& tensor, uint64_t time) -> std::shared_ptr<json::Object> {
auto args = json::Object::make();
(*args)["id"] = json::Number::make(tensor.id);
(*args)["name"] = json::String::make(tensor.name);
(*args)["shape"] = json::String::make(tensor.layout.TensorShape::to_string());
(*args)["dtype"] = json::String::make(tensor.layout.dtype.name());
(*args)["nr_elements"] = json::Number::make(tensor.layout.total_nr_elems());
(*args)["device"] = json::String::make(tensor.device.to_string());
if (tensor.produced) {
(*args)["living_time"] = json::String::make(std::to_string((time - tensor.produced + tensor.living_time)/1e6) + "ms");
}
return args;
};
auto get_tensor_info = [](const ProfileTensorState& tensor, uint64_t time) -> std::string {
std::string name = tensor.name;
std::string shape = tensor.layout.TensorShape::to_string();
std::string size_in_bytes = std::to_string(tensor.size_in_bytes());
std::string device = tensor.device.to_string();
std::string dtype = tensor.layout.dtype.name();
return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.c_str());
};
ChromeTraceEvent& new_event(std::string name, char ph, size_t tid, profiler::HostTime time) {
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(since_start(time));
}
auto get_op_args = [&](const ProfileOperatorState& op) -> std::shared_ptr<json::Object> {
auto args = json::Object::make();
auto params = op.params;
for (auto&& [name, value]: params) {
(*args)[name] = json::String::make(value);
}
(*args)["__id__"] = json::Number::make(op.id);
(*args)["__name__"] = json::String::make(op.name);
(*args)["__device__"] = json::String::make(op.device.to_string());
return args;
ChromeTraceEvent& new_host_event(std::string name, char ph) {
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time));
};
auto get_device_time = [&](const std::shared_ptr<CompNode::Event>& event, uint64_t host) -> uint64_t {
event->host_wait();
auto& device_state = state.devices[event->comp_node()];
if (!device_state.base_event) {
device_state.base_event = event;
device_state.base_time = host;
return host;
}
uint64_t device = device_state.base_event->elapsed_time_until(*event) * 1e9 + device_state.base_time;
return std::max(device, host);
ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) {
using namespace std::literals::chrono_literals;
auto time = since_start(to_device_time(current->time, device));
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time);
};
auto print_tensor_command_kind = [&](int kind) -> const char* {
const char* to_cstr(TensorCommandKind kind) {
switch(kind) {
case TensorCommandEvent::Put:
case TensorCommandKind::Put:
return "Put";
case TensorCommandEvent::Drop:
case TensorCommandKind::Drop:
return "Drop";
case TensorCommandEvent::Del:
case TensorCommandKind::Del:
return "Del";
case TensorCommandEvent::SwapIn:
case TensorCommandKind::SwapIn:
return "SwapIn";
case TensorCommandEvent::SwapOut:
case TensorCommandKind::SwapOut:
return "SwapOut";
case TensorCommandEvent::RecFree:
case TensorCommandKind::RecFree:
return "RecFree";
case TensorCommandEvent::ReGen:
case TensorCommandKind::ReGen:
return "ReGen";
case TensorCommandKind::GetValue:
return "GetValue";
}
return "UnknownCommand";
};
HANDLE_EVENT(OpDispatchEvent, {
auto& op_state = state.operators[OP_ID] = {};
op_state.id = OP_ID;
op_state.name = event.op_name;
op_state.params = event.op_params();
op_state.inputs = event.inputs;
op_state.outputs = event.outputs;
NEW_HOST("OpDispatch", 'B');
NEW_HOST(ssprintf("%d", pid), 's')
.cat("OpDispatch")
.id(OP_ID)
.scope(std::to_string(pid));
NEW_HOST("OpDispatch", 'E').args(OP_PROPS);
INC_COUNTER(op_enqueue_count, 1);
});
HANDLE_EVENT(OpExecuteEvent, {
mgb_assert(OP_ID != 0);
mgb_assert(state.operators.count(OP_ID) > 0);
auto& op_state = state.operators[OP_ID];
op_state.host_begin = time;
NEW_HOST(OP_NAME, 'B');
//.args(OP_PROPS);
NEW_HOST(ssprintf("%d", pid), 't')
.cat("OpDispatch")
.id(OP_ID)
.scope(std::to_string(pid));
INC_COUNTER(op_execute_count, 1);
});
HANDLE_EVENT(OpExecuteFinishEvent, {
auto& op_state = state.operators[event.op_id];
op_state.host_end = time;
NEW_HOST(OP_NAME, 'E')
.args(OP_PROPS);
});
HANDLE_EVENT(KernelExecuteEvent, {
auto& op_state = state.operators[event.op_id];
op_state.device_begin = event.event;
NEW_HOST(ssprintf("%d", pid), 's')
.id(event.kernel_id)
.cat("KernelLaunch")
.scope(std::to_string(pid));
NEW_DEVICE(OP_KERNEL_NAME, 'B')
.cat("Kernel");
//.args(OP_PROPS);
NEW_DEVICE(ssprintf("%d", pid), 'f')
.id(event.kernel_id)
.bp('e')
.cat("KernelLaunch")
.scope(std::to_string(pid));
});
HANDLE_EVENT(KernelExecuteFinishEvent, {
auto& op_state = state.operators[event.op_id];
op_state.device_end = event.event;
NEW_DEVICE(OP_KERNEL_NAME, 'E')
.cat("Kernel")
.args(OP_PROPS);
});
HANDLE_EVENT(TensorDeclareEvent, {
auto& tensor_state = state.tensors[event.tensor_id] = {};
tensor_state.id = event.tensor_id;
tensor_state.name = event.name;
});
HANDLE_EVENT(TensorProduceEvent, {
auto& tensor_state = state.tensors[event.tensor_id];
tensor_state.device = event.device;
tensor_state.layout = event.layout;
tensor_state.produced = time;
if (!tensor_state.living_time) {
NEW_HOST(ssprintf("%d", pid), 's')
.id(event.tensor_id)
.cat("TensorLink")
.scope(std::to_string(pid));
} else {
NEW_HOST(ssprintf("%d", pid), 't')
.id(event.tensor_id)
.cat("TensorLink")
.scope(std::to_string(pid));
}
INC_COUNTER(alive_tensor_count, 1);
INC_COUNTER(produce_tensor_count, 1);
state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()});
state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced});
});
HANDLE_EVENT(TensorUsageEvent, {
NEW_HOST(ssprintf("%d", pid), 't')
.id(event.tensor_id)
.cat("TensorLink")
.scope(std::to_string(pid));
});
HANDLE_EVENT(TensorReleaseEvent, {
auto& tensor_state = state.tensors[event.tensor_id];
tensor_state.living_time += time - tensor_state.produced;
tensor_state.produced = 0;
INC_COUNTER(alive_tensor_count, -1);
INC_COUNTER(erase_tensor_count, 1);
state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()});
state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced});
NEW_HOST(ssprintf("%d", pid), 't')
.id(event.tensor_id)
.cat("TensorLink")
.scope(std::to_string(pid));
});
HANDLE_EVENT(TensorEraseEvent, {
auto& tensor_state = state.tensors[event.tensor_id];
if (tensor_state.living_time) {
NEW_HOST(ssprintf("%d", pid), 'f')
.id(event.tensor_id)
.bp('e')
.cat("TensorLink")
.scope(std::to_string(pid));
}
if (event.use_count == 0) {
INC_COUNTER(redundant_tensor_count, 1);
}
});
HANDLE_EVENT(TensorGetPropEvent, {
auto& tensor_state = state.tensors[event.tensor_id];
NEW_HOST("TensorGetProp", 'X')
.dur(0).args(TENSOR_PROPS);
});
HANDLE_EVENT(TensorWaitPropEvent, {
NEW_HOST("TensorWaitProp", 'B');
if (event.prop == TensorProp::HostValue) {
INC_COUNTER(wait_value_count, 1);
} else if (event.prop == TensorProp::Shape) {
INC_COUNTER(wait_shape_count, 1);
}
INC_COUNTER(wait_prop_count, 1);
});
HANDLE_EVENT(TensorWaitPropFinishEvent, {
auto& tensor_state = state.tensors[event.tensor_id];
if (event.notified) {
NEW_HOST(ssprintf("%d", pid), 'f')
.id(event.tensor_id)
.bp('e')
.cat("TensorProp")
.scope(std::to_string(pid));
}
NEW_HOST("TensorWaitProp", 'E')
.args(TENSOR_PROPS);
});
HANDLE_EVENT(TensorNotifyPropEvent, {
NEW_HOST(ssprintf("%d", pid), 's')
.id(event.tensor_id)
.cat("TensorProp")
.scope(std::to_string(pid));
});
HANDLE_EVENT(ShapeInferEvent, {
if (event.success) {
INC_COUNTER(infer_shape_valid_count, 1);
} else {
INC_COUNTER(infer_shape_invalid_count, 1);
}
});
HANDLE_EVENT(SampleDeviceEvent, {
NEW_HOST("TopKTensor", 'B');
});
HANDLE_EVENT(SampleDeviceFinishEvent, {
std::string device_name = event.device.locator().to_string();
std::string prop_name = ssprintf("%s_alloc_memory", device_name.c_str());
NEW_HOST(prop_name, 'C')
.arg(prop_name, event.total_memory - event.free_memory);
auto top_k_tensors = state.top_k_tensor_in_device(event.device, options.at("num_tensor_watch"));
auto& top_k_event = NEW_HOST("TopKTensor", 'E');
for (size_t i = 0; i < top_k_tensors.size(); ++i) {
auto tensor_id = top_k_tensors[i];
auto& tensor_state = state.tensors[tensor_id];
top_k_event.arg(ssprintf("top%03d", (int)i), TENSOR_INFO); //%03d is always enough
}
});
HANDLE_EVENT(WorkerExceptionEvent, {
INC_COUNTER(exception_count, 1);
});
HANDLE_EVENT(TensorCommandEvent, {
NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'B');
});
HANDLE_EVENT(TensorCommandFinishEvent, {
auto& tensor_state = state.tensors[event.tensor_id];
NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'E')
.args(TENSOR_PROPS);
});
HANDLE_EVENT(ScopeEvent, {
NEW_HOST(event.name, 'B');
state.threads[tid].scope_stack.push_back(event.name);
});
HANDLE_EVENT(ScopeFinishEvent, {
NEW_HOST(event.name, 'E');
mgb_assert(state.threads[tid].scope_stack.back() == event.name);
state.threads[tid].scope_stack.pop_back();
});
HANDLE_TENSOR_EVENT(OpInputEvent, OpInputFinishEvent, ssprintf("Input %zu", event.tensor_id));
HANDLE_TENSOR_EVENT(OpOutputEvent, OpOutputFinishEvent, ssprintf("Output %zu", event.tensor_id));
HANDLE_TENSOR_EVENT(OpDelEvent, OpDelFinishEvent, ssprintf("Del %zu", event.tensor_id));
HANDLE_PLAIN_EVENT(StartProfileEvent, StartProfileFinishEvent, "StartProfile");
HANDLE_PLAIN_EVENT(StopProfileEvent, StopProfileFinishEvent, "StopProfile");
HANDLE_PLAIN_EVENT(CustomEvent, CustomFinishEvent, event.title);
HANDLE_PLAIN_EVENT(AutoEvictEvent, AutoEvictFinishEvent, "AutoEvict");
if (results.size() > 0) {
uint64_t time = results[0].second.time;
trace_events.new_event().name("Metadata").ph('I').pid(pid).ts(0).arg("localTime", time/1e3);
}
for (auto&& result: results) {
collector(result.second.id, result.first, result.second.time, result.second.data);
}
for (auto&& [tid, thread]: state.threads) {
if (!thread_dict.count(tid)) {
continue;
template <typename TEvent>
void visit_event(const TEvent &event) {
if constexpr (std::is_same_v<TEvent, OpDispatchEvent>) {
new_host_event("OpDispatch", 'B');
new_host_event(pid_str, 's')
.cat("OpDispatch")
.id(event.op_id)
.scope(pid_str);
new_host_event("OpDispatch", 'E').args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) {
mgb_assert(event.op_id != 0);
current_op->execute_begin = current->time;
new_host_event(current_op->name, 'B');
new_host_event(pid_str, 't')
.cat("OpDispatch")
.id(current_op->id)
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) {
current_op->execute_end = current->time;
new_host_event(current_op->name, 'E')
.args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) {
new_host_event(pid_str, 's')
.id(event.kernel_id)
.cat("KernelLaunch")
.scope(pid_str);
new_device_event(current_op->name, 'B', event.device)
.cat("Kernel");
new_device_event(pid_str, 'f', event.device)
.id(event.kernel_id)
.bp('e')
.cat("KernelLaunch")
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, KernelLaunchFinishEvent>) {
new_device_event(current_op->name, 'E', event.device)
.cat("Kernel")
.args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) {
if (current_tensor->living_time == profiler::Duration::zero()) {
new_host_event(pid_str, 's')
.id(event.tensor_id)
.cat("TensorLink")
.scope(pid_str);
} else {
new_host_event(pid_str, 't')
.id(event.tensor_id)
.cat("TensorLink")
.scope(pid_str);
}
} else if constexpr (std::is_same_v<TEvent, TensorUsageEvent>) {
new_host_event(pid_str, 't')
.id(event.tensor_id)
.cat("TensorLink")
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) {
current_tensor->living_time += current->time - current_tensor->produced;
current_tensor->produced = {};
new_host_event(pid_str, 't')
.id(event.tensor_id)
.cat("TensorLink")
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, TensorEraseEvent>) {
if (current_tensor->living_time != profiler::Duration::zero()) {
new_host_event(pid_str, 'f')
.id(event.tensor_id)
.bp('e')
.cat("TensorLink")
.scope(pid_str);
}
} else if constexpr (std::is_same_v<TEvent, TensorGetPropEvent>) {
new_host_event("TensorGetProp", 'X')
.dur(0).args(current_tensor->detail(current->time));
} else if constexpr (std::is_same_v<TEvent, TensorWaitPropFinishEvent>) {
new_host_event(pid_str, 'f')
.id(event.tensor_id)
.bp('e')
.cat("TensorProp")
.scope(pid_str);
new_host_event("TensorWaitProp", 'E')
.args(current_tensor->detail(current->time));
} else if constexpr (std::is_same_v<TEvent, TensorNotifyPropEvent>) {
new_host_event(pid_str, 's')
.id(event.tensor_id)
.cat("TensorProp")
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) {
std::string device_name = event.device.locator().to_string();
new_host_event("memory", 'C')
.arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory);
} else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) {
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B');
} else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) {
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'E')
.args(current_tensor->detail(current->time));
} else if constexpr (std::is_same_v<TEvent, ScopeEvent>) {
new_host_event(event.name, 'B');
} else if constexpr (std::is_same_v<TEvent, ScopeFinishEvent>) {
new_host_event(event.name, 'E');
} else if constexpr (std::is_same_v<TEvent, OpInputEvent>) {
new_host_event(ssprintf("Input %zu", event.tensor_id), 'B')
.args(current_tensor->detail(current->time));
} else if constexpr (std::is_same_v<TEvent, OpInputFinishEvent>) {
new_host_event(ssprintf("Input %zu", event.tensor_id), 'E')
.args(current_tensor->detail(current->time));
} else if constexpr (std::is_same_v<TEvent, OpOutputEvent>) {
new_host_event(ssprintf("Output %zu", event.tensor_id), 'B')
.args(current_tensor->detail(current->time));
} else if constexpr (std::is_same_v<TEvent, OpOutputFinishEvent>) {
new_host_event(ssprintf("Output %zu", event.tensor_id), 'E')
.args(current_tensor->detail(current->time));
} else if constexpr (std::is_same_v<TEvent, StartProfileEvent>) {
new_host_event("StartProfile", 'B');
} else if constexpr (std::is_same_v<TEvent, StartProfileFinishEvent>) {
new_host_event("StartProfile", 'E');
} else if constexpr (std::is_same_v<TEvent, StopProfileEvent>) {
new_host_event("StopProfile", 'B');
} else if constexpr (std::is_same_v<TEvent, StopProfileFinishEvent>) {
new_host_event("StopProfile", 'E');
} else if constexpr (std::is_same_v<TEvent, CustomEvent>) {
new_host_event(event.title, 'B');
} else if constexpr (std::is_same_v<TEvent, CustomFinishEvent>) {
new_host_event(event.title, 'E');
} else if constexpr (std::is_same_v<TEvent, AutoEvictEvent>) {
new_host_event("AutoEvict", 'B');
} else if constexpr (std::is_same_v<TEvent, AutoEvictFinishEvent>) {
new_host_event("AutoEvict", 'E');
}
trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(thread.index).arg("name", thread_dict[tid]);
}
for (auto&& [device, device_state]: state.devices) {
trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(256+device_state.index).arg("name", device.to_string());
void notify_counter(std::string key, int64_t old_val, int64_t new_val) {
new_host_event(key, 'C').arg("value", new_val);
}
};
trace_events.to_json()->writeto_fpath(filename);
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result){
ChromeTimelineEventVisitor visitor;
visitor.process_events(result);
visitor.trace_events.metadata("localTime") = std::to_string(result.start_at.time_since_epoch().count());
std::string json_repr = visitor.trace_events.to_string();
mgb::debug::write_to_file(filename.c_str(), json_repr);
}
}
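`EventVisitor` (defined in `states.h`, which is not part of this diff) walks the recorded events, fills in `current`, `current_op`, and `current_tensor`, and calls the derived class's `visit_event` for each record. A stripped-down sketch of that CRTP plus `if constexpr` dispatch, with hypothetical event types:

    #include <iostream>
    #include <type_traits>
    #include <variant>
    #include <vector>

    // Hypothetical events; the real ones live in profiler/events.h.
    struct OpExecuteEvent { int op_id; };
    struct ScopeEvent { const char* name; };
    using AnyEvent = std::variant<OpExecuteEvent, ScopeEvent>;

    // CRTP base owns the event loop; the derived class supplies visit_event.
    template <typename TSelf>
    struct EventVisitor {
        void process_events(const std::vector<AnyEvent>& events) {
            for (auto&& record : events) {
                std::visit([&](const auto& e) { static_cast<TSelf*>(this)->visit_event(e); },
                           record);
            }
        }
    };

    struct PrintVisitor : EventVisitor<PrintVisitor> {
        template <typename TEvent>
        void visit_event(const TEvent& event) {
            // if constexpr selects the handler at compile time, as in the visitor above
            if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) {
                std::cout << "op " << event.op_id << "\n";
            } else if constexpr (std::is_same_v<TEvent, ScopeEvent>) {
                std::cout << "scope " << event.name << "\n";
            }
        }
    };

    int main() {
        PrintVisitor v;
        v.process_events({OpExecuteEvent{1}, ScopeEvent{"train"}});
    }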
......@@ -12,7 +12,9 @@
#pragma once
#include "megbrain/utils/small_vector.h"
#include "megbrain/imperative/profiler.h"
#include "../interpreter/stack_manager.h"
#include "../op_trait.h"
namespace mgb::imperative::profiler {
......@@ -52,6 +54,11 @@ struct ToStringTrait<profiler::TensorProp>{
namespace mgb::imperative::profiler {
using Trace = interpreter::intl::StackManager::Trace;
struct ProfileOperatorState;
struct ProfileTensorState;
#define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__;
#define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__;
......@@ -61,6 +68,7 @@ DEF_EVENT(OpDispatch, {
std::function<OpParams()> op_params;
SmallVector<uint64_t> inputs;
SmallVector<uint64_t> outputs;
Trace trace;
});
DEF_DUR_EVENT(OpInput, {
......@@ -68,11 +76,6 @@ DEF_DUR_EVENT(OpInput, {
TensorShape shape;
});
DEF_DUR_EVENT(OpDel, {
uint64_t tensor_id;
TensorShape shape;
});
DEF_DUR_EVENT(OpOutput, {
uint64_t tensor_id;
TensorShape shape;
......@@ -80,16 +83,13 @@ DEF_DUR_EVENT(OpOutput, {
DEF_DUR_EVENT(OpExecute, {
uint64_t op_id;
SmallVector<CompNode> device_list;
});
DEF_DUR_EVENT(OpPostExecute, {
uint64_t op_id;
});
DEF_DUR_EVENT(KernelExecute, {
DEF_DUR_EVENT(KernelLaunch, {
uint64_t op_id;
uint64_t kernel_id;
std::shared_ptr<CompNode::Event> event;
CompNode device;
});
DEF_EVENT(TensorDeclare, {
......@@ -128,19 +128,12 @@ DEF_EVENT(TensorNotifyProp, {
TensorProp prop;
});
DEF_EVENT(TensorWaitProp, {
DEF_DUR_EVENT(TensorWaitProp, {
uint64_t tensor_id;
uint64_t wait_id;
TensorProp prop;
});
DEF_EVENT(TensorWaitPropFinish, {
uint64_t tensor_id;
uint64_t wait_id;
TensorProp prop;
bool notified;
});
DEF_DUR_EVENT(SampleDevice, {
CompNode device;
size_t total_memory;
......@@ -157,13 +150,10 @@ DEF_DUR_EVENT(Scope, {
std::string name;
});
DEF_DUR_EVENT(DeviceScope, {
std::string name;
std::shared_ptr<CompNode::Event> event;
DEF_DUR_EVENT(Sync, {
Trace trace;
});
DEF_DUR_EVENT(Sync, {});
DEF_DUR_EVENT(StartProfile, {
size_t capture_count;
});
......@@ -172,10 +162,13 @@ DEF_DUR_EVENT(StopProfile, {
size_t escape_count;
});
enum class TensorCommandKind {
Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue
};
DEF_DUR_EVENT(TensorCommand, {
enum Kind {
Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue
};
using Kind = TensorCommandKind;
uint64_t tensor_id;
Kind kind;
});
......@@ -187,6 +180,17 @@ DEF_DUR_EVENT(Custom, {
std::string content;
});
DEF_EVENT(RecordDevice, {
std::shared_ptr<CompNode::Event> event;
});
DEF_DUR_EVENT(HostToDevice, {
TensorLayout layout;
CompNode device;
void* host_ptr;
void* device_ptr;
});
#undef DEF_EVENT
#undef DEF_DUR_EVENT
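For reference, the macros expand each entry into plain structs; e.g. `DEF_DUR_EVENT(Sync, { Trace trace; })` hand-expands to:

    struct SyncEvent { Trace trace; };        // emitted when the sync begins
    struct SyncFinishEvent { Trace trace; };  // emitted when the sync completes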
......
......@@ -15,10 +15,12 @@
#include "megbrain/imperative/profiler.h"
#include "./states.h"
namespace mgb::imperative::profiler {
void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results);
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result);
void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results);
void dump_memory_flow(std::string filename, Profiler::bundle_t result);
}
......@@ -89,7 +89,8 @@ struct MemoryChunk {
std::array<uintptr_t, 2> address;
std::string name;
TensorLayout layout;
std::array<uint64_t, 2> time;
std::array<profiler::Duration, 2> time;
std::optional<uint64_t> group;
bool empty() const {
return address[1] - address[0] == 0;
......@@ -111,9 +112,9 @@ struct MemoryFlow {
return {addr_begin, addr_end};
}
std::pair<uint64_t, uint64_t> time_range() const {
auto time_begin = std::numeric_limits<uint64_t>::max();
auto time_end = std::numeric_limits<uint64_t>::min();
std::pair<profiler::Duration, profiler::Duration> time_range() const {
auto time_begin = profiler::Duration::max();
auto time_end = profiler::Duration::min();
for(auto&& [id, chunk]: chunks) {
MGB_MARK_USED_VAR(id);
if (chunk.empty()) continue;
......@@ -123,27 +124,6 @@ struct MemoryFlow {
return {time_begin, time_end};
}
std::shared_ptr<json::Array> to_json() const {
auto results = json::Array::make();
for(auto&& [id, chunk]: chunks) {
MGB_MARK_USED_VAR(id);
if (chunk.empty()) continue;
auto address = json::Array::make();
auto time = json::Array::make();
address->add(json::String::make(std::to_string(chunk.address[0])));
address->add(json::String::make(std::to_string(chunk.address[1])));
time->add(json::String::make(std::to_string(chunk.time[0])));
time->add(json::String::make(std::to_string(chunk.time[1])));
results->add(json::Object::make({
{"address", address},
{"name", json::String::make(chunk.name)},
{"layout", json::String::make(chunk.layout.to_string())},
{"time", time}
}));
}
return results;
}
XMLWriter to_svg() const {
XMLWriter writer;
auto&& [addr_begin, addr_end] = address_range();
......@@ -157,13 +137,13 @@ struct MemoryFlow {
svg.attr("xmlns:tag", std::string{"https://megengine.org.cn"});
double time_scale = 1e5;
double addr_scale = 1e6;
svg.attr("width", (time_end-time_begin)/time_scale);
svg.attr("width", (time_end-time_begin).count()/time_scale);
svg.attr("height", (addr_end-addr_begin)/addr_scale);
{
auto rect = writer.element("rect");
rect.attr("x", 0);
rect.attr("y", 0);
rect.attr("width", (time_end-time_begin)/time_scale);
rect.attr("width", (time_end-time_begin).count()/time_scale);
rect.attr("height", (addr_end-addr_begin)/addr_scale);
rect.attr("fill", std::string{"blue"});
}
......@@ -177,7 +157,7 @@ struct MemoryFlow {
{1000 * ms, "#888888"},
{std::numeric_limits<double>::infinity(), "#555555"},
};
auto time2str = [](uint64_t ns){
auto time2str = [](profiler::Duration ns){
using pair_t = std::pair<uint64_t, const char*>;
static pair_t units[] = {
{1, "ns "},
......@@ -189,9 +169,9 @@ struct MemoryFlow {
auto comparator = [](const pair_t& lhs, const pair_t& rhs) {
return lhs.first < rhs.first;
};
while (ns > 0) {
auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns, ""), comparator) - 1;
builder += std::to_string(ns / iter->first) + iter->second;
while (ns.count() > 0) {
auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns.count(), ""), comparator) - 1;
builder += std::to_string(ns.count() / iter->first) + iter->second;
ns = ns % iter->first;
}
return builder;
......@@ -218,11 +198,11 @@ struct MemoryFlow {
for (auto&& [id, chunk]: chunks) {
MGB_MARK_USED_VAR(id);
if (chunk.empty()) continue;
double left = (chunk.time[0]-time_begin)/time_scale;
double right = (chunk.time[1]-time_begin)/time_scale;
double left = (chunk.time[0]-time_begin).count()/time_scale;
double right = (chunk.time[1]-time_begin).count()/time_scale;
double top = (chunk.address[0]-addr_begin)/addr_scale;
double bottom = (chunk.address[1]-addr_begin)/addr_scale;
double duration = chunk.time[1] - chunk.time[0];
double duration = (chunk.time[1] - chunk.time[0]).count();
{
auto rect = writer.element("rect");
rect.attr("x", left);
......@@ -241,70 +221,48 @@ struct MemoryFlow {
mge_attr("produced", time2str(chunk.time[0]));
mge_attr("erased", time2str(chunk.time[1]));
mge_attr("duration", time2str(chunk.time[1] - chunk.time[0]));
if (chunk.group) {
mge_attr("group", std::to_string(*chunk.group));
}
}
}
return writer;
}
};
void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results) {
MemoryFlow flow;
ProfileDataCollector collector;
ProfileState state;
#define HANDLE_EVENT(type, ...) \
collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ );
HANDLE_EVENT(TensorDeclareEvent, {
auto& tensor_state = state.tensors[event.tensor_id] = {};
tensor_state.id = event.tensor_id;
tensor_state.name = event.name;
});
struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> {
MemoryFlow memory_flow;
HANDLE_EVENT(TensorProduceEvent, {
auto& tensor_state = state.tensors[event.tensor_id];
tensor_state.device = event.device;
tensor_state.layout = event.layout;
tensor_state.produced = time;
state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()});
state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced});
auto& chunk = flow.chunks[event.tensor_id];
uintptr_t address = reinterpret_cast<uintptr_t>(event.ptr);
auto span = event.layout.span();
auto dtype = event.layout.dtype;
// assume dtype is not lowbit
if (!address) {
chunk.address = {0, 0};
} else {
chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()};
template <typename TEvent>
void visit_event(const TEvent &event) {
if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) {
auto& chunk = memory_flow.chunks[event.tensor_id];
uint64_t address = reinterpret_cast<uintptr_t>(event.ptr);
auto span = event.layout.span();
auto dtype = event.layout.dtype;
// assume dtype is not lowbit
if (!address) {
chunk.address = {0, 0};
} else {
chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()};
}
chunk.layout = event.layout;
chunk.time[0] = since_start(to_device_time(current->time, current_tensor->device));
chunk.name = current_tensor->name;
chunk.group = current_tensor->source;
} else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) {
auto& chunk = memory_flow.chunks[event.tensor_id];
chunk.time[1] = since_start(to_device_time(current->time, current_tensor->device));
}
chunk.layout = tensor_state.layout;
chunk.time[0] = time;
chunk.name = tensor_state.name;
});
HANDLE_EVENT(TensorReleaseEvent, {
auto& tensor_state = state.tensors[event.tensor_id];
state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()});
state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced});
auto& chunk = flow.chunks[event.tensor_id];
chunk.time[1] = time;
});
HANDLE_EVENT(ScopeEvent, {
state.threads[tid].scope_stack.push_back(event.name);
});
HANDLE_EVENT(ScopeFinishEvent, {
mgb_assert(state.threads[tid].scope_stack.back() == event.name);
state.threads[tid].scope_stack.pop_back();
});
for (auto&& result: results) {
collector(result.second.id, result.first, result.second.time, result.second.data);
}
debug::write_to_file(filename.c_str(), flow.to_svg().to_string());
void notify_counter(std::string key, int64_t old_val, int64_t new_val) {}
};
void dump_memory_flow(std::string filename, Profiler::bundle_t result) {
MemoryFlowVisitor visitor;
visitor.process_events(std::move(result));
debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string());
}
}
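// Hedged usage sketch of the new bundle-based entry point: a caller collects
// once and hands the whole bundle over (the call site and the
// "memory_flow.svg" filename are illustrative, not from this diff):
//
//     auto bundle = Profiler::collect();
//     dump_memory_flow("memory_flow.svg", std::move(bundle));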
......@@ -3,6 +3,9 @@
#include <set>
#include <any>
#include <typeindex>
#include <sstream>
#include "nlohmann/json.hpp"
#include "megbrain/tensor.h"
......@@ -10,24 +13,16 @@
namespace mgb::imperative::profiler {
struct ProfileDeviceState {
int64_t index;
CompNode device;
std::shared_ptr<CompNode::Event> base_event;
uint64_t base_time; //in ns
};
struct ProfileWorkerState {
};
using StackManager = interpreter::intl::StackManager;
struct ProfileTensorState {
uint64_t id;
uint64_t id = 0;
std::optional<uint64_t> source;
TensorLayout layout;
CompNode device;
std::string name;
uint64_t produced = 0;
uint64_t living_time = 0;
profiler::HostTime produced = profiler::HostTime::min();
profiler::Duration living_time = profiler::Duration::zero();
size_t size_in_bytes() const {
if (!layout.dtype.valid()) {
......@@ -35,41 +30,51 @@ struct ProfileTensorState {
}
return layout.dtype.size(layout.total_nr_elems());
}
};
struct ProfileStaticsState {
size_t op_enqueue_count = 0;
size_t op_execute_count = 0;
size_t wait_value_count = 0;
size_t wait_shape_count = 0;
size_t exception_count = 0;
size_t infer_shape_valid_count = 0;
size_t infer_shape_invalid_count = 0;
size_t alive_tensor_count = 0;
size_t produce_tensor_count = 0;
size_t erase_tensor_count = 0;
size_t wait_prop_count = 0;
size_t redundant_tensor_count = 0;
std::string info(HostTime current_time) {
std::string shape = layout.TensorShape::to_string();
std::string dtype = layout.dtype.name();
return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.to_string().c_str());
}
nlohmann::json detail(HostTime current_time) {
nlohmann::json args;
args["id"] = id;
args["name"] = name;
args["shape"] = layout.TensorShape::to_string();
args["dtype"] = layout.dtype.name();
args["nr_elements"] = layout.total_nr_elems();
args["device"] = device.to_string();
if (produced != produced.min()) {
double ms_count = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(current_time - produced + living_time).count();
args["living_time"] = ssprintf("%lf ms", ms_count);
}
return args;
}
};
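// Illustrative detail() output for a live tensor (values are made up; only
// the keys are fixed by the code above):
//     {"id": 42, "name": "conv1.weight", "shape": "{64,3,7,7}",
//      "dtype": "Float32", "nr_elements": 9408, "device": "gpu0",
//      "living_time": "1.500000 ms"}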
struct ProfileOperatorState {
uint64_t id;
uint64_t id = 0;
std::string name;
OpParams params;
SmallVector<uint64_t> inputs;
SmallVector<uint64_t> outputs;
CompNode device;
Trace trace;
uint64_t host_begin;
uint64_t host_end;
std::shared_ptr<CompNode::Event> device_begin;
std::shared_ptr<CompNode::Event> device_end;
};
profiler::HostTime execute_begin;
profiler::HostTime execute_end;
struct ProfileThreadState {
std::thread::id tid;
int64_t index;
std::vector<std::string> scope_stack;
nlohmann::json detail() {
nlohmann::json args;
for (auto&& [name, value]: params) {
args[name] = value;
}
args["__id__"] = id;
args["__name__"] = name;
args["__device__"] = device.to_string();
return args;
}
};
template <typename TProp>
......@@ -93,37 +98,12 @@ struct ProfileTensorPropPair {
using ProfileTensorSizePair = ProfileTensorPropPair<size_t>;
using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>;
struct GeneralTensorEvent {
uint64_t tensor_id;
std::type_index type;
};
struct ProfileState {
std::unordered_map<uint64_t, ProfileTensorState> tensors;
std::unordered_map<uint64_t, ProfileOperatorState> operators;
std::unordered_map<std::string, uint64_t> tensor_name_counter;
std::set<ProfileTensorSizePair> tensors_by_size;
std::set<ProfileTensorSizePair> tensors_by_produced;
ProfileWorkerState worker;
ProfileStaticsState statics;
std::unordered_map<std::thread::id, ProfileThreadState> threads;
CompNode::UnorderedMap<ProfileDeviceState> devices;
ProfileThreadState& operator[](std::thread::id tid) {
if (threads.count(tid) == 0) {
threads[tid].tid = tid;
threads[tid].index = threads.size();
}
return threads[tid];
}
ProfileDeviceState& operator[](CompNode device) {
if (devices.count(device) == 0) {
devices[device].device = device;
devices[device].index = devices.size();
}
return devices[device];
}
std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) {
std::vector<uint64_t> results;
......@@ -138,19 +118,233 @@ struct ProfileState {
}
return results;
}
};
std::string concat_scope(std::thread::id tid) {
auto& scope_stack = threads[tid].scope_stack;
if (scope_stack.empty()) {
return {};
}
std::string result = scope_stack[0];
for (size_t i = 1; i < scope_stack.size(); ++i) {
result += "::";
result += scope_stack[i];
template<typename T, typename = void>
struct is_op_event : std::false_type { };
template<typename T>
struct is_op_event<T, decltype(std::declval<T>().op_id, void())> : std::true_type { };
template<typename T, typename = void>
struct is_tensor_event : std::false_type { };
template<typename T>
struct is_tensor_event<T, decltype(std::declval<T>().tensor_id, void())> : std::true_type { };
template<typename T, typename = void>
struct is_trace_event : std::false_type { };
template<typename T>
struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_type { };
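// Detection idiom: each trait is true iff T has the named member — the
// expression decltype(std::declval<T>().op_id, void()) SFINAEs the partial
// specialization away when `.op_id` does not compile, so any event carrying
// an op_id/tensor_id/trace field is routed accordingly below.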
template <typename... TItems>
class AnyToVariantConverter {
public:
using any_t = std::any;
using variant_t = std::variant<TItems...>;
private:
std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table;
template <typename TItem>
void register_converter() {
m_table[typeid(TItem)] = [](any_t input) {
return variant_t(std::any_cast<TItem>(std::move(input)));
};
}
public:
AnyToVariantConverter() {
(register_converter<TItems>(), ...);
}
variant_t operator()(any_t input) {
return m_table[input.type()](std::move(input));
}
};
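// Self-contained usage sketch (types chosen for illustration): the converter
// maps a type-erased std::any back onto a closed variant by looking up the
// payload's std::type_index.
//
//     AnyToVariantConverter<int, std::string> conv;
//     std::variant<int, std::string> v = conv(std::any{42});
//     assert(std::holds_alternative<int>(v));
//     v = conv(std::any{std::string{"hi"}});
//     assert(std::get<std::string>(v) == "hi");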
template <typename TSelf>
class EventVisitor {
private:
std::unordered_map<size_t, ProfileOperatorState> m_operators;
std::unordered_map<size_t, ProfileTensorState> m_tensors;
std::unordered_map<size_t, std::vector<Profiler::Record>> m_duration_stack;
HostTime m_start_time;
CompNode::UnorderedMap<size_t> m_device_tid_table;
std::unordered_map<std::thread::id, size_t> m_host_tid_table;
CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> m_device_timeline;
std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack;
std::unordered_map<std::string, int64_t> m_counter_table;
protected:
Profiler::Record* current;
ProfileOperatorState* current_op;
ProfileTensorState* current_tensor;
protected:
profiler::Duration since_start(profiler::HostTime time) {
return time - m_start_time;
}
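// to_device_time maps a host timestamp onto a device's clock.
// m_device_timeline stores, per device, the measured host->device offset at a
// few sample points; between two samples the offset is linearly interpolated.
// Worked example with illustrative numbers: offsets of +3us at host t=100us
// and +5us at t=200us place an event at host t=150us (ratio 0.5) at device
// time 154us.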
profiler::HostTime to_device_time(profiler::HostTime time, CompNode device) {
auto& device_timeline = m_device_timeline[device];
auto upper = device_timeline.lower_bound(time);
if (upper == device_timeline.end()) {
if (upper == device_timeline.begin()) {
return time;
} else {
--upper;
return time + std::chrono::duration_cast<profiler::Duration>(upper->second);
}
} else if (upper->first == time) {
return time + std::chrono::duration_cast<profiler::Duration>(upper->second);
} else if (upper == device_timeline.begin()) {
return time + std::chrono::duration_cast<profiler::Duration>(upper->second);
}
auto lower = upper;
-- lower;
double ratio = ((double)(time - lower->first).count() / (double)(upper->first - lower->first).count());
mgb_assert(ratio > 0 && ratio < 1, "invalid ratio");
mgb_assert(lower->first + lower->second <= upper->first + upper->second, "device time correction must be monotonic");
auto shift = lower->second + ratio * (upper->second - lower->second);
auto result = time + std::chrono::duration_cast<profiler::Duration>(shift);
return result;
}
size_t to_tid(std::thread::id host_tid) {
return m_host_tid_table.at(host_tid);
}
size_t to_tid(CompNode device) {
return m_device_tid_table.at(device);
}
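// inc_counter funnels every counter change through the derived visitor's
// notify_counter(key, old, new) hook before the delta is applied, so a
// visitor that renders counters can emit one sample per change while others
// (e.g. MemoryFlowVisitor above) simply ignore the callback.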
void inc_counter(const char* key, int64_t delta) {
if (!m_counter_table.count(key)) {
m_counter_table[key] = 0;
}
auto& value = m_counter_table[key];
static_cast<TSelf&>(*this).notify_counter(key, value, value + delta);
value += delta;
}
public:
void process_events(Profiler::bundle_t bundle) {
m_start_time = bundle.start_at;
auto& self = static_cast<TSelf&>(*this);
AnyToVariantConverter<OpDispatchEvent, OpExecuteEvent, OpExecuteFinishEvent,
KernelLaunchEvent, KernelLaunchFinishEvent,
OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent,
TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent,
TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent,
SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent,
StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent,
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent,
CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent> converter;
auto for_each_entry = [&](auto&& handler) {
for (auto& entry: bundle.entries) {
current = &entry;
std::visit(handler, converter(entry.data));
}
current = nullptr;
};
// build device timeline
struct DeviceStartPair {
profiler::HostTime host;
std::shared_ptr<CompNode::Event> device;
};
CompNode::UnorderedMap<DeviceStartPair> device_start_table;
for_each_entry([&](auto&& event){
using T = std::decay_t<decltype(event)>;
if constexpr (std::is_same_v<T, RecordDeviceEvent>) {
using namespace std::chrono_literals;
DeviceStartPair& device_start = device_start_table[event.event->comp_node()];
if (!device_start.device) {
device_start = { current->time, event.event };
}
event.event->host_wait();
auto device_time = (device_start.host - current->time) + std::chrono::duration_cast<profiler::RealDuration>(device_start.device->elapsed_time_until(*event.event) * 1s);
m_device_timeline[event.event->comp_node()][current->time] = device_time;
}
});
// register host threads
for_each_entry([&](auto&& event){
if (!m_host_tid_table.count(current->tid)) {
m_host_tid_table[current->tid] = {m_device_tid_table.size() + m_host_tid_table.size()};
}
});
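// Note that host threads and devices share one dense tid namespace: every
// new entry in either table is numbered m_device_tid_table.size() +
// m_host_tid_table.size(), so to_tid() values never collide across the two.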
for_each_entry([&](auto&& event){
using T = std::decay_t<decltype(event)>;
if constexpr (std::is_same_v<T, OpDispatchEvent>) {
auto& op = m_operators[event.op_id];
mgb_assert(op.id == 0, "duplicate operator id");
op.id = event.op_id;
op.name = event.op_name;
op.params = event.op_params();
op.inputs = event.inputs;
op.outputs = event.outputs;
op.trace = event.trace;
for (auto&& output: event.outputs) {
m_tensors.at(output).source = op.id;
}
} else if constexpr (std::is_same_v<T, TensorDeclareEvent>) {
auto& tensor = m_tensors[event.tensor_id];
mgb_assert(tensor.id == 0, "duplicate tensor id");
tensor.id = event.tensor_id;
tensor.name = event.name;
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
auto& tensor = m_tensors.at(event.tensor_id);
if (!m_device_tid_table.count(event.device)) {
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
}
tensor.device = event.device;
}
});
// replay execution
using namespace std::placeholders;
for_each_entry([&](auto&& event){
using T = std::decay_t<decltype(event)>;
// update current_op/tensor
if constexpr (is_op_event<T>::value) {
current_op = &m_operators.at(event.op_id);
} else if constexpr (is_tensor_event<T>::value) {
current_tensor = &m_tensors.at(event.tensor_id);
}
if constexpr (std::is_same_v<T, OpExecuteEvent>) {
current_op->execute_begin = current->time;
} else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) {
current_op->execute_end = current->time;
}
// update counters
if constexpr (std::is_same_v<T, OpDispatchEvent>) {
inc_counter("nr_op_pending", 1);
} else if constexpr (std::is_same_v<T, OpExecuteEvent>) {
inc_counter("nr_op_pending", -1);
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
inc_counter("nr_alive_tensor", 1);
} else if constexpr (std::is_same_v<T, TensorReleaseEvent>) {
inc_counter("nr_alive_tensor", -1);
} else if constexpr (std::is_same_v<T, TensorEraseEvent>) {
if (event.use_count == 0) {
inc_counter("nr_redunant_tensor", 1);
}
} else if constexpr (std::is_same_v<T, ShapeInferEvent>) {
if (!event.success) {
inc_counter("nr_shape_infer_failure", 1);
}
} else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) {
inc_counter("nr_exception", 1);
}
// visit_event_impl
self.visit_event(event);
// reset current_op/tensor
if constexpr (is_op_event<T>::value) {
current_op = nullptr;
} else if constexpr (is_tensor_event<T>::value) {
current_tensor = nullptr;
}
});
}
};
}
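// A minimal sketch of the CRTP contract EventVisitor expects from TSelf — a
// templated visit_event plus a notify_counter hook (mirroring
// MemoryFlowVisitor above); the event names come from this diff, the visitor
// itself is hypothetical:
//
//     struct CountingVisitor : EventVisitor<CountingVisitor> {
//         size_t nr_executed = 0;
//         template <typename TEvent>
//         void visit_event(const TEvent&) {
//             if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) {
//                 ++nr_executed;
//             }
//         }
//         void notify_counter(std::string, int64_t, int64_t) {}
//     };
//     // usage: CountingVisitor v; v.process_events(Profiler::collect());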
......@@ -25,6 +25,9 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) {
auto on_seq_start = [this](CompSeqExecBeforeStart const& event) {
// reset
mgb_assert(!event.graph->options().imperative_proxy_graph);
CompNode::foreach([](CompNode device){
Profiler::record<RecordDeviceEvent>(Timer::record_device(device));
});
if (m_opr_dict.empty() && m_var_dict.empty()) {
init_seq(event.exec);
}
......@@ -122,11 +125,13 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) {
};
auto on_before_kern = [this](BeforeKernel const& event) {
OperatorNodeBase* opr = event.opr;
Profiler::record<KernelExecuteEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node));
Profiler::record<KernelLaunchEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node);
Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node));
};
auto on_after_kern = [this](AfterKernel const& event) {
OperatorNodeBase* opr = event.opr;
Profiler::record<KernelExecuteFinishEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node));
Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node));
Profiler::record<KernelLaunchEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node);
};
auto on_graph_compile = [this](const CompSeqOrderDetermined&) {
m_opr_dict.clear();
......
......@@ -32,15 +32,22 @@
namespace mgb {
namespace imperative {
namespace profiler {
using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>;
using Duration = std::chrono::nanoseconds;
using RealDuration = std::chrono::duration<double, std::nano>;
using Time = HostTime;
} // namespace profiler
class Timer {
public:
void reset();
uint64_t get_nsecs();
uint64_t get_started_at();
static std::shared_ptr<CompNode::Event> record_event(CompNode device);
private:
decltype(std::chrono::steady_clock::now()) m_start;
uint64_t m_started_at;
using Time = profiler::Time;
static profiler::Time record_host();
static std::shared_ptr<CompNode::Event> record_device(CompNode device);
};
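// How the two record_* calls are used together in this diff: every Record is
// stamped with Timer::record_host(), while periodic
// Profiler::record<RecordDeviceEvent>(Timer::record_device(device)) entries
// anchor each device's clock so EventVisitor::to_device_time() can
// interpolate between the sampled offsets.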
......@@ -48,7 +55,8 @@ class Profiler {
public:
struct Record {
uint64_t id;
uint64_t time; //in ns
std::thread::id tid;
profiler::Time time;
std::any data;
};
enum Status: uint8_t {
......@@ -56,23 +64,32 @@ public:
Recording = 1,
Collecting = 2,
};
using ProfileCollector = std::function<void(std::thread::id, Record)>;
struct ResultBundle;
using ProfileCollector = std::function<void(Record)>;
using option_t = uint64_t;
using options_t = std::unordered_map<std::string, option_t>;
using result_t = std::pair<std::thread::id, Record>;
using results_t = std::vector<result_t>;
using entry_t = Record;
using bundle_t = ResultBundle;
using thread_dict_t = std::unordered_map<std::thread::id, std::string>;
struct ResultBundle {
profiler::HostTime start_at;
thread_dict_t thread_dict;
options_t options;
std::vector<entry_t> entries;
};
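// ResultBundle is the single hand-off object of the new API:
// Profiler::collect() fills it (entries, options, start time, thread names)
// and both dump_profile() and EventVisitor::process_events() consume it
// whole, replacing the old results_t/options_t pair.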
private:
std::thread::id m_thread_id;
std::vector<Record> m_records;
std::vector<std::any> m_duration_stack;
std::atomic<Status> m_status = Running;
uint64_t m_last_time = 0;
std::string m_thread_name;
static options_t sm_profile_options;
static std::mutex sm_mutex;
static std::unordered_map<std::thread::id, Profiler*> sm_profilers;
static Timer sm_timer;
static profiler::HostTime sm_start_at;
static std::atomic_uint64_t sm_last_id;
static std::atomic_size_t sm_preferred_capacity;
static bool sm_profiling;
......@@ -100,7 +117,7 @@ public:
static void reset() {
mgb_assert(sm_profilers.size() == 0, "profiler already running");
sm_timer.reset();
sm_start_at = profiler::HostTime::min();
}
static uint64_t next_id() {
......@@ -110,16 +127,13 @@ public:
template <typename T, typename... TArgs>
static uint64_t record(TArgs&&... args) {
auto& profiler = get_instance();
auto last_time = profiler.m_last_time;
if constexpr (sm_debug) {
Status expected = Running;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording));
}
uint64_t id = next_id();
uint64_t time = sm_timer.get_nsecs();
time = std::max(time, last_time + 2000);
profiler.m_last_time = time;
profiler.m_records.push_back({id, time, T{std::forward<TArgs>(args)...}});
profiler::Time time = sm_timer.record_host();
profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}});
if constexpr (sm_debug) {
Status expected = Recording;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running));
......@@ -127,7 +141,8 @@ public:
return id;
}
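// Hedged example (event type and arguments chosen for illustration only):
//     Profiler::record<TensorProduceEvent>(id, layout, device, raw_ptr);
// type-erases the payload into std::any and stamps it with a process-wide
// id, the calling thread's id, and a host timestamp.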
static results_t collect() {
static bundle_t collect() {
bundle_t bundle;
MGB_LOCK_GUARD(sm_mutex);
if constexpr (sm_debug) {
for (auto&& [tid, profiler]: sm_profilers) {
......@@ -136,17 +151,17 @@ public:
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting));
}
}
std::vector<std::pair<std::thread::id, Record>> profile_data;
std::vector<entry_t> profile_data;
for (auto&& [tid, profiler]: sm_profilers) {
sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size());
for (auto& record: profiler->m_records) {
profile_data.push_back({tid, std::move(record)});
profile_data.push_back(std::move(record));
}
profiler->m_records.clear();
profiler->m_records.reserve(sm_preferred_capacity);
}
std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){
return lhs.second.id < rhs.second.id;
return lhs.id < rhs.id;
});
if constexpr (sm_debug) {
for (auto&& [tid, profiler]: sm_profilers) {
......@@ -155,7 +170,11 @@ public:
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running));
}
}
return profile_data;
bundle.entries = profile_data;
bundle.options = get_options();
bundle.start_at = sm_start_at;
bundle.thread_dict = get_thread_dict();
return bundle;
}
static option_t get_option(std::string key, option_t default_val) {
......@@ -179,6 +198,7 @@ public:
static void start_profile() {
mgb_assert(!sm_profiling);
sm_start_at = Timer::record_host();
sm_profiling = true;
}
......@@ -189,7 +209,7 @@ public:
static thread_dict_t get_thread_dict();
static void dump_profile(std::string basename, std::string format, results_t results, options_t options);
static void dump_profile(std::string basename, std::string format, bundle_t result);
};
......
......@@ -19,7 +19,7 @@ target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHO
target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C)
target_compile_options(imperative_test PRIVATE -Wno-unused-parameter)
set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3)
set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3 nlohmann_json::nlohmann_json)
if(MGE_WITH_CUDA)
list(APPEND LINK_LIBS cudart)
......