diff --git a/imperative/CMakeLists.txt b/imperative/CMakeLists.txt index 46d2e1337ac28bc70eb9956e9ee310160747bda6..1f49d331f03f07ab71db6b9e5d8604d2339bd731 100644 --- a/imperative/CMakeLists.txt +++ b/imperative/CMakeLists.txt @@ -39,6 +39,9 @@ endif() add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) target_link_libraries(${MODULE_NAME} PRIVATE range-v3) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json ${PROJECT_BINARY_DIR}/third_party/Json) +target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json) + target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR}) target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp index 166bc1ee7fc22eedcb44f642dd8a3e83e2dea1f7..2ec14a3726fa604ee583118ae1f65b2199585434 100644 --- a/imperative/python/src/tensor.cpp +++ b/imperative/python/src/tensor.cpp @@ -1013,9 +1013,8 @@ void init_tensor(py::module m) { interpreter_for_py->sync(); imperative::Profiler::stop_profile(); auto results = imperative::Profiler::collect(); - auto options = imperative::Profiler::get_options(); - return [results=std::move(results), options=std::move(options)](std::string basename, std::string format){ - imperative::Profiler::dump_profile(basename, format, results, options); + return [results=std::move(results)](std::string basename, std::string format){ + imperative::Profiler::dump_profile(basename, format, results); }; }, py::call_guard()); m.def("sync", diff --git a/imperative/src/impl/interpreter/commands.h b/imperative/src/impl/interpreter/commands.h index be51a24bf90eee5dc7eda6b4831cb4d23ade47f1..72003308d37ee318be77b1ac48c9379238c9b810 100644 --- a/imperative/src/impl/interpreter/commands.h +++ 
b/imperative/src/impl/interpreter/commands.h @@ -19,6 +19,7 @@ #include "megbrain/imperative/op_def.h" #include "megbrain/imperative/utils/to_string.h" +#include "./stack_manager.h" #include "./tensor_info.h" namespace mgb::imperative { @@ -193,7 +194,7 @@ struct PopScope { } }; -using Command = std::variant; -using IdentifiedCommand = std::pair; +struct Command { + uint64_t id; + CommandData data; + StackManager::Trace trace; + +}; +// using IdentifiedCommand = std::pair; } template <> struct ToStringTrait{ std::string operator()(const interpreter::intl::Command& cmd) const { - return std::visit([](const auto& cmd){ + std::string content = std::visit([](const auto& cmd){ std::string result = cmd.get_name(); result += "{"; cmd.get_props([&](const char* key, auto&& value) { @@ -224,7 +231,8 @@ struct ToStringTrait{ }); result += "}"; return result; - }, cmd); + }, cmd.data); + return content; } }; diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp index 1e4ac9dbfae818a7177b0f91b155e28b1659c19b..f1d0f21ffd26bcdb01f7aae30c195d5d0a01af15 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.cpp +++ b/imperative/src/impl/interpreter/interpreter_impl.cpp @@ -127,9 +127,8 @@ Handle ChannelImpl::put(const HostTensorND& value, bool no_cache) { MGB_LOCK_GUARD(m_spin); mgb_assert(check_available(), "Channel already closed"); auto& state = get_channel_state(); - state.scopes.push("Put"); + auto _ = StackManager::Guard{"Put", &state.stack_manager}; auto info = put_impl(value, no_cache); - state.scopes.pop("Put"); return info; } @@ -158,16 +157,15 @@ Handle ChannelImpl::put(const DeviceTensorND& data, const HostTensorND& hvalue) } TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND& hvalue) { auto& state = get_channel_state(); - state.scopes.push("Put"); + auto _ = StackManager::Guard{"Put", &state.stack_manager}; auto info = alloc(); - RECORD_EVENT(TensorCommandEvent, 
info->id, TensorCommandEvent::Put); + RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandKind::Put); init(info, {data.layout(), data.comp_node()}); info->mem_desc.id = StorageIdentifier::make(++m_storage_id); info->ptr = Tensor::make(data, hvalue); RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr()); info->status = TensorInfo::Produced; - RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandFinishEvent::Put); - state.scopes.pop("Put"); + RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandKind::Put); return info; } @@ -230,7 +228,7 @@ void ChannelImpl::dispatch_default_cpu( auto& state = get_channel_state(); auto name = op->trait()->make_name(*op); - state.scopes.push(name); + auto _ = StackManager::Guard(name, &state.stack_manager); auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); RECORD_EVENT(ShapeInferEvent, validated); @@ -291,9 +289,9 @@ void ChannelImpl::dispatch_default_cpu( } return op_info; }; - RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, tinfo_to_tid(input_infos), tinfo_to_tid(output_infos)); - - state.scopes.pop(name); + RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, + tinfo_to_tid(input_infos), tinfo_to_tid(output_infos), + state.stack_manager.dump()); } void ChannelImpl::dispatch_kernel( @@ -305,7 +303,7 @@ void ChannelImpl::dispatch_kernel( auto& options = state.options; auto name = op->trait()->make_name(*op); - state.scopes.push(name); + auto _ = StackManager::Guard{name, &state.stack_manager}; auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); RECORD_EVENT(ShapeInferEvent, validated); @@ -334,7 +332,9 @@ void ChannelImpl::dispatch_kernel( } return op_info; }; - RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs)); + RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, 
op_info_getter, + tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs), + state.stack_manager.dump()); m_buffer.enqueue(std::move(cmd)); if (!validated && options.async_level == 1) { sync_impl(); @@ -346,7 +346,6 @@ void ChannelImpl::dispatch_kernel( info->ptr->comp_node().sync(); } } - state.scopes.pop(name); } SmallVector ChannelImpl::apply_op( @@ -505,7 +504,8 @@ TensorInfo* ChannelImpl::alloc() { }(); info->id = Profiler::next_id(); if (Profiler::is_profiling()) { - info->name = state.scopes.next_tensor_name(); + size_t tensor_id = state.stack_manager.current()->next_id("tensor"); + info->name = state.stack_manager.dump().to_string() + ssprintf(":%zu", tensor_id); } return info; } @@ -554,7 +554,7 @@ void ChannelImpl::free(TensorInfo* ptr) { } void ChannelImpl::recursive_free(TensorInfo* ptr) { - RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandEvent::RecFree); + RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandKind::RecFree); SmallVector inps; if (ptr->producer) { for (auto i : ptr->producer->inputs) { @@ -569,7 +569,7 @@ void ChannelImpl::recursive_free(TensorInfo* ptr) { recursive_free(i); } } - RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandFinishEvent::RecFree); + RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandKind::RecFree); } void ChannelImpl::real_free(TensorInfo* ptr) { @@ -625,9 +625,9 @@ void ChannelImpl::regenerate(TensorInfo* dest) { m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest}); if (!m_applying) flush_apply_stack(); } else if (dest->evict_type == EvictType::SWAP) { - RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandEvent::ReGen); + RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen); produce_tensor(dest, Tensor::make(dest->h_value)); - RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandFinishEvent::ReGen); + RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandKind::ReGen); } } @@ -721,22 +721,24 @@ void 
ChannelImpl::do_apply_op(const ApplyOp& cmd) { // refcnt --, owners: [tensor_inputs] // if it's decreased to 1, would be detected at @see: proxy_graph_detail::apply_on_physical_tensor uint64_t del_id = del->id; - RECORD_EVENT(OpDelEvent, del_id); + RECORD_EVENT(TensorCommandEvent, del_id, TensorCommandKind::Del); free(del); - RECORD_EVENT(OpDelFinishEvent, del_id); + RECORD_EVENT(TensorCommandFinishEvent, del_id, TensorCommandKind::Del); } // Before wait //TODO: split operator wait and execute so that OpWait could be corrected recorded. // Before execute for (auto&& [device, kernel_id]: kernels) { - RECORD_EVENT(KernelExecuteEvent, apply_id, kernel_id, Timer::record_event(device)); + RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device); + RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); } // Apply op // Here std::move is REQUIRED for removing duplicated references. auto outputs = apply_on_physical_tensor(apply_on_physical_tensor, *cmd.op, inputs); // After execute for (auto&& [device, kernel_id]: kernels) { - RECORD_EVENT(KernelExecuteFinishEvent, apply_id, kernel_id, Timer::record_event(device)); + RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); + RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device); } // End profiling operator mgb_assert(outputs.size() == cmd.outputs.size()); @@ -787,7 +789,7 @@ void ChannelImpl::flush_apply_stack() { m_dtr.pin(cmd.inputs); } if (recomp) { - RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandEvent::ReGen); + RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandKind::ReGen); } } bool regen = false; @@ -810,7 +812,7 @@ void ChannelImpl::flush_apply_stack() { m_apply_stack.pop(); do_apply_op(cmd_backup); if (recomp_backup) { - RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandFinishEvent::ReGen); + RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen); for (auto o : cmd_backup.outputs) { if (o) { 
m_dtr.update_dsu_after_recompute(o); @@ -902,7 +904,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { check_worker_exc_unsafe(); return require_host ? host_available() : static_cast(info->ptr); }); - RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop, m_waitee == nullptr); + RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop); m_waitee = nullptr; return info->ptr; } @@ -1003,7 +1005,7 @@ std::tuple, SmallVector, SmallVector; if constexpr (std::is_same_v) { - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Put); + RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Put); + RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); auto value = cmd.no_cache ? std::make_shared(cmd.value) : Tensor::make(cmd.value); + RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); produce_tensor(cmd.dest, std::move(value)); - RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Put); + RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Put); sample_on_device(cmd.dest->desc.comp_node, false); } else if constexpr (std::is_same_v) { for (auto& i : cmd.inputs) { @@ -1084,11 +1088,11 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { } } } else if constexpr (std::is_same_v) { - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Del); + RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Del); CompNode device = cmd.dest->desc.comp_node; uint64_t tensor_id = cmd.dest->id; free(cmd.dest); - RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandFinishEvent::Del); + RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandKind::Del); sample_on_device(device, false); } else if constexpr (std::is_same_v) { if (cmd.dest->invalid) return; @@ -1102,26 +1106,26 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { 
imperative_log_profile_end("GetValue"); } else if constexpr (std::is_same_v) { if (cmd.dest->invalid) return; - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapIn); + RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapIn); produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value)); - RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapIn); + RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapIn); sample_on_device(cmd.dest->desc.comp_node, false); } else if constexpr (std::is_same_v) { if (cmd.dest->invalid) return; - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapOut); + RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapOut); cmd.dest->h_value = cmd.dest->ptr->get_value(); if (cmd.dest->evict_type == EvictType::NONE) { cmd.dest->evict_type = EvictType::SWAP; cmd.dest->status = TensorInfo::Swapped; release_tensor(cmd.dest); } - RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapOut); + RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapOut); sample_on_device(cmd.dest->desc.comp_node, false); } else if constexpr (std::is_same_v) { if (cmd.dest->invalid) return; - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Drop); + RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Drop); do_drop(cmd.dest, true); - RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Drop); + RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Drop); } else if constexpr (std::is_same_v) { options.set_option(cmd.key, cmd.value); } else if constexpr (std::is_same_v) { @@ -1138,6 +1142,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { if (Profiler::get_option("sample_rate", 0)) { sample_on_device(device, true); } + RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); }); 
RECORD_EVENT(StartProfileFinishEvent); } else if constexpr (std::is_same_v) { @@ -1186,7 +1191,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { notify_tensor_unsafe(m_waitee); } } - }, icmd.second); + }, icmd.data); } void ChannelImpl::check_worker_exc_unsafe() { @@ -1203,12 +1208,13 @@ void ChannelImpl::check_worker_exc_unsafe() { } } -void ChannelImpl::CommandBuffer::enqueue(Command cmd) { +void ChannelImpl::CommandBuffer::enqueue(CommandData cmd) { + auto& state = m_owner->get_channel_state(); if (std::get_if(&cmd) && fuse_del(std::get(cmd))) { return; } // mgb_log_debug("%s Enqueued", to_string(cmd).c_str()); - m_commands.push_back(std::move(cmd)); + m_commands.push_back({Profiler::next_id(), std::move(cmd), state.stack_manager.dump()}); auto flush_pos = flush_pos_for(m_commands.back()); flush(flush_pos); } @@ -1222,7 +1228,7 @@ void ChannelImpl::CommandBuffer::flush(Handle pos) { if (Profiler::is_profiling()) { mgb_log_debug("%s Flushed", to_string(*iter).c_str()); } - m_owner->m_worker.add_task(IdentifiedCommand{Profiler::next_id(), std::move(*iter)}); + m_owner->m_worker.add_task(std::move(*iter)); } m_commands.erase(m_commands.begin(), pos); } @@ -1248,7 +1254,7 @@ auto ChannelImpl::CommandBuffer::flush_pos_for(const Command& cmd) -> Handle { return m_commands.begin() + (m_commands.size() - buffer_length); } return m_commands.begin(); - }, cmd); + }, cmd.data); } /** @@ -1261,7 +1267,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) { // TODO: eliminate Puts auto begin = m_commands.begin(), end = m_commands.end(); auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd){ - if (auto* apply = std::get_if(&cmd)) { + if (auto* apply = std::get_if(&cmd.data)) { return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0; } return false; @@ -1270,7 +1276,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) { return false; } // mgb_log_debug("%s Fused", to_string(Command{cmd}).c_str()); - 
std::get(*apply_iter).dels.push_back(dest); + std::get(apply_iter->data).dels.push_back(dest); return true; } @@ -1297,7 +1303,7 @@ auto ChannelImpl::CommandBuffer::find_last_usage(TensorInfo* dest, Range range) found = iter; } } - }, *iter); + }, iter->data); }; return found; } @@ -1313,7 +1319,7 @@ auto ChannelImpl::CommandBuffer::find_produce(TensorInfo* dest, Range range) return cmd.dest == dest; } return false; - }, cmd); + }, cmd.data); }); } @@ -1340,7 +1346,7 @@ void ChannelImpl::push_scope(std::string name) { MGB_LOCK_GUARD(m_spin); mgb_assert(check_available(), "Channel already closed"); auto& state = get_channel_state(); - state.scopes.push(name); + state.stack_manager.enter(name); RECORD_EVENT(ScopeEvent, name); m_buffer.enqueue(PushScope{name}); } @@ -1349,7 +1355,7 @@ void ChannelImpl::pop_scope(std::string name) { MGB_LOCK_GUARD(m_spin); mgb_assert(check_available(), "Channel already closed"); auto& state = get_channel_state(); - state.scopes.pop(name); + state.stack_manager.exit(name); RECORD_EVENT(ScopeFinishEvent, name); m_buffer.enqueue(PopScope{name}); } diff --git a/imperative/src/impl/interpreter/interpreter_impl.h b/imperative/src/impl/interpreter/interpreter_impl.h index 28dd963db0f60c94280965450bfdb5feb461c189..cc998bffd0da0b4cacbf753c3a649b906855795b 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.h +++ b/imperative/src/impl/interpreter/interpreter_impl.h @@ -26,6 +26,7 @@ #include "./commands.h" #include "./tensor_info.h" #include "./option_manager.h" +#include "./stack_manager.h" #include "../profiler/events.h" @@ -94,7 +95,7 @@ private: TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop); void notify_tensor_unsafe(TensorInfo* info); - void process_one_task(IdentifiedCommand&); + void process_one_task(Command&); void check_worker_exc_unsafe(); @@ -129,10 +130,10 @@ private: void assert_in_worker(); std::thread::id get_worker_tid(); - template - void enqueue_command(TCommand&& cmd) { - 
m_buffer.enqueue(Command{std::forward(cmd)}); - } + // template + // void enqueue_command(TCommand&& cmd) { + // m_buffer.enqueue(Command{std::forward(cmd)}); + // } void sample_on_device(CompNode device, bool force); @@ -153,13 +154,13 @@ private: bool m_applying = false; bool m_closed = false; - struct WorkQueue : AsyncQueueSC { + struct WorkQueue : AsyncQueueSC { // set max_spin=0 to prevent Queue fetch task in busy wait manner. // this won't affect throughput when python interpreter is sending enough task, // but will significantly save CPU time when waiting for task, e.g. wait for data input // limit pending tasks to 10000 WorkQueue(ChannelImpl* owner) - : AsyncQueueSC(0, 10000), m_owner(owner) { + : AsyncQueueSC(0, 10000), m_owner(owner) { sys::set_thread_name("interpreter"); if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) { int len = strlen(env_val); @@ -171,7 +172,7 @@ private: update_max_items(val); } } - void process_one_task(IdentifiedCommand& icmd) { + void process_one_task(Command& icmd) { m_owner->process_one_task(icmd); } void on_async_queue_worker_thread_start() override; @@ -193,7 +194,7 @@ private: */ struct CommandBuffer { CommandBuffer(ChannelImpl* owner) : m_owner(owner) {} - void enqueue(Command cmd); + void enqueue(CommandData cmd); bool empty() const { return m_commands.empty(); } @@ -224,91 +225,13 @@ private: //! level 0: both sync. 
int m_async_level = 2; - struct Scope { - std::string name; - std::unordered_map> children; - size_t version = 0; - size_t parent_version = 0; - size_t tensor_count = 0; - Scope* active_child = nullptr; - Scope* parent = nullptr; - - Scope* enter(std::string name) { - auto& child = children[name]; - if (!child) { - child = std::make_unique(); - child->name = name; - child->parent = this; - } - if (version != child->parent_version) { - child->version = 0; - child->parent_version = version; - } else { - child->version++; - } - child->tensor_count = 0; - return active_child = child.get(); - } - - Scope* exit(std::string name) { - mgb_assert(this->name == name, "scope name mismatch"); - parent->active_child = nullptr; - return parent; - } - }; - - class ScopeManager { - private: - Scope m_root; - Scope* m_current_scope = &m_root; - public: - class ScopeGuard{ - private: - ScopeManager* m_manager; - std::string m_name; - public: - ScopeGuard(ScopeManager* manager, std::string name): m_manager{manager}, m_name{name} { - m_manager->push(m_name); - } - ~ScopeGuard() { - m_manager->pop(m_name); - } - }; - void push(std::string name) { - m_current_scope = m_current_scope->enter(name); - } - void pop(std::string name) { - m_current_scope = m_current_scope->exit(name); - } - std::string next_tensor_name() { - std::string builder; - Scope* scope = &m_root; - while (true) { - builder.append(scope->name); - if (scope->version != 0) { - builder.append(ssprintf("(%ld)", scope->version)); - } - if (scope != &m_root) { - builder.append("."); - } - if (scope->active_child == nullptr) { - builder.append(ssprintf(":%%%ld", scope->tensor_count++)); - break; - } else { - scope = scope->active_child; - } - } - return builder; - } - }; - struct State { std::thread::id tid; OptionManager options; }; struct ChannelState: State { - ScopeManager scopes; + StackManager stack_manager; }; struct WorkerState: State {}; diff --git a/imperative/src/impl/interpreter/stack_manager.h 
b/imperative/src/impl/interpreter/stack_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..23106123b1334423d09e5bb081aba7465c046663 --- /dev/null +++ b/imperative/src/impl/interpreter/stack_manager.h @@ -0,0 +1,188 @@ +/** + * \file imperative/src/impl/interpreter/stack_manager.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include +#include + +#include "megbrain/utils/metahelper.h" +#include "megbrain/utils/small_vector.h" + +namespace mgb::imperative::interpreter::intl{ + +class StackSnapshot; + +class StackManager: public NonCopyableObj { +public: + class Node; + class Guard; + struct Frame; + class Trace; +private: + std::unique_ptr m_root = nullptr; + Node* m_current = nullptr; + SmallVector m_trace_id_stack; + uint64_t m_last_trace_id = 0; +public: + StackManager(); + std::pair enter(std::string name); + void exit(std::string name); + Trace dump(); + Node* current(); +}; + +class StackManager::Node: public NonCopyableObj { +private: + std::string m_name; + std::unordered_map> m_children; + std::unordered_map m_id_table; + Node* m_parent = nullptr; + int64_t m_depth = -1; + uint64_t m_version = 0; + explicit Node(std::string name, Node* parent): m_name{name}, m_parent{parent} { + if (parent) { + m_depth = parent->m_depth + 1; + } + } +public: + const std::string& name() const { + return m_name; + } + Node* operator[](const std::string& name) { + auto& child = m_children[name]; + if (child == nullptr) { + child.reset(new Node(name, this)); + } + return child.get(); + } + Node* parent() { + return m_parent; + } + bool is_root() { + return m_parent == nullptr; + } + uint64_t version() 
const { + return m_version; + } + void update_version() { + ++m_version; + for (auto&& [key, child]: m_children) { + child->reset_version(); + } + m_id_table.clear(); + } + void reset_version() { + m_version = 0; + m_id_table.clear(); + } + int64_t depth() const { + return m_depth; + } + uint64_t next_id(std::string key) { + return m_id_table[key]++; + } + static std::unique_ptr make() { + return std::unique_ptr(new Node("", nullptr)); + } +}; + +class StackManager::Guard { +private: + std::string m_name; + StackManager* m_manager; +public: + Guard(std::string name, StackManager* manager): m_name{name}, m_manager{manager}{ + if (m_manager) { + m_manager->enter(name); + } + } + ~Guard() { + release(); + } + void release() { + if (m_manager) { + m_manager->exit(m_name); + m_manager = nullptr; + } + } +}; + +struct StackManager::Frame { + StackManager::Node* node; + uint64_t version; +}; + +class StackManager::Trace { +private: + SmallVector m_frames; + uint64_t m_id = 0; +public: + explicit Trace(StackManager::Node* top, uint64_t id): m_id{id} { + int64_t nr_frames = top->depth() + 1; + m_frames = SmallVector(nr_frames); + StackManager::Node* node = top; + for (int64_t i = 0; i < nr_frames; ++i) { + m_frames[m_frames.size()-1-i] = {node, node->version()}; + node = node->parent(); + } + mgb_assert(node->is_root() , ""); + } + Trace() = default; + std::string to_string() const { + std::string buffer; + for (auto&& [node, version]: m_frames) { + if (!buffer.empty()) { + buffer.append("."); + } + buffer.append(node->name()); + if (version != 0) { + buffer.append(ssprintf("[%zu]", version)); + } + } + return buffer; + } + const SmallVector& frames() const { + return m_frames; + } + uint64_t id() const { + return m_id; + } +}; + +inline StackManager::StackManager() { + m_root = Node::make(); + m_current = m_root.get(); +} + +inline std::pair StackManager::enter(std::string name) { + m_current = (*m_current)[name]; + m_trace_id_stack.push_back(++m_last_trace_id); + return 
{m_current, m_current->version()}; +} + +inline void StackManager::exit(std::string name) { + mgb_assert(m_current->name() == name, "scope name mismatch"); + m_current = m_current->parent(); + m_trace_id_stack.pop_back(); + m_current->update_version(); +} + +inline StackManager::Trace StackManager::dump() { + return Trace(m_current, m_trace_id_stack.empty() ? 0 : m_trace_id_stack.back()); +} + +inline StackManager::Node* StackManager::current() { + return m_current; +} + +} diff --git a/imperative/src/impl/ops/opr_attr.cpp b/imperative/src/impl/ops/opr_attr.cpp index c06cdc10dd06d173738f78f7e08df60a18699ff9..ca16f7a10d5c34b18dc1dfd826ad7421d848292a 100644 --- a/imperative/src/impl/ops/opr_attr.cpp +++ b/imperative/src/impl/ops/opr_attr.cpp @@ -102,7 +102,8 @@ std::vector> props(const OpDef& def) { } std::string make_name(const OpDef& def) { - return "OprAttr"; + auto&& attr = def.cast_final_safe(); + return attr.type; } OP_TRAIT_REG(OprAttr, OprAttr) diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp index 4a19fb6bb69b1d006556aaab353fdb1e89f0a865..8a7c26d25a0f87f6aac20b69cffcbcdade57df01 100644 --- a/imperative/src/impl/profiler.cpp +++ b/imperative/src/impl/profiler.cpp @@ -27,25 +27,11 @@ namespace mgb { namespace imperative { -uint64_t Timer::get_nsecs() { - using namespace std::chrono; - auto finish = steady_clock::now(); - auto duration = duration_cast(finish - m_start); - return duration.count(); +profiler::Time Timer::record_host() { + return std::chrono::high_resolution_clock::now(); } -uint64_t Timer::get_started_at() { - return m_started_at; -} - -void Timer::reset() { - using namespace std::chrono; - m_start = steady_clock::now(); - auto now_ns = duration_cast(std::chrono::system_clock::now().time_since_epoch()); - m_started_at = now_ns.count(); -} - -std::shared_ptr Timer::record_event(CompNode device) { +std::shared_ptr Timer::record_device(CompNode device) { auto event = EventPool::with_timer().alloc_shared(device); 
event->record(); return event; @@ -55,13 +41,13 @@ Profiler::options_t Profiler::sm_profile_options; std::mutex Profiler::sm_mutex; std::unordered_map Profiler::sm_profilers; Timer Profiler::sm_timer; +profiler::HostTime Profiler::sm_start_at; std::atomic_uint64_t Profiler::sm_last_id = 0; bool Profiler::sm_profiling = false; thread_local std::unique_ptr Profiler::tm_profiler = std::make_unique(); std::atomic_size_t Profiler::sm_preferred_capacity; auto Profiler::get_thread_dict() -> thread_dict_t { - MGB_LOCK_GUARD(sm_mutex); thread_dict_t thread_dict; for (auto&& [tid, profiler]: sm_profilers) { thread_dict[tid] = profiler->m_thread_name; @@ -69,15 +55,13 @@ auto Profiler::get_thread_dict() -> thread_dict_t { return thread_dict; } -void Profiler::dump_profile(std::string basename, std::string format, results_t results, options_t options) { - auto thread_dict = get_thread_dict(); - if (format == "chrome_timeline.json") { - profiler::dump_chrome_timeline(basename, options, thread_dict, results); - } else if (format == "memory_flow.svg") { - profiler::dump_memory_flow(basename, options, thread_dict, results); - } else { +void Profiler::dump_profile(std::string basename, std::string format, bundle_t result) { + std::unordered_map format_table; + auto iter = format_table.find(format); + if (iter == format_table.end()) { mgb_log_error("unsupported profiling format %s", format.c_str()); } + return (iter->second)(basename, result); } } // namespace imperative diff --git a/imperative/src/impl/profiler/chrome_timeline.cpp b/imperative/src/impl/profiler/chrome_timeline.cpp index 2827110c8024faea8265ce59be24ca3755217e7a..c315c6dd239d9e7678aa019e553629d3b3a39533 100644 --- a/imperative/src/impl/profiler/chrome_timeline.cpp +++ b/imperative/src/impl/profiler/chrome_timeline.cpp @@ -17,6 +17,10 @@ #error Unsupported platform #endif +#include "nlohmann/json.hpp" + +#include "megbrain/utils/debug.h" + #include "./formats.h" #include "./states.h" @@ -53,8 +57,9 @@ public: m_idx = 
idx; return *this; } - ChromeTraceEvent& ts(uint64_t ts) { - m_ts = ts; + template + ChromeTraceEvent& ts(TDuration ts) { + m_ts = std::chrono::duration_cast>(ts).count(); return *this; } ChromeTraceEvent& dur(uint64_t dur) { @@ -69,51 +74,46 @@ public: m_bp = bp; return *this; } - ChromeTraceEvent& args(std::shared_ptr args) { + ChromeTraceEvent& args(nlohmann::json args) { m_args = std::move(args); return *this; } ChromeTraceEvent& arg(std::string key, std::string value) { - if (!m_args) { - m_args = json::Object::make(); - } - (*m_args)[key] = json::String::make(value); + m_args[key] = value; return *this; } ChromeTraceEvent& arg(std::string key, double value) { - if (!m_args) { - m_args = json::Object::make(); - } - (*m_args)[key] = json::Number::make(value); + m_args[key] = value; return *this; } - ChromeTraceEvent& arg(std::string key, std::shared_ptr value) { - if (!m_args) { - m_args = json::Object::make(); - } - (*m_args)[key] = value; + ChromeTraceEvent& arg(std::string key, nlohmann::json value) { + m_args[key] = value; + return *this; + } + ChromeTraceEvent& stack(Trace trace) { + m_stack = std::move(trace); return *this; } - std::shared_ptr to_json() const { - auto result = json::Object::make(); + nlohmann::json to_json() const { + nlohmann::json result; auto prop_str = [&](auto key, auto value) { if (value.empty()) { return; } - (*result)[key] = json::String::make(value); + result[key] = value; }; auto prop_num = [&](auto key, auto value) { if (!value) { return; } - (*result)[key] = json::Number::make(value.value()); + result[key] = value.value(); }; auto prop_char = [&](auto key, auto value) { if (!value) { return; } - (*result)[key] = json::String::make(std::string{} + value.value()); + result[key] = std::string{} + value.value(); }; prop_str("name", m_name); prop_str("cat", m_cat); @@ -126,8 +126,16 @@ public: prop_num("dur", m_dur); prop_char("ph", m_ph); prop_char("bp", m_bp); - if (m_args) { - (*result)["args"] = m_args; + if (!m_args.empty()) { 
+ result["args"] = m_args; + } + if (m_stack) { + nlohmann::json stack; + for (auto&& frame: m_stack->frames()) { + stack.push_back(ssprintf("%s%ld", frame.node->name().c_str(), frame.version)); + } + std::reverse(stack.begin(), stack.end()); + result["stack"] = stack; } return result; } @@ -140,11 +148,12 @@ private: std::optional m_pid; std::optional m_id; std::optional m_idx; - std::optional m_ts; + std::optional m_ts; std::optional m_dur; std::optional m_ph; std::optional m_bp; - std::shared_ptr m_args; + nlohmann::json m_args; + std::optional m_stack; }; class ChromeTraceEvents { @@ -154,368 +163,218 @@ public: return m_content.back(); } - std::shared_ptr to_json() const { - auto result = json::Object::make(); - auto event_list = json::Array::make(); + std::string metadata(std::string key) { + return m_metadata[key]; + } + + nlohmann::json to_json() const { + nlohmann::json result; + nlohmann::json event_list; + nlohmann::json metadata; for (auto&& event: m_content) { - event_list->add(event.to_json()); + event_list.push_back(event.to_json()); + } + for (auto&& [key, value]: m_metadata) { + metadata[key] = value; } - (*result)["traceEvents"] = event_list; + result["traceEvents"] = event_list; + result["metadata"] = metadata; return result; } + + std::string to_string() const { + auto json = to_json(); + return "{" "traceEvents:" + nlohmann::to_string(json["traceEvents"]) + "," + "metadata:" + nlohmann::to_string(json["metadata"]) + "}"; + } private: std::vector m_content; + std::unordered_map m_metadata; }; -void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results){ - auto pid = getpid(); - - ProfileDataCollector collector; - ProfileState state; -#define HANDLE_EVENT(type, ...) 
\ - collector.handle([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ ); - +struct ChromeTimelineEventVisitor: EventVisitor { ChromeTraceEvents trace_events; + decltype(getpid()) pid = getpid(); + std::string pid_str = std::to_string(pid); - #define NEW_HOST(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(state[tid].index).ph(PH).ts((double)time/1e3) - - #define NEW_DEVICE(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(256+state[event.event->comp_node()].index).ph(PH).ts((double)get_device_time(event.event, time)/1e3) - - #define OP_NAME op_state.name - - #define OP_KERNEL_NAME (op_state.name + "") - - #define OP_PROPS get_op_args(op_state) - - #define OP_ID event.op_id - - #define TENSOR_PROPS get_tensor_args(tensor_state, time) - - #define TENSOR_INFO get_tensor_info(tensor_state, time) - - #define TENSOR_COMMAND_KIND print_tensor_command_kind(event.kind) - - #define HANDLE_PLAIN_EVENT(START, FINISH, NAME_EXPR)\ - HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\ - HANDLE_EVENT(FINISH, { NEW_HOST(NAME_EXPR, 'E'); }) - - #define HANDLE_TENSOR_EVENT(START, FINISH, NAME_EXPR)\ - HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\ - HANDLE_EVENT(FINISH, { auto& tensor_state = state.tensors[event.tensor_id]; NEW_HOST(NAME_EXPR, 'E').args(TENSOR_PROPS); }) - - #define INC_COUNTER(NAME, DELTA)\ - { state.statics.NAME += DELTA; NEW_HOST(#NAME, 'C').arg(#NAME, state.statics.NAME); } - - auto get_tensor_args = [](const ProfileTensorState& tensor, uint64_t time) -> std::shared_ptr { - auto args = json::Object::make(); - (*args)["id"] = json::Number::make(tensor.id); - (*args)["name"] = json::String::make(tensor.name); - (*args)["shape"] = json::String::make(tensor.layout.TensorShape::to_string()); - (*args)["dtype"] = json::String::make(tensor.layout.dtype.name()); - (*args)["nr_elements"] = json::Number::make(tensor.layout.total_nr_elems()); - (*args)["device"] = json::String::make(tensor.device.to_string()); - if 
(tensor.produced) { - (*args)["living_time"] = json::String::make(std::to_string((time - tensor.produced + tensor.living_time)/1e6) + "ms"); - } - return args; - }; - - auto get_tensor_info = [](const ProfileTensorState& tensor, uint64_t time) -> std::string { - std::string name = tensor.name; - std::string shape = tensor.layout.TensorShape::to_string(); - std::string size_in_bytes = std::to_string(tensor.size_in_bytes()); - std::string device = tensor.device.to_string(); - std::string dtype = tensor.layout.dtype.name(); - return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.c_str()); - }; + ChromeTraceEvent& new_event(std::string name, char ph, size_t tid, profiler::HostTime time) { + return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(since_start(time)); + } - auto get_op_args = [&](const ProfileOperatorState& op) -> std::shared_ptr { - auto args = json::Object::make(); - auto params = op.params; - for (auto&& [name, value]: params) { - (*args)[name] = json::String::make(value); - } - (*args)["__id__"] = json::Number::make(op.id); - (*args)["__name__"] = json::String::make(op.name); - (*args)["__device__"] = json::String::make(op.device.to_string()); - return args; + ChromeTraceEvent& new_host_event(std::string name, char ph) { + return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time)); }; - auto get_device_time = [&](const std::shared_ptr& event, uint64_t host) -> uint64_t { - event->host_wait(); - auto& device_state = state.devices[event->comp_node()]; - if (!device_state.base_event) { - device_state.base_event = event; - device_state.base_time = host; - return host; - } - uint64_t device = device_state.base_event->elapsed_time_until(*event) * 1e9 + device_state.base_time; - return std::max(device, host); + ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { + using namespace std::literals::chrono_literals; + auto time = 
since_start(to_device_time(current->time, device)); + return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time); }; - auto print_tensor_command_kind = [&](int kind) -> const char* { + const char* to_cstr(TensorCommandKind kind) { switch(kind) { - case TensorCommandEvent::Put: + case TensorCommandKind::Put: return "Put"; - case TensorCommandEvent::Drop: + case TensorCommandKind::Drop: return "Drop"; - case TensorCommandEvent::Del: + case TensorCommandKind::Del: return "Del"; - case TensorCommandEvent::SwapIn: + case TensorCommandKind::SwapIn: return "SwapIn"; - case TensorCommandEvent::SwapOut: + case TensorCommandKind::SwapOut: return "SwapOut"; - case TensorCommandEvent::RecFree: + case TensorCommandKind::RecFree: return "RecFree"; - case TensorCommandEvent::ReGen: + case TensorCommandKind::ReGen: return "ReGen"; + case TensorCommandKind::GetValue: + return "GetValue"; } return "UnknownCommand"; - }; - - HANDLE_EVENT(OpDispatchEvent, { - auto& op_state = state.operators[OP_ID] = {}; - op_state.id = OP_ID; - op_state.name = event.op_name; - op_state.params = event.op_params(); - op_state.inputs = event.inputs; - op_state.outputs = event.outputs; - NEW_HOST("OpDispatch", 'B'); - NEW_HOST(ssprintf("%d", pid), 's') - .cat("OpDispatch") - .id(OP_ID) - .scope(std::to_string(pid)); - NEW_HOST("OpDispatch", 'E').args(OP_PROPS); - INC_COUNTER(op_enqueue_count, 1); - }); - - HANDLE_EVENT(OpExecuteEvent, { - mgb_assert(OP_ID != 0); - mgb_assert(state.operators.count(OP_ID) > 0); - auto& op_state = state.operators[OP_ID]; - op_state.host_begin = time; - NEW_HOST(OP_NAME, 'B'); - //.args(OP_PROPS); - NEW_HOST(ssprintf("%d", pid), 't') - .cat("OpDispatch") - .id(OP_ID) - .scope(std::to_string(pid)); - INC_COUNTER(op_execute_count, 1); - }); - - HANDLE_EVENT(OpExecuteFinishEvent, { - auto& op_state = state.operators[event.op_id]; - op_state.host_end = time; - NEW_HOST(OP_NAME, 'E') - .args(OP_PROPS); - }); - - HANDLE_EVENT(KernelExecuteEvent, { - 
auto& op_state = state.operators[event.op_id]; - op_state.device_begin = event.event; - NEW_HOST(ssprintf("%d", pid), 's') - .id(event.kernel_id) - .cat("KernelLaunch") - .scope(std::to_string(pid)); - NEW_DEVICE(OP_KERNEL_NAME, 'B') - .cat("Kernel"); - //.args(OP_PROPS); - NEW_DEVICE(ssprintf("%d", pid), 'f') - .id(event.kernel_id) - .bp('e') - .cat("KernelLaunch") - .scope(std::to_string(pid)); - }); - - HANDLE_EVENT(KernelExecuteFinishEvent, { - auto& op_state = state.operators[event.op_id]; - op_state.device_end = event.event; - NEW_DEVICE(OP_KERNEL_NAME, 'E') - .cat("Kernel") - .args(OP_PROPS); - }); - - HANDLE_EVENT(TensorDeclareEvent, { - auto& tensor_state = state.tensors[event.tensor_id] = {}; - tensor_state.id = event.tensor_id; - tensor_state.name = event.name; - }); - - HANDLE_EVENT(TensorProduceEvent, { - auto& tensor_state = state.tensors[event.tensor_id]; - tensor_state.device = event.device; - tensor_state.layout = event.layout; - tensor_state.produced = time; - if (!tensor_state.living_time) { - NEW_HOST(ssprintf("%d", pid), 's') - .id(event.tensor_id) - .cat("TensorLink") - .scope(std::to_string(pid)); - } else { - NEW_HOST(ssprintf("%d", pid), 't') - .id(event.tensor_id) - .cat("TensorLink") - .scope(std::to_string(pid)); - } - INC_COUNTER(alive_tensor_count, 1); - INC_COUNTER(produce_tensor_count, 1); - state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()}); - state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced}); - }); - - HANDLE_EVENT(TensorUsageEvent, { - NEW_HOST(ssprintf("%d", pid), 't') - .id(event.tensor_id) - .cat("TensorLink") - .scope(std::to_string(pid)); - }); - - HANDLE_EVENT(TensorReleaseEvent, { - auto& tensor_state = state.tensors[event.tensor_id]; - tensor_state.living_time += time - tensor_state.produced; - tensor_state.produced = 0; - INC_COUNTER(alive_tensor_count, -1); - INC_COUNTER(erase_tensor_count, 1); - state.tensors_by_size.erase({tensor_state.id, 
tensor_state.size_in_bytes()}); - state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced}); - NEW_HOST(ssprintf("%d", pid), 't') - .id(event.tensor_id) - .cat("TensorLink") - .scope(std::to_string(pid)); - }); - - HANDLE_EVENT(TensorEraseEvent, { - auto& tensor_state = state.tensors[event.tensor_id]; - if (tensor_state.living_time) { - NEW_HOST(ssprintf("%d", pid), 'f') - .id(event.tensor_id) - .bp('e') - .cat("TensorLink") - .scope(std::to_string(pid)); - } - if (event.use_count == 0) { - INC_COUNTER(redundant_tensor_count, 1); - } - }); - - HANDLE_EVENT(TensorGetPropEvent, { - auto& tensor_state = state.tensors[event.tensor_id]; - NEW_HOST("TensorGetProp", 'X') - .dur(0).args(TENSOR_PROPS); - }); - - HANDLE_EVENT(TensorWaitPropEvent, { - NEW_HOST("TensorWaitProp", 'B'); - if (event.prop == TensorProp::HostValue) { - INC_COUNTER(wait_value_count, 1); - } else if (event.prop == TensorProp::Shape) { - INC_COUNTER(wait_shape_count, 1); - } - INC_COUNTER(wait_prop_count, 1); - }); - - HANDLE_EVENT(TensorWaitPropFinishEvent, { - auto& tensor_state = state.tensors[event.tensor_id]; - if (event.notified) { - NEW_HOST(ssprintf("%d", pid), 'f') - .id(event.tensor_id) - .bp('e') - .cat("TensorProp") - .scope(std::to_string(pid)); - } - NEW_HOST("TensorWaitProp", 'E') - .args(TENSOR_PROPS); - }); - - HANDLE_EVENT(TensorNotifyPropEvent, { - NEW_HOST(ssprintf("%d", pid), 's') - .id(event.tensor_id) - .cat("TensorProp") - .scope(std::to_string(pid)); - }); - - HANDLE_EVENT(ShapeInferEvent, { - if (event.success) { - INC_COUNTER(infer_shape_valid_count, 1); - } else { - INC_COUNTER(infer_shape_invalid_count, 1); - } - }); - - HANDLE_EVENT(SampleDeviceEvent, { - NEW_HOST("TopKTensor", 'B'); - }); - - HANDLE_EVENT(SampleDeviceFinishEvent, { - std::string device_name = event.device.locator().to_string(); - std::string prop_name = ssprintf("%s_alloc_memory", device_name.c_str()); - NEW_HOST(prop_name, 'C') - .arg(prop_name, event.total_memory - event.free_memory); - 
auto top_k_tensors = state.top_k_tensor_in_device(event.device, options.at("num_tensor_watch")); - auto& top_k_event = NEW_HOST("TopKTensor", 'E'); - for (size_t i = 0; i < top_k_tensors.size(); ++i) { - auto tensor_id = top_k_tensors[i]; - auto& tensor_state = state.tensors[tensor_id]; - top_k_event.arg(ssprintf("top%03d", (int)i), TENSOR_INFO); //%03d is always enough - } - }); - - HANDLE_EVENT(WorkerExceptionEvent, { - INC_COUNTER(exception_count, 1); - }); - - HANDLE_EVENT(TensorCommandEvent, { - NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'B'); - }); - - HANDLE_EVENT(TensorCommandFinishEvent, { - auto& tensor_state = state.tensors[event.tensor_id]; - NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'E') - .args(TENSOR_PROPS); - }); - - HANDLE_EVENT(ScopeEvent, { - NEW_HOST(event.name, 'B'); - state.threads[tid].scope_stack.push_back(event.name); - }); - - HANDLE_EVENT(ScopeFinishEvent, { - NEW_HOST(event.name, 'E'); - mgb_assert(state.threads[tid].scope_stack.back() == event.name); - state.threads[tid].scope_stack.pop_back(); - }); - - HANDLE_TENSOR_EVENT(OpInputEvent, OpInputFinishEvent, ssprintf("Input %zu", event.tensor_id)); - HANDLE_TENSOR_EVENT(OpOutputEvent, OpOutputFinishEvent, ssprintf("Output %zu", event.tensor_id)); - HANDLE_TENSOR_EVENT(OpDelEvent, OpDelFinishEvent, ssprintf("Del %zu", event.tensor_id)); - HANDLE_PLAIN_EVENT(StartProfileEvent, StartProfileFinishEvent, "StartProfile"); - HANDLE_PLAIN_EVENT(StopProfileEvent, StopProfileFinishEvent, "StopProfile"); - HANDLE_PLAIN_EVENT(CustomEvent, CustomFinishEvent, event.title); - HANDLE_PLAIN_EVENT(AutoEvictEvent, AutoEvictFinishEvent, "AutoEvict"); - - if (results.size() > 0) { - uint64_t time = results[0].second.time; - trace_events.new_event().name("Metadata").ph('I').pid(pid).ts(0).arg("localTime", time/1e3); } - for (auto&& result: results) { - collector(result.second.id, result.first, result.second.time, result.second.data); - } - - for (auto&& [tid, 
thread]: state.threads) { - if (!thread_dict.count(tid)) { - continue; + template + void visit_event(const TEvent &event) { + if constexpr (std::is_same_v) { + new_host_event("OpDispatch", 'B'); + new_host_event(pid_str, 's') + .cat("OpDispatch") + .id(event.op_id) + .scope(pid_str); + new_host_event("OpDispatch", 'E').args(current_op->detail()); + } else if constexpr (std::is_same_v) { + mgb_assert(event.op_id != 0); + current_op->execute_begin = current->time; + new_host_event(current_op->name, 'B'); + new_host_event(pid_str, 't') + .cat("OpDispatch") + .id(current_op->id) + .scope(pid_str); + } else if constexpr (std::is_same_v) { + current_op->execute_end = current->time; + new_host_event(current_op->name, 'E') + .args(current_op->detail()); + } else if constexpr (std::is_same_v) { + new_host_event(pid_str, 's') + .id(event.kernel_id) + .cat("KernelLaunch") + .scope(pid_str); + new_device_event(current_op->name, 'B', event.device) + .cat("Kernel"); + new_device_event(pid_str, 'f', event.device) + .id(event.kernel_id) + .bp('e') + .cat("KernelLaunch") + .scope(pid_str); + } else if constexpr (std::is_same_v) { + new_device_event(current_op->name, 'E', event.device) + .cat("Kernel") + .args(current_op->detail()); + } else if constexpr (std::is_same_v) { + if (current_tensor->living_time != profiler::Duration::zero()) { + new_host_event(pid_str, 's') + .id(event.tensor_id) + .cat("TensorLink") + .scope(pid_str); + } else { + new_host_event(pid_str, 't') + .id(event.tensor_id) + .cat("TensorLink") + .scope(pid_str); + } + } else if constexpr (std::is_same_v) { + new_host_event(pid_str, 't') + .id(event.tensor_id) + .cat("TensorLink") + .scope(pid_str); + } else if constexpr (std::is_same_v) { + current_tensor->living_time += current->time - current_tensor->produced; + current_tensor->produced = {}; + new_host_event(pid_str, 't') + .id(event.tensor_id) + .cat("TensorLink") + .scope(pid_str); + } else if constexpr (std::is_same_v) { + if (current_tensor->living_time 
!= profiler::Duration::zero()) { + new_host_event(pid_str, 'f') + .id(event.tensor_id) + .bp('e') + .cat("TensorLink") + .scope(pid_str); + } + } else if constexpr (std::is_same_v) { + new_host_event("TensorGetProp", 'X') + .dur(0).args(current_tensor->detail(current->time)); + } else if constexpr (std::is_same_v) { + new_host_event(pid_str, 'f') + .id(event.tensor_id) + .bp('e') + .cat("TensorProp") + .scope(pid_str); + new_host_event("TensorWaitProp", 'E') + .args(current_tensor->detail(current->time)); + } else if constexpr (std::is_same_v) { + new_host_event(pid_str, 's') + .id(event.tensor_id) + .cat("TensorProp") + .scope(pid_str); + } else if constexpr (std::is_same_v) { + std::string device_name = event.device.locator().to_string(); + new_host_event("memory", 'C') + .arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory); + } else if constexpr (std::is_same_v) { + new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B'); + } else if constexpr (std::is_same_v) { + new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'E') + .args(current_tensor->detail(current->time)); + } else if constexpr (std::is_same_v) { + new_host_event(event.name, 'B'); + } else if constexpr (std::is_same_v) { + new_host_event(event.name, 'E'); + } else if constexpr (std::is_same_v) { + new_host_event(ssprintf("Input %zu", event.tensor_id), 'B') + .args(current_tensor->detail(current->time)); + } else if constexpr (std::is_same_v) { + new_host_event(ssprintf("Input %zu", event.tensor_id), 'E') + .args(current_tensor->detail(current->time)); + } else if constexpr (std::is_same_v) { + new_host_event(ssprintf("Output %zu", event.tensor_id), 'B') + .args(current_tensor->detail(current->time)); + } else if constexpr (std::is_same_v) { + new_host_event(ssprintf("Output %zu", event.tensor_id), 'E') + .args(current_tensor->detail(current->time)); + } else if constexpr (std::is_same_v) { + 
new_host_event("StartProfile", 'B'); + } else if constexpr (std::is_same_v) { + new_host_event("StartProfile", 'E'); + } else if constexpr (std::is_same_v) { + new_host_event("StopProfile", 'B'); + } else if constexpr (std::is_same_v) { + new_host_event("StopProfile", 'E'); + } else if constexpr (std::is_same_v) { + new_host_event(event.title, 'B'); + } else if constexpr (std::is_same_v) { + new_host_event(event.title, 'E'); + } else if constexpr (std::is_same_v) { + new_host_event("AutoEvict", 'B'); + } else if constexpr (std::is_same_v) { + new_host_event("AutoEvict", 'E'); } - trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(thread.index).arg("name", thread_dict[tid]); } - for (auto&& [device, device_state]: state.devices) { - trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(256+device_state.index).arg("name", device.to_string()); + void notify_counter(std::string key, int64_t old_val, int64_t new_val) { + new_host_event(key, 'C').arg("value", new_val); } +}; - trace_events.to_json()->writeto_fpath(filename); +void dump_chrome_timeline(std::string filename, Profiler::bundle_t result){ + ChromeTimelineEventVisitor visitor; + visitor.process_events(result); + visitor.trace_events.metadata("localTime") = std::to_string(result.start_at.time_since_epoch().count()); + std::string json_repr = visitor.trace_events.to_string(); + mgb::debug::write_to_file(filename.c_str(), json_repr); } } diff --git a/imperative/src/impl/profiler/events.h b/imperative/src/impl/profiler/events.h index 3251a553866497fa7dbf1bd187fffb306421d994..dd034b321284180fe560d06f9d8593c9d8c6cb0c 100644 --- a/imperative/src/impl/profiler/events.h +++ b/imperative/src/impl/profiler/events.h @@ -12,7 +12,9 @@ #pragma once #include "megbrain/utils/small_vector.h" +#include "megbrain/imperative/profiler.h" +#include "../interpreter/stack_manager.h" #include "../op_trait.h" namespace mgb::imperative::profiler { @@ -52,6 +54,11 @@ struct ToStringTrait{ namespace 
mgb::imperative::profiler { +using Trace = interpreter::intl::StackManager::Trace; + +struct ProfileOperatorState; +struct ProfileTensorState; + #define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__; #define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__; @@ -61,6 +68,7 @@ DEF_EVENT(OpDispatch, { std::function op_params; SmallVector inputs; SmallVector outputs; + Trace trace; }); DEF_DUR_EVENT(OpInput, { @@ -68,11 +76,6 @@ DEF_DUR_EVENT(OpInput, { TensorShape shape; }); -DEF_DUR_EVENT(OpDel, { - uint64_t tensor_id; - TensorShape shape; -}); - DEF_DUR_EVENT(OpOutput, { uint64_t tensor_id; TensorShape shape; @@ -80,16 +83,13 @@ DEF_DUR_EVENT(OpOutput, { DEF_DUR_EVENT(OpExecute, { uint64_t op_id; + SmallVector device_list; }); -DEF_DUR_EVENT(OpPostExecute, { - uint64_t op_id; -}); - -DEF_DUR_EVENT(KernelExecute, { +DEF_DUR_EVENT(KernelLaunch, { uint64_t op_id; uint64_t kernel_id; - std::shared_ptr event; + CompNode device; }); DEF_EVENT(TensorDeclare, { @@ -128,19 +128,12 @@ DEF_EVENT(TensorNotifyProp, { TensorProp prop; }); -DEF_EVENT(TensorWaitProp, { +DEF_DUR_EVENT(TensorWaitProp, { uint64_t tensor_id; uint64_t wait_id; TensorProp prop; }); -DEF_EVENT(TensorWaitPropFinish, { - uint64_t tensor_id; - uint64_t wait_id; - TensorProp prop; - bool notified; -}); - DEF_DUR_EVENT(SampleDevice, { CompNode device; size_t total_memory; @@ -157,13 +150,10 @@ DEF_DUR_EVENT(Scope, { std::string name; }); -DEF_DUR_EVENT(DeviceScope, { - std::string name; - std::shared_ptr event; +DEF_DUR_EVENT(Sync, { + Trace trace; }); -DEF_DUR_EVENT(Sync, {}); - DEF_DUR_EVENT(StartProfile, { size_t capture_count; }); @@ -172,10 +162,13 @@ DEF_DUR_EVENT(StopProfile, { size_t escape_count; }); + +enum class TensorCommandKind { + Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue +}; + DEF_DUR_EVENT(TensorCommand, { - enum Kind { - Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue - }; + using Kind = TensorCommandKind; uint64_t tensor_id; Kind 
kind; }); @@ -187,6 +180,17 @@ DEF_DUR_EVENT(Custom, { std::string content; }); +DEF_EVENT(RecordDevice, { + std::shared_ptr event; +}); + +DEF_DUR_EVENT(HostToDevice, { + TensorLayout layout; + CompNode device; + void* host_ptr; + void* device_ptr; +}); + #undef DEF_EVENT #undef DEF_DUR_EVENT diff --git a/imperative/src/impl/profiler/formats.h b/imperative/src/impl/profiler/formats.h index 0fc6e9ee05ab78e1007ff63f88fd3b870054d112..4e26fed4320793b0670768179d1603863075cd73 100644 --- a/imperative/src/impl/profiler/formats.h +++ b/imperative/src/impl/profiler/formats.h @@ -15,10 +15,12 @@ #include "megbrain/imperative/profiler.h" +#include "./states.h" + namespace mgb::imperative::profiler { -void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results); +void dump_chrome_timeline(std::string filename, Profiler::bundle_t result); -void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results); +void dump_memory_flow(std::string filename, Profiler::bundle_t result); } diff --git a/imperative/src/impl/profiler/memory_chunk.cpp b/imperative/src/impl/profiler/memory_chunk.cpp index ee784367052065279ff221e47ae9c2fe75e643f3..a4b666643f8482eb3d94431b182beee941fb6f04 100644 --- a/imperative/src/impl/profiler/memory_chunk.cpp +++ b/imperative/src/impl/profiler/memory_chunk.cpp @@ -89,7 +89,8 @@ struct MemoryChunk { std::array address; std::string name; TensorLayout layout; - std::array time; + std::array time; + std::optional group; bool empty() const { return address[1] - address[0] == 0; @@ -111,9 +112,9 @@ struct MemoryFlow { return {addr_begin, addr_end}; } - std::pair time_range() const { - auto time_begin = std::numeric_limits::max(); - auto time_end = std::numeric_limits::min(); + std::pair time_range() const { + auto time_begin = profiler::Duration::max(); + auto time_end = profiler::Duration::min(); for(auto&& 
[id, chunk]: chunks) { MGB_MARK_USED_VAR(id); if (chunk.empty()) continue; @@ -123,27 +124,6 @@ struct MemoryFlow { return {time_begin, time_end}; } - std::shared_ptr to_json() const { - auto results = json::Array::make(); - for(auto&& [id, chunk]: chunks) { - MGB_MARK_USED_VAR(id); - if (chunk.empty()) continue; - auto address = json::Array::make(); - auto time = json::Array::make(); - address->add(json::String::make(std::to_string(chunk.address[0]))); - address->add(json::String::make(std::to_string(chunk.address[1]))); - time->add(json::String::make(std::to_string(chunk.time[0]))); - time->add(json::String::make(std::to_string(chunk.time[1]))); - results->add(json::Object::make({ - {"address", address}, - {"name", json::String::make(chunk.name)}, - {"layout", json::String::make(chunk.layout.to_string())}, - {"time", time} - })); - } - return results; - } - XMLWriter to_svg() const { XMLWriter writer; auto&& [addr_begin, addr_end] = address_range(); @@ -157,13 +137,13 @@ struct MemoryFlow { svg.attr("xmlns:tag", std::string{"https://megengine.org.cn"}); double time_scale = 1e5; double addr_scale = 1e6; - svg.attr("width", (time_end-time_begin)/time_scale); + svg.attr("width", (time_end-time_begin).count()/time_scale); svg.attr("height", (addr_end-addr_begin)/addr_scale); { auto rect = writer.element("rect"); rect.attr("x", 0); rect.attr("y", 0); - rect.attr("width", (time_end-time_begin)/time_scale); + rect.attr("width", (time_end-time_begin).count()/time_scale); rect.attr("height", (addr_end-addr_begin)/addr_scale); rect.attr("fill", std::string{"blue"}); } @@ -177,7 +157,7 @@ struct MemoryFlow { {1000 * ms, "#888888"}, {std::numeric_limits::infinity(), "#555555"}, }; - auto time2str = [](uint64_t ns){ + auto time2str = [](profiler::Duration ns){ using pair_t = std::pair; static pair_t units[] = { {1, "ns "}, @@ -189,9 +169,9 @@ struct MemoryFlow { auto comparator = [](const pair_t& lhs, const pair_t& rhs) { return lhs.first < rhs.first; }; - while (ns > 0) { - 
auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns, ""), comparator) - 1; - builder += std::to_string(ns / iter->first) + iter->second; + while (ns.count() > 0) { + auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns.count(), ""), comparator) - 1; + builder += std::to_string(ns.count() / iter->first) + iter->second; ns = ns % iter->first; } return builder; @@ -218,11 +198,11 @@ struct MemoryFlow { for (auto&& [id, chunk]: chunks) { MGB_MARK_USED_VAR(id); if (chunk.empty()) continue; - double left = (chunk.time[0]-time_begin)/time_scale; - double right = (chunk.time[1]-time_begin)/time_scale; + double left = (chunk.time[0]-time_begin).count()/time_scale; + double right = (chunk.time[1]-time_begin).count()/time_scale; double top = (chunk.address[0]-addr_begin)/addr_scale; double bottom = (chunk.address[1]-addr_begin)/addr_scale; - double duration = chunk.time[1] - chunk.time[0]; + double duration = (chunk.time[1] - chunk.time[0]).count(); { auto rect = writer.element("rect"); rect.attr("x", left); @@ -241,70 +221,48 @@ struct MemoryFlow { mge_attr("produced", time2str(chunk.time[0])); mge_attr("erased", time2str(chunk.time[1])); mge_attr("duration", time2str(chunk.time[1] - chunk.time[0])); + if (chunk.group) { + mge_attr("group", std::to_string(*chunk.group)); + } } } return writer; } }; -void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results) { - MemoryFlow flow; - - ProfileDataCollector collector; - ProfileState state; -#define HANDLE_EVENT(type, ...) 
\ - collector.handle([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ ); - - HANDLE_EVENT(TensorDeclareEvent, { - auto& tensor_state = state.tensors[event.tensor_id] = {}; - tensor_state.id = event.tensor_id; - tensor_state.name = event.name; - }); +struct MemoryFlowVisitor: EventVisitor { + MemoryFlow memory_flow; - HANDLE_EVENT(TensorProduceEvent, { - auto& tensor_state = state.tensors[event.tensor_id]; - tensor_state.device = event.device; - tensor_state.layout = event.layout; - tensor_state.produced = time; - state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()}); - state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced}); - auto& chunk = flow.chunks[event.tensor_id]; - uintptr_t address = reinterpret_cast(event.ptr); - auto span = event.layout.span(); - auto dtype = event.layout.dtype; - // assume dtype is not lowbit - if (!address) { - chunk.address = {0, 0}; - } else { - chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()}; + template + void visit_event(const TEvent &event) { + if constexpr (std::is_same_v) { + auto& chunk = memory_flow.chunks[event.tensor_id]; + uint64_t address = reinterpret_cast(event.ptr); + auto span = event.layout.span(); + auto dtype = event.layout.dtype; + // assume dtype is not lowbit + if (!address) { + chunk.address = {0, 0}; + } else { + chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()}; + } + chunk.layout = event.layout; + chunk.time[0] = since_start(to_device_time(current->time, current_tensor->device)); + chunk.name = current_tensor->name; + chunk.group = current_tensor->source; + } else if constexpr (std::is_same_v) { + auto& chunk = memory_flow.chunks[event.tensor_id]; + chunk.time[1] = since_start(to_device_time(current->time, current_tensor->device)); } - chunk.layout = tensor_state.layout; - chunk.time[0] = time; - chunk.name = tensor_state.name; - }); - - 
HANDLE_EVENT(TensorReleaseEvent, { - auto& tensor_state = state.tensors[event.tensor_id]; - state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()}); - state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced}); - auto& chunk = flow.chunks[event.tensor_id]; - chunk.time[1] = time; - }); - - HANDLE_EVENT(ScopeEvent, { - state.threads[tid].scope_stack.push_back(event.name); - }); - - HANDLE_EVENT(ScopeFinishEvent, { - mgb_assert(state.threads[tid].scope_stack.back() == event.name); - state.threads[tid].scope_stack.pop_back(); - }); - - for (auto&& result: results) { - collector(result.second.id, result.first, result.second.time, result.second.data); } - debug::write_to_file(filename.c_str(), flow.to_svg().to_string()); + void notify_counter(std::string key, int64_t old_val, int64_t new_val) {} +}; + +void dump_memory_flow(std::string filename, Profiler::bundle_t result) { + MemoryFlowVisitor visitor; + visitor.process_events(std::move(result)); + debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string()); } } diff --git a/imperative/src/impl/profiler/states.h b/imperative/src/impl/profiler/states.h index bfdc2284020410b91f2d1a5fda94a3a34e15123b..fe8682aa98488180e610599ff9df3a33d412ae01 100644 --- a/imperative/src/impl/profiler/states.h +++ b/imperative/src/impl/profiler/states.h @@ -3,6 +3,9 @@ #include #include #include +#include + +#include "nlohmann/json.hpp" #include "megbrain/tensor.h" @@ -10,24 +13,16 @@ namespace mgb::imperative::profiler { -struct ProfileDeviceState { - int64_t index; - CompNode device; - std::shared_ptr base_event; - uint64_t base_time; //in ns -}; - -struct ProfileWorkerState { - -}; +using StackManager = interpreter::intl::StackManager; struct ProfileTensorState { - uint64_t id; + uint64_t id = 0; + std::optional source; TensorLayout layout; CompNode device; std::string name; - uint64_t produced = 0; - uint64_t living_time = 0; + profiler::HostTime produced = 
profiler::HostTime::min(); + profiler::Duration living_time = profiler::Duration::zero(); size_t size_in_bytes() const { if (!layout.dtype.valid()) { @@ -35,41 +30,51 @@ struct ProfileTensorState { } return layout.dtype.size(layout.total_nr_elems()); } -}; -struct ProfileStaticsState { - size_t op_enqueue_count = 0; - size_t op_execute_count = 0; - size_t wait_value_count = 0; - size_t wait_shape_count = 0; - size_t exception_count = 0; - size_t infer_shape_valid_count = 0; - size_t infer_shape_invalid_count = 0; - size_t alive_tensor_count = 0; - size_t produce_tensor_count = 0; - size_t erase_tensor_count = 0; - size_t wait_prop_count = 0; - size_t redundant_tensor_count = 0; + std::string info(HostTime current_time) { + std::string shape = layout.TensorShape::to_string(); + std::string dtype = layout.dtype.name(); + return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.to_string().c_str()); + } + + nlohmann::json detail(HostTime current_time) { + nlohmann::json args; + args["id"] = id; + args["name"] = name; + args["shape"] = layout.TensorShape::to_string(); + args["dtype"] = layout.dtype.name(); + args["nr_elements"] = layout.total_nr_elems(); + args["device"] = device.to_string(); + if (produced != produced.min()) { + double ms_count = std::chrono::duration_cast>(current_time - produced + living_time).count(); + args["living_time"] = ssprintf("%lf ms", ms_count); + } + return args; + } }; struct ProfileOperatorState { - uint64_t id; + uint64_t id = 0; std::string name; OpParams params; SmallVector inputs; SmallVector outputs; CompNode device; + Trace trace; - uint64_t host_begin; - uint64_t host_end; - std::shared_ptr device_begin; - std::shared_ptr device_end; -}; + profiler::HostTime execute_begin; + profiler::HostTime execute_end; -struct ProfileThreadState { - std::thread::id tid; - int64_t index; - std::vector scope_stack; + nlohmann::json detail() { + nlohmann::json args; + for (auto&& [name, value]: params) { + args[name] = 
value; + } + args["__id__"] = id; + args["__name__"] = name; + args["__device__"] = device.to_string(); + return args; + } }; template @@ -93,37 +98,12 @@ struct ProfileTensorPropPair { using ProfileTensorSizePair = ProfileTensorPropPair; using ProfileTensorProducedPair = ProfileTensorPropPair; -struct GeneralTensorEvent { - uint64_t tensor_id; - std::type_index type; -}; - struct ProfileState { std::unordered_map tensors; std::unordered_map operators; std::unordered_map tensor_name_counter; std::set tensors_by_size; std::set tensors_by_produced; - ProfileWorkerState worker; - ProfileStaticsState statics; - std::unordered_map threads; - CompNode::UnorderedMap devices; - - ProfileThreadState& operator[](std::thread::id tid) { - if (threads.count(tid) == 0) { - threads[tid].tid = tid; - threads[tid].index = threads.size(); - } - return threads[tid]; - } - - ProfileDeviceState& operator[](CompNode device) { - if (devices.count(device) == 0) { - devices[device].device = device; - devices[device].index = devices.size(); - } - return devices[device]; - } std::vector top_k_tensor_in_device(CompNode device, size_t k) { std::vector results; @@ -138,19 +118,233 @@ struct ProfileState { } return results; } +}; - std::string concat_scope(std::thread::id tid) { - auto& scope_stack = threads[tid].scope_stack; - if (scope_stack.empty()) { - return {}; - } - std::string result = scope_stack[0]; - for (size_t i = 1; i < scope_stack.size(); ++i) { - result += "::"; - result += scope_stack[i]; +template +struct is_op_event : std::false_type { }; + +template +struct is_op_event().op_id, void())> : std::true_type { }; + +template +struct is_tensor_event : std::false_type { }; + +template +struct is_tensor_event().tensor_id, void())> : std::true_type { }; +template +struct is_trace_event : std::false_type { }; +template +struct is_trace_event().trace, void())> : std::true_type { }; + +template +class AnyToVariantConverter { +public: + using any_t = std::any; + using variant_t = 
std::variant; +private: + std::unordered_map> m_table; + + template + void register_converter() { + m_table[typeid(TItem)] = [](any_t input) { + return variant_t(std::any_cast(std::move(input))); + }; + } +public: + AnyToVariantConverter() { + (register_converter(), ...); + } + variant_t operator()(any_t input) { + return m_table[input.type()](std::move(input)); + } +}; + +template +class EventVisitor { +private: + std::unordered_map m_operators; + std::unordered_map m_tensors; + std::unordered_map> m_duration_stack; + HostTime m_start_time; + CompNode::UnorderedMap m_device_tid_table; + std::unordered_map m_host_tid_table; + CompNode::UnorderedMap> m_device_timeline; + std::unordered_map> m_trace_stack; + std::unordered_map m_counter_table; +protected: + Profiler::Record* current; + ProfileOperatorState* current_op; + ProfileTensorState* current_tensor; +protected: + profiler::Duration since_start(profiler::HostTime time) { + return time - m_start_time; + } + + profiler::HostTime to_device_time(profiler::HostTime time, CompNode device) { + auto& device_timeline = m_device_timeline[device]; + auto upper = device_timeline.lower_bound(time); + if (upper == device_timeline.end()) { + if (upper == device_timeline.begin()) { + return time; + } else { + --upper; + return time + std::chrono::duration_cast(upper->second); + } + } else if (upper->first == time) { + return time + std::chrono::duration_cast(upper->second); + } else if (upper == device_timeline.begin()) { + return time + std::chrono::duration_cast(upper->second); } + auto lower = upper; + -- lower; + double ratio = ((double)(time - lower->first).count() / (double)(upper->first - lower->first).count()); + mgb_assert(ratio > 0 && ratio < 1, "invalid ratio"); + mgb_assert(lower->first + lower->second <= upper->first + upper->second, "device time corr"); + auto shift = lower->second + ratio * (upper->second - lower->second); + auto result = time + std::chrono::duration_cast(shift); return result; } + + size_t 
to_tid(std::thread::id host_tid) { + return m_host_tid_table.at(host_tid); + } + + size_t to_tid(CompNode device) { + return m_device_tid_table.at(device); + } + + void inc_counter(const char* key, int64_t delta) { + if (!m_counter_table.count(key)) { + m_counter_table[key] = 0; + } + auto& value = m_counter_table[key]; + static_cast(*this).notify_counter(key, value, value + delta); + value += delta; + } +public: + void process_events(Profiler::bundle_t bundle) { + m_start_time = bundle.start_at; + + auto& self = static_cast(*this); + AnyToVariantConverter converter; + + auto for_each_entry = [&](auto&& handler) { + for (auto& entry: bundle.entries) { + current = &entry; + std::visit(handler, converter(entry.data)); + } + current = nullptr; + }; + + // build device timeline + struct DeviceStartPair { + profiler::HostTime host; + std::shared_ptr device; + }; + CompNode::UnorderedMap device_start_table; + + for_each_entry([&](auto&& event){ + using T = std::decay_t; + if constexpr (std::is_same_v) { + using namespace std::chrono_literals; + DeviceStartPair& device_start = device_start_table[event.event->comp_node()]; + if (!device_start.device) { + device_start = { current->time, event.event }; + } + event.event->host_wait(); + auto device_time = (device_start.host - current->time) + std::chrono::duration_cast(device_start.device->elapsed_time_until(*event.event) * 1s); + m_device_timeline[event.event->comp_node()][current->time] = device_time; + } + }); + + // register host threads + for_each_entry([&](auto&& event){ + if (!m_host_tid_table.count(current->tid)) { + m_host_tid_table[current->tid] = {m_device_tid_table.size() + m_host_tid_table.size()}; + } + }); + + for_each_entry([&](auto&& event){ + using T = std::decay_t; + if constexpr (std::is_same_v) { + auto& op = m_operators[event.op_id]; + mgb_assert(op.id == 0, "duplicate operator id"); + op.id = event.op_id; + op.name = event.op_name; + op.params = event.op_params(); + op.inputs = event.inputs; + 
op.outputs = event.outputs; + op.trace = event.trace; + for (auto&& output: event.outputs) { + m_tensors.at(output).source = op.id; + } + } else if constexpr (std::is_same_v) { + auto& tensor = m_tensors[event.tensor_id]; + mgb_assert(tensor.id == 0, "duplicated tensor id"); + tensor.id = event.tensor_id; + tensor.name = event.name; + } else if constexpr (std::is_same_v) { + auto& tensor = m_tensors.at(event.tensor_id); + if (!m_device_tid_table.count(event.device)) { + m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; + } + tensor.device = event.device; + } + }); + + // replay execution + using namespace std::placeholders; + for_each_entry([&](auto&& event){ + using T = std::decay_t; + // update current_op/tensor + if constexpr (is_op_event::value) { + current_op = &m_operators.at(event.op_id); + } else if constexpr (is_tensor_event::value) { + current_tensor = &m_tensors.at(event.tensor_id); + } + if constexpr (std::is_same_v) { + current_op->execute_begin = current->time; + } else if constexpr (std::is_same_v) { + current_op->execute_end = current->time; + } + // update counters + if constexpr (std::is_same_v) { + inc_counter("nr_op_pending", 1); + } else if constexpr (std::is_same_v) { + inc_counter("nr_op_pending", -1); + } else if constexpr (std::is_same_v) { + inc_counter("nr_alive_tensor", 1); + } else if constexpr (std::is_same_v) { + inc_counter("nr_alive_tensor", -1); + } else if constexpr (std::is_same_v) { + if (event.use_count == 0) { + inc_counter("nr_redunant_tensor", 1); + } + } else if constexpr (std::is_same_v) { + if (!event.success) { + inc_counter("nr_shape_infer_failure", 1); + } + } else if constexpr (std::is_same_v) { + inc_counter("nr_exception", 1); + } + // visit_event_impl + self.visit_event(event); + // reset current_op/tensor + if constexpr (is_op_event::value) { + current_op = nullptr; + } else if constexpr (is_tensor_event::value) { + current_tensor = nullptr; + } + }); + } }; } diff --git 
a/imperative/src/impl/profiler_plugin.cpp b/imperative/src/impl/profiler_plugin.cpp index d4a462f740561d814bb97826db4df5575d7d76f6..6a0525a529252218a7225f41308cf322ce34cb45 100644 --- a/imperative/src/impl/profiler_plugin.cpp +++ b/imperative/src/impl/profiler_plugin.cpp @@ -25,6 +25,9 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { auto on_seq_start = [this](CompSeqExecBeforeStart const& event) { // reset mgb_assert(!event.graph->options().imperative_proxy_graph); + CompNode::foreach([](CompNode device){ + Profiler::record(Timer::record_device(device)); + }); if (m_opr_dict.empty() && m_var_dict.empty()) { init_seq(event.exec); } @@ -122,11 +125,13 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { }; auto on_before_kern = [this](BeforeKernel const& event) { OperatorNodeBase* opr = event.opr; - Profiler::record(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); + Profiler::record(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node); + Profiler::record(Timer::record_device(event.comp_node)); }; auto on_after_kern = [this](AfterKernel const& event) { OperatorNodeBase* opr = event.opr; - Profiler::record(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); + Profiler::record(Timer::record_device(event.comp_node)); + Profiler::record(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node); }; auto on_graph_compile = [this](const CompSeqOrderDetermined&) { m_opr_dict.clear(); diff --git a/imperative/src/include/megbrain/imperative/profiler.h b/imperative/src/include/megbrain/imperative/profiler.h index 98dcd225902ed6b7e5d0ec9cfd7f34e8ff5fcdf4..1a418292a1aeb41ff3e3e0dbdfab8b8f583da85e 100644 --- a/imperative/src/include/megbrain/imperative/profiler.h +++ b/imperative/src/include/megbrain/imperative/profiler.h @@ -32,15 +32,22 @@ namespace mgb { namespace imperative { +namespace profiler { + +using HostTime = std::chrono::time_point; 
+ +using Duration = std::chrono::nanoseconds; +using RealDuration = std::chrono::duration; + +using Time = HostTime; + +} // namespace profiler + class Timer { public: - void reset(); - uint64_t get_nsecs(); - uint64_t get_started_at(); - static std::shared_ptr record_event(CompNode device); -private: - decltype(std::chrono::steady_clock::now()) m_start; - uint64_t m_started_at; + using Time = profiler::Time; + static profiler::Time record_host(); + static std::shared_ptr record_device(CompNode device); }; @@ -48,7 +55,8 @@ class Profiler { public: struct Record { uint64_t id; - uint64_t time; //in ns + std::thread::id tid; + profiler::Time time; std::any data; }; enum Status: uint8_t { @@ -56,23 +64,32 @@ public: Recording = 1, Collecting = 2, }; - using ProfileCollector = std::function; + struct ResultBundle; + using ProfileCollector = std::function; using option_t = uint64_t; using options_t = std::unordered_map; - using result_t = std::pair; - using results_t = std::vector; + using entry_t = Record; + using bundle_t = ResultBundle; using thread_dict_t = std::unordered_map; + + struct ResultBundle { + profiler::HostTime start_at; + thread_dict_t thread_dict; + options_t options; + std::vector entries; + }; private: std::thread::id m_thread_id; std::vector m_records; + std::vector m_duration_stack; std::atomic m_status = Running; - uint64_t m_last_time = 0; std::string m_thread_name; static options_t sm_profile_options; static std::mutex sm_mutex; static std::unordered_map sm_profilers; static Timer sm_timer; + static profiler::HostTime sm_start_at; static std::atomic_uint64_t sm_last_id; static std::atomic_size_t sm_preferred_capacity; static bool sm_profiling; @@ -100,7 +117,7 @@ public: static void reset() { mgb_assert(sm_profilers.size() == 0, "profiler already running"); - sm_timer.reset(); + sm_start_at = profiler::HostTime::min(); } static uint64_t next_id() { @@ -110,16 +127,13 @@ public: template static uint64_t record(TArgs&&... 
args) { auto& profiler = get_instance(); - auto last_time = profiler.m_last_time; if constexpr (sm_debug) { Status expected = Running; mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); } uint64_t id = next_id(); - uint64_t time = sm_timer.get_nsecs(); - time = std::max(time, last_time + 2000); - profiler.m_last_time = time; - profiler.m_records.push_back({id, time, T{std::forward(args)...}}); + profiler::Time time = sm_timer.record_host(); + profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward(args)...}}); if constexpr (sm_debug) { Status expected = Recording; mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); @@ -127,7 +141,8 @@ public: return id; } - static results_t collect() { + static bundle_t collect() { + bundle_t bundle; MGB_LOCK_GUARD(sm_mutex); if constexpr (sm_debug) { for (auto&& [tid, profiler]: sm_profilers) { @@ -136,17 +151,17 @@ public: mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting)); } } - std::vector> profile_data; + std::vector profile_data; for (auto&& [tid, profiler]: sm_profilers) { sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); for (auto& record: profiler->m_records) { - profile_data.push_back({tid, std::move(record)}); + profile_data.push_back(std::move(record)); } profiler->m_records.clear(); profiler->m_records.reserve(sm_preferred_capacity); } std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){ - return lhs.second.id < rhs.second.id; + return lhs.id < rhs.id; }); if constexpr (sm_debug) { for (auto&& [tid, profiler]: sm_profilers) { @@ -155,7 +170,11 @@ public: mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); } } - return profile_data; + bundle.entries = profile_data; + bundle.options = get_options(); + bundle.start_at = sm_start_at; + bundle.thread_dict = get_thread_dict(); + return bundle; } static option_t get_option(std::string key, 
option_t default_val) { @@ -179,6 +198,7 @@ public: static void start_profile() { mgb_assert(!sm_profiling); + sm_start_at = Timer::record_host(); sm_profiling = true; } @@ -189,7 +209,7 @@ public: static thread_dict_t get_thread_dict(); - static void dump_profile(std::string basename, std::string format, results_t results, options_t options); + static void dump_profile(std::string basename, std::string format, bundle_t result); }; diff --git a/imperative/test/CMakeLists.txt b/imperative/test/CMakeLists.txt index 3ecd7936336772891adbe23d2151dc8df5960a3f..32a0dc6984f26c8c29f85783d60d7b779e0cc829 100644 --- a/imperative/test/CMakeLists.txt +++ b/imperative/test/CMakeLists.txt @@ -19,7 +19,7 @@ target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHO target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) -set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3) +set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3 nlohmann_json::nlohmann_json) if(MGE_WITH_CUDA) list(APPEND LINK_LIBS cudart)