Commit a841d8e7 authored by Megvii Engine Team

refactor(profiler): refactor event processing

GitOrigin-RevId: 26f0c5a6e1e9af669d809776995a380e8dadf5f0
Parent: dff7719e

@@ -39,6 +39,9 @@ endif()
 add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3)
 target_link_libraries(${MODULE_NAME} PRIVATE range-v3)
+add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json ${PROJECT_BINARY_DIR}/third_party/Json)
+target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json)
+
 target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR})
 target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME})
 target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter)
...
@@ -1013,9 +1013,8 @@ void init_tensor(py::module m) {
     interpreter_for_py->sync();
     imperative::Profiler::stop_profile();
     auto results = imperative::Profiler::collect();
-    auto options = imperative::Profiler::get_options();
-    return [results=std::move(results), options=std::move(options)](std::string basename, std::string format){
-        imperative::Profiler::dump_profile(basename, format, results, options);
+    return [results=std::move(results)](std::string basename, std::string format){
+        imperative::Profiler::dump_profile(basename, format, results);
     };
 }, py::call_guard<py::gil_scoped_release>());
 m.def("sync",
...
@@ -19,6 +19,7 @@
 #include "megbrain/imperative/op_def.h"
 #include "megbrain/imperative/utils/to_string.h"
+#include "./stack_manager.h"
 #include "./tensor_info.h"

 namespace mgb::imperative {
@@ -193,7 +194,7 @@ struct PopScope {
     }
 };

-using Command = std::variant<Put,
+using CommandData = std::variant<Put,
                              ApplyOp,
                              Del,
                              GetValue,
@@ -206,14 +207,20 @@ using Command = std::variant<Put,
                              PushScope,
                              PopScope>;

-using IdentifiedCommand = std::pair<uint64_t, Command>;
+struct Command {
+    uint64_t id;
+    CommandData data;
+    StackManager::Trace trace;
+};
+
+// using IdentifiedCommand = std::pair<uint64_t, Command>;

 }
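Note: the refactor replaces the old `IdentifiedCommand` pair with a `Command` struct that carries its profiler id and the dispatch-time Python-side scope trace alongside the payload. A minimal sketch of constructing one, with hypothetical values that are not from this commit:

    // Illustration only: assumes a live StackManager, as held in ChannelState.
    StackManager stack_manager;
    auto guard = StackManager::Guard{"train_step", &stack_manager};
    Command cmd{
        Profiler::next_id(),                  // id, allocated at enqueue time
        CommandData{PushScope{"train_step"}}, // data, one alternative of the variant
        stack_manager.dump(),                 // trace, snapshot of the scope stack
    };
    // cmd.trace.to_string() == "train_step"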
 template <>
 struct ToStringTrait<interpreter::intl::Command>{
     std::string operator()(const interpreter::intl::Command& cmd) const {
-        return std::visit([](const auto& cmd){
+        std::string content = std::visit([](const auto& cmd){
             std::string result = cmd.get_name();
             result += "{";
             cmd.get_props([&](const char* key, auto&& value) {
@@ -224,7 +231,8 @@ struct ToStringTrait<interpreter::intl::Command>{
             });
             result += "}";
             return result;
-        }, cmd);
+        }, cmd.data);
+        return content;
     }
 };
...
@@ -127,9 +127,8 @@ Handle ChannelImpl::put(const HostTensorND& value, bool no_cache) {
     MGB_LOCK_GUARD(m_spin);
     mgb_assert(check_available(), "Channel already closed");
     auto& state = get_channel_state();
-    state.scopes.push("Put");
+    auto _ = StackManager::Guard{"Put", &state.stack_manager};
     auto info = put_impl(value, no_cache);
-    state.scopes.pop("Put");
     return info;
 }
@@ -158,16 +157,15 @@ Handle ChannelImpl::put(const DeviceTensorND& data, const HostTensorND& hvalue) {
 }

 TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND& hvalue) {
     auto& state = get_channel_state();
-    state.scopes.push("Put");
+    auto _ = StackManager::Guard{"Put", &state.stack_manager};
     auto info = alloc();
-    RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandEvent::Put);
+    RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandKind::Put);
     init(info, {data.layout(), data.comp_node()});
     info->mem_desc.id = StorageIdentifier::make(++m_storage_id);
     info->ptr = Tensor::make(data, hvalue);
     RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr());
     info->status = TensorInfo::Produced;
-    RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandFinishEvent::Put);
-    state.scopes.pop("Put");
+    RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandKind::Put);
     return info;
 }
@@ -230,7 +228,7 @@ void ChannelImpl::dispatch_default_cpu(
     auto& state = get_channel_state();
     auto name = op->trait()->make_name(*op);
-    state.scopes.push(name);
+    auto _ = StackManager::Guard(name, &state.stack_manager);

     auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs);
     RECORD_EVENT(ShapeInferEvent, validated);
@@ -291,9 +289,9 @@ void ChannelImpl::dispatch_default_cpu(
         }
         return op_info;
     };
-    RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, tinfo_to_tid(input_infos), tinfo_to_tid(output_infos));
-
-    state.scopes.pop(name);
+    RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter,
+                 tinfo_to_tid(input_infos), tinfo_to_tid(output_infos),
+                 state.stack_manager.dump());
 }
 void ChannelImpl::dispatch_kernel(
@@ -305,7 +303,7 @@ void ChannelImpl::dispatch_kernel(
     auto& options = state.options;
     auto name = op->trait()->make_name(*op);
-    state.scopes.push(name);
+    auto _ = StackManager::Guard{name, &state.stack_manager};

     auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs);
     RECORD_EVENT(ShapeInferEvent, validated);
@@ -334,7 +332,9 @@ void ChannelImpl::dispatch_kernel(
         }
         return op_info;
     };
-    RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs));
+    RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter,
+                 tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs),
+                 state.stack_manager.dump());
     m_buffer.enqueue(std::move(cmd));
     if (!validated && options.async_level == 1) {
         sync_impl();
@@ -346,7 +346,6 @@ void ChannelImpl::dispatch_kernel(
             info->ptr->comp_node().sync();
         }
     }
-    state.scopes.pop(name);
 }
 SmallVector<Handle> ChannelImpl::apply_op(
@@ -505,7 +504,8 @@ TensorInfo* ChannelImpl::alloc() {
     }();
     info->id = Profiler::next_id();
     if (Profiler::is_profiling()) {
-        info->name = state.scopes.next_tensor_name();
+        size_t tensor_id = state.stack_manager.current()->next_id("tensor");
+        info->name = state.stack_manager.dump().to_string() + ssprintf(":%zu", tensor_id);
     }
     return info;
 }
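Note: tensor names are now derived from the dumped scope trace plus a per-scope counter, replacing the old `next_tensor_name()`. A worked sketch under the assumption that scopes "epoch" and "step" are active (the names are hypothetical, not from the commit):

    StackManager sm;
    sm.enter("epoch");
    sm.enter("step");
    size_t tensor_id = sm.current()->next_id("tensor");  // 0 for the first tensor in this scope
    std::string name = sm.dump().to_string() + ssprintf(":%zu", tensor_id);
    // name == "epoch.step:0"; the next allocation in the same scope yields "epoch.step:1"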
@@ -554,7 +554,7 @@ void ChannelImpl::free(TensorInfo* ptr) {
 }

 void ChannelImpl::recursive_free(TensorInfo* ptr) {
-    RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandEvent::RecFree);
+    RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandKind::RecFree);
     SmallVector<TensorInfo*> inps;
     if (ptr->producer) {
         for (auto i : ptr->producer->inputs) {
@@ -569,7 +569,7 @@ void ChannelImpl::recursive_free(TensorInfo* ptr) {
             recursive_free(i);
         }
     }
-    RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandFinishEvent::RecFree);
+    RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandKind::RecFree);
 }
 void ChannelImpl::real_free(TensorInfo* ptr) {
@@ -625,9 +625,9 @@ void ChannelImpl::regenerate(TensorInfo* dest) {
         m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest});
         if (!m_applying) flush_apply_stack();
     } else if (dest->evict_type == EvictType::SWAP) {
-        RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandEvent::ReGen);
+        RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen);
         produce_tensor(dest, Tensor::make(dest->h_value));
-        RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandFinishEvent::ReGen);
+        RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandKind::ReGen);
     }
 }
@@ -721,22 +721,24 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
         // refcnt --, owners: [tensor_inputs]
         // if it's decreased to 1, would be detected at @see: proxy_graph_detail::apply_on_physical_tensor
         uint64_t del_id = del->id;
-        RECORD_EVENT(OpDelEvent, del_id);
+        RECORD_EVENT(TensorCommandEvent, del_id, TensorCommandKind::Del);
         free(del);
-        RECORD_EVENT(OpDelFinishEvent, del_id);
+        RECORD_EVENT(TensorCommandFinishEvent, del_id, TensorCommandKind::Del);
     }
     // Before wait
     // TODO: split operator wait and execute so that OpWait could be correctly recorded.
     // Before execute
     for (auto&& [device, kernel_id]: kernels) {
-        RECORD_EVENT(KernelExecuteEvent, apply_id, kernel_id, Timer::record_event(device));
+        RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device);
+        RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device));
     }
     // Apply op
     // Here std::move is REQUIRED for removing duplicated references.
     auto outputs = apply_on_physical_tensor(apply_on_physical_tensor, *cmd.op, inputs);
     // After execute
     for (auto&& [device, kernel_id]: kernels) {
-        RECORD_EVENT(KernelExecuteFinishEvent, apply_id, kernel_id, Timer::record_event(device));
+        RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device));
+        RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device);
     }
     // End profiling operator
     mgb_assert(outputs.size() == cmd.outputs.size());
@@ -787,7 +789,7 @@ void ChannelImpl::flush_apply_stack() {
             m_dtr.pin(cmd.inputs);
         }
         if (recomp) {
-            RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandEvent::ReGen);
+            RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandKind::ReGen);
         }
     }
     bool regen = false;
@@ -810,7 +812,7 @@ void ChannelImpl::flush_apply_stack() {
     m_apply_stack.pop();
     do_apply_op(cmd_backup);
     if (recomp_backup) {
-        RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandFinishEvent::ReGen);
+        RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen);
         for (auto o : cmd_backup.outputs) {
             if (o) {
                 m_dtr.update_dsu_after_recompute(o);
@@ -902,7 +904,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
         check_worker_exc_unsafe();
         return require_host ? host_available() : static_cast<bool>(info->ptr);
     });
-    RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop, m_waitee == nullptr);
+    RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop);
     m_waitee = nullptr;
     return info->ptr;
 }
@@ -1003,7 +1005,7 @@ std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPt...
     return {outputs_desc, alloc_storage(outputs_desc), alloc_storage(workspaces_desc)};
 }

-void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
+void ChannelImpl::process_one_task(Command& icmd) {
     using namespace ranges;
     using namespace ranges::views;
     auto& state = get_worker_state();
@@ -1012,10 +1014,12 @@ void ChannelImpl::process_one_task(Command& icmd) {
     auto cmd_visitor = [&](const auto& cmd) {
         using T = std::decay_t<decltype(cmd)>;
         if constexpr (std::is_same_v<T, Put>) {
-            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Put);
+            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Put);
+            RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node()));
             auto value = cmd.no_cache ? std::make_shared<Tensor>(cmd.value) : Tensor::make(cmd.value);
+            RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node()));
             produce_tensor(cmd.dest, std::move(value));
-            RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Put);
+            RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Put);
             sample_on_device(cmd.dest->desc.comp_node, false);
         } else if constexpr (std::is_same_v<T, ApplyOp>) {
             for (auto& i : cmd.inputs) {
@@ -1084,11 +1088,11 @@ void ChannelImpl::process_one_task(Command& icmd) {
                 }
             }
         } else if constexpr (std::is_same_v<T, Del>) {
-            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Del);
+            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Del);
             CompNode device = cmd.dest->desc.comp_node;
             uint64_t tensor_id = cmd.dest->id;
             free(cmd.dest);
-            RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandFinishEvent::Del);
+            RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandKind::Del);
             sample_on_device(device, false);
         } else if constexpr (std::is_same_v<T, GetValue>) {
             if (cmd.dest->invalid) return;
@@ -1102,26 +1106,26 @@ void ChannelImpl::process_one_task(Command& icmd) {
             imperative_log_profile_end("GetValue");
         } else if constexpr (std::is_same_v<T, SwapIn>) {
             if (cmd.dest->invalid) return;
-            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapIn);
+            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapIn);
             produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value));
-            RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapIn);
+            RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapIn);
             sample_on_device(cmd.dest->desc.comp_node, false);
         } else if constexpr (std::is_same_v<T, SwapOut>) {
             if (cmd.dest->invalid) return;
-            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapOut);
+            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapOut);
             cmd.dest->h_value = cmd.dest->ptr->get_value();
             if (cmd.dest->evict_type == EvictType::NONE) {
                 cmd.dest->evict_type = EvictType::SWAP;
                 cmd.dest->status = TensorInfo::Swapped;
                 release_tensor(cmd.dest);
             }
-            RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapOut);
+            RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapOut);
             sample_on_device(cmd.dest->desc.comp_node, false);
         } else if constexpr (std::is_same_v<T, Drop>) {
             if (cmd.dest->invalid) return;
-            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Drop);
+            RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Drop);
             do_drop(cmd.dest, true);
-            RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Drop);
+            RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Drop);
         } else if constexpr (std::is_same_v<T, SetOption>) {
             options.set_option(cmd.key, cmd.value);
         } else if constexpr (std::is_same_v<T, StartProfile>) {
@@ -1138,6 +1142,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
             if (Profiler::get_option("sample_rate", 0)) {
                 sample_on_device(device, true);
             }
+            RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device));
         });
         RECORD_EVENT(StartProfileFinishEvent);
     } else if constexpr (std::is_same_v<T, StopProfile>) {
@@ -1186,7 +1191,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
             notify_tensor_unsafe(m_waitee);
         }
     }
-    }, icmd.second);
+    }, icmd.data);
 }
 void ChannelImpl::check_worker_exc_unsafe() {
@@ -1203,12 +1208,13 @@ void ChannelImpl::check_worker_exc_unsafe() {
     }
 }

-void ChannelImpl::CommandBuffer::enqueue(Command cmd) {
+void ChannelImpl::CommandBuffer::enqueue(CommandData cmd) {
+    auto& state = m_owner->get_channel_state();
     if (std::get_if<Del>(&cmd) && fuse_del(std::get<Del>(cmd))) {
         return;
     }
     // mgb_log_debug("%s Enqueued", to_string(cmd).c_str());
-    m_commands.push_back(std::move(cmd));
+    m_commands.push_back({Profiler::next_id(), std::move(cmd), state.stack_manager.dump()});
     auto flush_pos = flush_pos_for(m_commands.back());
     flush(flush_pos);
 }
@@ -1222,7 +1228,7 @@ void ChannelImpl::CommandBuffer::flush(Handle pos) {
         if (Profiler::is_profiling()) {
             mgb_log_debug("%s Flushed", to_string(*iter).c_str());
         }
-        m_owner->m_worker.add_task(IdentifiedCommand{Profiler::next_id(), std::move(*iter)});
+        m_owner->m_worker.add_task(std::move(*iter));
     }
     m_commands.erase(m_commands.begin(), pos);
 }
@@ -1248,7 +1254,7 @@ auto ChannelImpl::CommandBuffer::flush_pos_for(const Command& cmd) -> Handle {
         return m_commands.begin() + (m_commands.size() - buffer_length);
     }
     return m_commands.begin();
-    }, cmd);
+    }, cmd.data);
 }
 /**
@@ -1261,7 +1267,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) {
     // TODO: eliminate Puts
     auto begin = m_commands.begin(), end = m_commands.end();
     auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd){
-        if (auto* apply = std::get_if<ApplyOp>(&cmd)) {
+        if (auto* apply = std::get_if<ApplyOp>(&cmd.data)) {
             return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0;
         }
         return false;
@@ -1270,7 +1276,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) {
         return false;
     }
     // mgb_log_debug("%s Fused", to_string(Command{cmd}).c_str());
-    std::get<ApplyOp>(*apply_iter).dels.push_back(dest);
+    std::get<ApplyOp>(apply_iter->data).dels.push_back(dest);
     return true;
 }
@@ -1297,7 +1303,7 @@ auto ChannelImpl::CommandBuffer::find_last_usage(TensorInfo* dest, Range range)
                 found = iter;
             }
         }
-        }, *iter);
+        }, iter->data);
     };
     return found;
 }
@@ -1313,7 +1319,7 @@ auto ChannelImpl::CommandBuffer::find_produce(TensorInfo* dest, Range range)
             return cmd.dest == dest;
         }
         return false;
-    }, cmd);
+    }, cmd.data);
     });
 }
@@ -1340,7 +1346,7 @@ void ChannelImpl::push_scope(std::string name) {
     MGB_LOCK_GUARD(m_spin);
     mgb_assert(check_available(), "Channel already closed");
     auto& state = get_channel_state();
-    state.scopes.push(name);
+    state.stack_manager.enter(name);
     RECORD_EVENT(ScopeEvent, name);
     m_buffer.enqueue(PushScope{name});
 }
@@ -1349,7 +1355,7 @@ void ChannelImpl::pop_scope(std::string name) {
     MGB_LOCK_GUARD(m_spin);
     mgb_assert(check_available(), "Channel already closed");
     auto& state = get_channel_state();
-    state.scopes.pop(name);
+    state.stack_manager.exit(name);
     RECORD_EVENT(ScopeFinishEvent, name);
     m_buffer.enqueue(PopScope{name});
 }
...
@@ -26,6 +26,7 @@
 #include "./commands.h"
 #include "./tensor_info.h"
 #include "./option_manager.h"
+#include "./stack_manager.h"
 #include "../profiler/events.h"
@@ -94,7 +95,7 @@
     TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop);
     void notify_tensor_unsafe(TensorInfo* info);
-    void process_one_task(IdentifiedCommand&);
+    void process_one_task(Command&);
     void check_worker_exc_unsafe();
@@ -129,10 +130,10 @@
     void assert_in_worker();
     std::thread::id get_worker_tid();

-    template <typename TCommand>
-    void enqueue_command(TCommand&& cmd) {
-        m_buffer.enqueue(Command{std::forward<TCommand>(cmd)});
-    }
+    // template <typename TCommand>
+    // void enqueue_command(TCommand&& cmd) {
+    //     m_buffer.enqueue(Command{std::forward<TCommand>(cmd)});
+    // }

     void sample_on_device(CompNode device, bool force);
@@ -153,13 +154,13 @@
     bool m_applying = false;
     bool m_closed = false;

-    struct WorkQueue : AsyncQueueSC<IdentifiedCommand, WorkQueue> {
+    struct WorkQueue : AsyncQueueSC<Command, WorkQueue> {
         // set max_spin=0 to prevent the queue from fetching tasks in a busy-wait manner.
         // this won't affect throughput when the python interpreter is sending enough tasks,
         // but will significantly save CPU time when waiting for tasks, e.g. waiting for data input
         // limit pending tasks to 10000
         WorkQueue(ChannelImpl* owner)
-                : AsyncQueueSC<IdentifiedCommand, WorkQueue>(0, 10000), m_owner(owner) {
+                : AsyncQueueSC<Command, WorkQueue>(0, 10000), m_owner(owner) {
             sys::set_thread_name("interpreter");
             if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) {
                 int len = strlen(env_val);
@@ -171,7 +172,7 @@
                 update_max_items(val);
             }
         }
-        void process_one_task(IdentifiedCommand& icmd) {
+        void process_one_task(Command& icmd) {
             m_owner->process_one_task(icmd);
         }
         void on_async_queue_worker_thread_start() override;
@@ -193,7 +194,7 @@
      */
     struct CommandBuffer {
         CommandBuffer(ChannelImpl* owner) : m_owner(owner) {}
-        void enqueue(Command cmd);
+        void enqueue(CommandData cmd);
         bool empty() const {
             return m_commands.empty();
         }
@@ -224,91 +225,13 @@
     //! level 0: both sync.
     int m_async_level = 2;

-    struct Scope {
-        std::string name;
-        std::unordered_map<std::string, std::unique_ptr<Scope>> children;
-        size_t version = 0;
-        size_t parent_version = 0;
-        size_t tensor_count = 0;
-        Scope* active_child = nullptr;
-        Scope* parent = nullptr;
-
-        Scope* enter(std::string name) {
-            auto& child = children[name];
-            if (!child) {
-                child = std::make_unique<Scope>();
-                child->name = name;
-                child->parent = this;
-            }
-            if (version != child->parent_version) {
-                child->version = 0;
-                child->parent_version = version;
-            } else {
-                child->version++;
-            }
-            child->tensor_count = 0;
-            return active_child = child.get();
-        }
-
-        Scope* exit(std::string name) {
-            mgb_assert(this->name == name, "scope name mismatch");
-            parent->active_child = nullptr;
-            return parent;
-        }
-    };
-
-    class ScopeManager {
-    private:
-        Scope m_root;
-        Scope* m_current_scope = &m_root;
-    public:
-        class ScopeGuard {
-        private:
-            ScopeManager* m_manager;
-            std::string m_name;
-        public:
-            ScopeGuard(ScopeManager* manager, std::string name): m_manager{manager}, m_name{name} {
-                m_manager->push(m_name);
-            }
-            ~ScopeGuard() {
-                m_manager->pop(m_name);
-            }
-        };
-        void push(std::string name) {
-            m_current_scope = m_current_scope->enter(name);
-        }
-        void pop(std::string name) {
-            m_current_scope = m_current_scope->exit(name);
-        }
-        std::string next_tensor_name() {
-            std::string builder;
-            Scope* scope = &m_root;
-            while (true) {
-                builder.append(scope->name);
-                if (scope->version != 0) {
-                    builder.append(ssprintf("(%ld)", scope->version));
-                }
-                if (scope != &m_root) {
-                    builder.append(".");
-                }
-                if (scope->active_child == nullptr) {
-                    builder.append(ssprintf(":%%%ld", scope->tensor_count++));
-                    break;
-                } else {
-                    scope = scope->active_child;
-                }
-            }
-            return builder;
-        }
-    };
-
     struct State {
         std::thread::id tid;
         OptionManager options;
     };

     struct ChannelState: State {
-        ScopeManager scopes;
+        StackManager stack_manager;
     };

     struct WorkerState: State {};
...
/**
* \file imperative/src/impl/interpreter/stack_manager.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include <string>
#include <memory>
#include <unordered_map>
#include "megbrain/utils/metahelper.h"
#include "megbrain/utils/small_vector.h"
namespace mgb::imperative::interpreter::intl{
class StackSnapshot;
class StackManager: public NonCopyableObj {
public:
class Node;
class Guard;
struct Frame;
class Trace;
private:
std::unique_ptr<Node> m_root = nullptr;
Node* m_current = nullptr;
SmallVector<uint64_t> m_trace_id_stack;
uint64_t m_last_trace_id = 0;
public:
StackManager();
std::pair<Node*, uint64_t> enter(std::string name);
void exit(std::string name);
Trace dump();
Node* current();
};
class StackManager::Node: public NonCopyableObj {
private:
std::string m_name;
std::unordered_map<std::string, std::unique_ptr<Node>> m_children;
std::unordered_map<std::string, size_t> m_id_table;
Node* m_parent = nullptr;
int64_t m_depth = -1;
uint64_t m_version = 0;
explicit Node(std::string name, Node* parent): m_name{name}, m_parent{parent} {
if (parent) {
m_depth = parent->m_depth + 1;
}
}
public:
const std::string& name() const {
return m_name;
}
Node* operator[](const std::string& name) {
auto& child = m_children[name];
if (child == nullptr) {
child.reset(new Node(name, this));
}
return child.get();
}
Node* parent() {
return m_parent;
}
bool is_root() {
return m_parent == nullptr;
}
uint64_t version() const {
return m_version;
}
void update_version() {
++m_version;
for (auto&& [key, child]: m_children) {
child->reset_version();
}
m_id_table.clear();
}
void reset_version() {
m_version = 0;
m_id_table.clear();
}
int64_t depth() const {
return m_depth;
}
uint64_t next_id(std::string key) {
return m_id_table[key]++;
}
static std::unique_ptr<Node> make() {
return std::unique_ptr<Node>(new Node("", nullptr));
}
};
class StackManager::Guard {
private:
std::string m_name;
StackManager* m_manager;
public:
Guard(std::string name, StackManager* manager): m_name{name}, m_manager{manager}{
if (m_manager) {
m_manager->enter(name);
}
}
~Guard() {
release();
}
void release() {
if (m_manager) {
m_manager->exit(m_name);
m_manager = nullptr;
}
}
};
struct StackManager::Frame {
StackManager::Node* node;
uint64_t version;
};
class StackManager::Trace {
private:
SmallVector<StackManager::Frame> m_frames;
uint64_t m_id = 0;
public:
explicit Trace(StackManager::Node* top, uint64_t id): m_id{id} {
int64_t nr_frames = top->depth() + 1;
m_frames = SmallVector<StackManager::Frame>(nr_frames);
StackManager::Node* node = top;
for (int64_t i = 0; i < nr_frames; ++i) {
m_frames[m_frames.size()-1-i] = {node, node->version()};
node = node->parent();
}
        mgb_assert(node->is_root(), "");
}
Trace() = default;
std::string to_string() const {
std::string buffer;
for (auto&& [node, version]: m_frames) {
if (!buffer.empty()) {
buffer.append(".");
}
buffer.append(node->name());
if (version != 0) {
buffer.append(ssprintf("[%zu]", version));
}
}
return buffer;
}
const SmallVector<StackManager::Frame>& frames() const {
return m_frames;
}
uint64_t id() const {
return m_id;
}
};
inline StackManager::StackManager() {
m_root = Node::make();
m_current = m_root.get();
}
inline std::pair<StackManager::Node*, uint64_t> StackManager::enter(std::string name) {
m_current = (*m_current)[name];
m_trace_id_stack.push_back(++m_last_trace_id);
return {m_current, m_current->version()};
}
inline void StackManager::exit(std::string name) {
mgb_assert(m_current->name() == name, "scope name mismatch");
m_current = m_current->parent();
m_trace_id_stack.pop_back();
m_current->update_version();
}
inline StackManager::Trace StackManager::dump() {
return Trace(m_current, m_trace_id_stack.empty() ? 0 : m_trace_id_stack.back());
}
inline StackManager::Node* StackManager::current() {
return m_current;
}
}
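Note: a short usage sketch of the stack_manager.h API above (illustrative only, not part of the commit). Guard enters a scope on construction and exits it on destruction; dump() snapshots the current path as a Trace; exiting a child bumps the parent's version, which shows up as a "[n]" suffix on the next visit:

    StackManager sm;
    auto g0 = StackManager::Guard{"forward", &sm};
    {
        auto g1 = StackManager::Guard{"conv1", &sm};
        // sm.dump().to_string() == "forward.conv1"
    }   // exiting "conv1" increments the version of "forward"
    {
        auto g1 = StackManager::Guard{"conv1", &sm};
        // sm.dump().to_string() == "forward[1].conv1"
    }
    // per-scope counters, as used by ChannelImpl::alloc():
    // sm.current()->next_id("tensor") yields 0, 1, 2, ... within one scope version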
@@ -102,7 +102,8 @@ std::vector<std::pair<const char*, std::string>> props(const OpDef& def) {
 }

 std::string make_name(const OpDef& def) {
-    return "OprAttr";
+    auto&& attr = def.cast_final_safe<OprAttr>();
+    return attr.type;
 }

 OP_TRAIT_REG(OprAttr, OprAttr)
...
@@ -27,25 +27,11 @@
 namespace mgb {
 namespace imperative {

-uint64_t Timer::get_nsecs() {
-    using namespace std::chrono;
-    auto finish = steady_clock::now();
-    auto duration = duration_cast<nanoseconds>(finish - m_start);
-    return duration.count();
-}
-
-uint64_t Timer::get_started_at() {
-    return m_started_at;
-}
-
-void Timer::reset() {
-    using namespace std::chrono;
-    m_start = steady_clock::now();
-    auto now_ns = duration_cast<nanoseconds>(std::chrono::system_clock::now().time_since_epoch());
-    m_started_at = now_ns.count();
-}
+profiler::Time Timer::record_host() {
+    return std::chrono::high_resolution_clock::now();
+}

-std::shared_ptr<CompNode::Event> Timer::record_event(CompNode device) {
+std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) {
     auto event = EventPool::with_timer().alloc_shared(device);
     event->record();
     return event;
@@ -55,13 +41,13 @@ Profiler::options_t Profiler::sm_profile_options;
 std::mutex Profiler::sm_mutex;
 std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers;
 Timer Profiler::sm_timer;
+profiler::HostTime Profiler::sm_start_at;
 std::atomic_uint64_t Profiler::sm_last_id = 0;
 bool Profiler::sm_profiling = false;
 thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>();
 std::atomic_size_t Profiler::sm_preferred_capacity;

 auto Profiler::get_thread_dict() -> thread_dict_t {
-    MGB_LOCK_GUARD(sm_mutex);
     thread_dict_t thread_dict;
     for (auto&& [tid, profiler]: sm_profilers) {
         thread_dict[tid] = profiler->m_thread_name;
@@ -69,15 +55,13 @@ auto Profiler::get_thread_dict() -> thread_dict_t {
     }
     return thread_dict;
 }
-void Profiler::dump_profile(std::string basename, std::string format, results_t results, options_t options) {
-    auto thread_dict = get_thread_dict();
-    if (format == "chrome_timeline.json") {
-        profiler::dump_chrome_timeline(basename, options, thread_dict, results);
-    } else if (format == "memory_flow.svg") {
-        profiler::dump_memory_flow(basename, options, thread_dict, results);
-    } else {
-        mgb_log_error("unsupported profiling format %s", format.c_str());
-    }
+void Profiler::dump_profile(std::string basename, std::string format, bundle_t result) {
+    static const std::unordered_map<std::string, void(*)(std::string, bundle_t)> format_table = {
+            {"chrome_timeline.json", profiler::dump_chrome_timeline},
+            {"memory_flow.svg", profiler::dump_memory_flow},
+    };
+    auto iter = format_table.find(format);
+    if (iter == format_table.end()) {
+        mgb_log_error("unsupported profiling format %s", format.c_str());
+        return;
+    }
+    (iter->second)(basename, std::move(result));
 }

 } // namespace imperative
...
@@ -12,7 +12,9 @@
 #pragma once

 #include "megbrain/utils/small_vector.h"
+#include "megbrain/imperative/profiler.h"

+#include "../interpreter/stack_manager.h"
 #include "../op_trait.h"

 namespace mgb::imperative::profiler {
@@ -52,6 +54,11 @@ struct ToStringTrait<profiler::TensorProp>{

 namespace mgb::imperative::profiler {

+using Trace = interpreter::intl::StackManager::Trace;
+
+struct ProfileOperatorState;
+struct ProfileTensorState;
+
 #define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__;
 #define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__;

@@ -61,6 +68,7 @@ DEF_EVENT(OpDispatch, {
     std::function<OpParams()> op_params;
     SmallVector<uint64_t> inputs;
     SmallVector<uint64_t> outputs;
+    Trace trace;
 });
 DEF_DUR_EVENT(OpInput, {
@@ -68,11 +76,6 @@ DEF_DUR_EVENT(OpInput, {
     TensorShape shape;
 });

-DEF_DUR_EVENT(OpDel, {
-    uint64_t tensor_id;
-    TensorShape shape;
-});
-
 DEF_DUR_EVENT(OpOutput, {
     uint64_t tensor_id;
     TensorShape shape;
@@ -80,16 +83,13 @@ DEF_DUR_EVENT(OpOutput, {

 DEF_DUR_EVENT(OpExecute, {
     uint64_t op_id;
+    SmallVector<CompNode> device_list;
 });

-DEF_DUR_EVENT(OpPostExecute, {
-    uint64_t op_id;
-});
-
-DEF_DUR_EVENT(KernelExecute, {
+DEF_DUR_EVENT(KernelLaunch, {
     uint64_t op_id;
     uint64_t kernel_id;
-    std::shared_ptr<CompNode::Event> event;
+    CompNode device;
 });

 DEF_EVENT(TensorDeclare, {
@@ -128,19 +128,12 @@ DEF_EVENT(TensorNotifyProp, {
     TensorProp prop;
 });

-DEF_EVENT(TensorWaitProp, {
+DEF_DUR_EVENT(TensorWaitProp, {
     uint64_t tensor_id;
     uint64_t wait_id;
     TensorProp prop;
 });

-DEF_EVENT(TensorWaitPropFinish, {
-    uint64_t tensor_id;
-    uint64_t wait_id;
-    TensorProp prop;
-    bool notified;
-});
-
 DEF_DUR_EVENT(SampleDevice, {
     CompNode device;
     size_t total_memory;
@@ -157,13 +150,10 @@ DEF_DUR_EVENT(Scope, {
     std::string name;
 });

-DEF_DUR_EVENT(DeviceScope, {
-    std::string name;
-    std::shared_ptr<CompNode::Event> event;
-});
-
-DEF_DUR_EVENT(Sync, {});
+DEF_DUR_EVENT(Sync, {
+    Trace trace;
+});

 DEF_DUR_EVENT(StartProfile, {
     size_t capture_count;
 });
@@ -172,10 +162,13 @@ DEF_DUR_EVENT(StopProfile, {
     size_t escape_count;
 });

+enum class TensorCommandKind {
+    Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue
+};
+
 DEF_DUR_EVENT(TensorCommand, {
-    enum Kind {
-        Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue
-    };
+    using Kind = TensorCommandKind;
     uint64_t tensor_id;
     Kind kind;
 });
@@ -187,6 +180,17 @@ DEF_DUR_EVENT(Custom, {
     std::string content;
 });

+DEF_EVENT(RecordDevice, {
+    std::shared_ptr<CompNode::Event> event;
+});
+
+DEF_DUR_EVENT(HostToDevice, {
+    TensorLayout layout;
+    CompNode device;
+    void* host_ptr;
+    void* device_ptr;
+});
+
 #undef DEF_EVENT
 #undef DEF_DUR_EVENT
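Note: for readers new to these macros, DEF_DUR_EVENT stamps out a paired begin/finish struct via token pasting, so the KernelLaunch definition above expands to roughly:

    struct KernelLaunchEvent {
        uint64_t op_id;
        uint64_t kernel_id;
        CompNode device;
    };
    struct KernelLaunchFinishEvent {   // same body, recorded when the launch completes
        uint64_t op_id;
        uint64_t kernel_id;
        CompNode device;
    };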
...
@@ -15,10 +15,12 @@

 #include "megbrain/imperative/profiler.h"

+#include "./states.h"
+
 namespace mgb::imperative::profiler {

-void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results);
+void dump_chrome_timeline(std::string filename, Profiler::bundle_t result);

-void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results);
+void dump_memory_flow(std::string filename, Profiler::bundle_t result);

 }
@@ -89,7 +89,8 @@ struct MemoryChunk {
     std::array<uintptr_t, 2> address;
     std::string name;
     TensorLayout layout;
-    std::array<uint64_t, 2> time;
+    std::array<profiler::Duration, 2> time;
+    std::optional<uint64_t> group;

     bool empty() const {
         return address[1] - address[0] == 0;
return {addr_begin, addr_end}; return {addr_begin, addr_end};
} }
std::pair<uint64_t, uint64_t> time_range() const { std::pair<profiler::Duration, profiler::Duration> time_range() const {
auto time_begin = std::numeric_limits<uint64_t>::max(); auto time_begin = profiler::Duration::max();
auto time_end = std::numeric_limits<uint64_t>::min(); auto time_end = profiler::Duration::min();
for(auto&& [id, chunk]: chunks) { for(auto&& [id, chunk]: chunks) {
MGB_MARK_USED_VAR(id); MGB_MARK_USED_VAR(id);
if (chunk.empty()) continue; if (chunk.empty()) continue;
...@@ -123,27 +124,6 @@ struct MemoryFlow { ...@@ -123,27 +124,6 @@ struct MemoryFlow {
return {time_begin, time_end}; return {time_begin, time_end};
} }
std::shared_ptr<json::Array> to_json() const {
auto results = json::Array::make();
for(auto&& [id, chunk]: chunks) {
MGB_MARK_USED_VAR(id);
if (chunk.empty()) continue;
auto address = json::Array::make();
auto time = json::Array::make();
address->add(json::String::make(std::to_string(chunk.address[0])));
address->add(json::String::make(std::to_string(chunk.address[1])));
time->add(json::String::make(std::to_string(chunk.time[0])));
time->add(json::String::make(std::to_string(chunk.time[1])));
results->add(json::Object::make({
{"address", address},
{"name", json::String::make(chunk.name)},
{"layout", json::String::make(chunk.layout.to_string())},
{"time", time}
}));
}
return results;
}
XMLWriter to_svg() const { XMLWriter to_svg() const {
XMLWriter writer; XMLWriter writer;
auto&& [addr_begin, addr_end] = address_range(); auto&& [addr_begin, addr_end] = address_range();
@@ -157,13 +137,13 @@ struct MemoryFlow {
         svg.attr("xmlns:tag", std::string{"https://megengine.org.cn"});
         double time_scale = 1e5;
         double addr_scale = 1e6;
-        svg.attr("width", (time_end-time_begin)/time_scale);
+        svg.attr("width", (time_end-time_begin).count()/time_scale);
         svg.attr("height", (addr_end-addr_begin)/addr_scale);
         {
             auto rect = writer.element("rect");
             rect.attr("x", 0);
             rect.attr("y", 0);
-            rect.attr("width", (time_end-time_begin)/time_scale);
+            rect.attr("width", (time_end-time_begin).count()/time_scale);
             rect.attr("height", (addr_end-addr_begin)/addr_scale);
             rect.attr("fill", std::string{"blue"});
         }
@@ -177,7 +157,7 @@ struct MemoryFlow {
             {1000 * ms, "#888888"},
             {std::numeric_limits<double>::infinity(), "#555555"},
         };
-        auto time2str = [](uint64_t ns){
+        auto time2str = [](profiler::Duration ns){
             using pair_t = std::pair<uint64_t, const char*>;
             static pair_t units[] = {
                 {1, "ns "},
@@ -189,9 +169,9 @@ struct MemoryFlow {
             auto comparator = [](const pair_t& lhs, const pair_t& rhs) {
                 return lhs.first < rhs.first;
             };
-            while (ns > 0) {
-                auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns, ""), comparator) - 1;
-                builder += std::to_string(ns / iter->first) + iter->second;
+            while (ns.count() > 0) {
+                auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns.count(), ""), comparator) - 1;
+                builder += std::to_string(ns.count() / iter->first) + iter->second;
                 ns = ns % iter->first;
             }
             return builder;
@@ -218,11 +198,11 @@ struct MemoryFlow {
         for (auto&& [id, chunk]: chunks) {
             MGB_MARK_USED_VAR(id);
             if (chunk.empty()) continue;
-            double left = (chunk.time[0]-time_begin)/time_scale;
-            double right = (chunk.time[1]-time_begin)/time_scale;
+            double left = (chunk.time[0]-time_begin).count()/time_scale;
+            double right = (chunk.time[1]-time_begin).count()/time_scale;
             double top = (chunk.address[0]-addr_begin)/addr_scale;
             double bottom = (chunk.address[1]-addr_begin)/addr_scale;
-            double duration = chunk.time[1] - chunk.time[0];
+            double duration = (chunk.time[1] - chunk.time[0]).count();
             {
                 auto rect = writer.element("rect");
                 rect.attr("x", left);
@@ -241,70 +221,48 @@ struct MemoryFlow {
                 mge_attr("produced", time2str(chunk.time[0]));
                 mge_attr("erased", time2str(chunk.time[1]));
                 mge_attr("duration", time2str(chunk.time[1] - chunk.time[0]));
+                if (chunk.group) {
+                    mge_attr("group", std::to_string(*chunk.group));
+                }
             }
         }
         return writer;
     }
 };
-void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results) {
-    MemoryFlow flow;
-    ProfileDataCollector collector;
-    ProfileState state;
-#define HANDLE_EVENT(type, ...) \
-    collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ );
-
-    HANDLE_EVENT(TensorDeclareEvent, {
-        auto& tensor_state = state.tensors[event.tensor_id] = {};
-        tensor_state.id = event.tensor_id;
-        tensor_state.name = event.name;
-    });
-
-    HANDLE_EVENT(TensorProduceEvent, {
-        auto& tensor_state = state.tensors[event.tensor_id];
-        tensor_state.device = event.device;
-        tensor_state.layout = event.layout;
-        tensor_state.produced = time;
-        state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()});
-        state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced});
-        auto& chunk = flow.chunks[event.tensor_id];
-        uintptr_t address = reinterpret_cast<uintptr_t>(event.ptr);
-        auto span = event.layout.span();
-        auto dtype = event.layout.dtype;
-        // assume dtype is not lowbit
-        if (!address) {
-            chunk.address = {0, 0};
-        } else {
-            chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()};
-        }
-        chunk.layout = tensor_state.layout;
-        chunk.time[0] = time;
-        chunk.name = tensor_state.name;
-    });
-
-    HANDLE_EVENT(TensorReleaseEvent, {
-        auto& tensor_state = state.tensors[event.tensor_id];
-        state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()});
-        state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced});
-        auto& chunk = flow.chunks[event.tensor_id];
-        chunk.time[1] = time;
-    });
-
-    HANDLE_EVENT(ScopeEvent, {
-        state.threads[tid].scope_stack.push_back(event.name);
-    });
-
-    HANDLE_EVENT(ScopeFinishEvent, {
-        mgb_assert(state.threads[tid].scope_stack.back() == event.name);
-        state.threads[tid].scope_stack.pop_back();
-    });
-
-    for (auto&& result: results) {
-        collector(result.second.id, result.first, result.second.time, result.second.data);
-    }
-    debug::write_to_file(filename.c_str(), flow.to_svg().to_string());
-}
+struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> {
+    MemoryFlow memory_flow;
+
+    template <typename TEvent>
+    void visit_event(const TEvent &event) {
+        if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) {
+            auto& chunk = memory_flow.chunks[event.tensor_id];
+            uint64_t address = reinterpret_cast<uintptr_t>(event.ptr);
+            auto span = event.layout.span();
+            auto dtype = event.layout.dtype;
+            // assume dtype is not lowbit
+            if (!address) {
+                chunk.address = {0, 0};
+            } else {
+                chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()};
+            }
+            chunk.layout = event.layout;
+            chunk.time[0] = since_start(to_device_time(current->time, current_tensor->device));
+            chunk.name = current_tensor->name;
+            chunk.group = current_tensor->source;
+        } else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) {
+            auto& chunk = memory_flow.chunks[event.tensor_id];
+            chunk.time[1] = since_start(to_device_time(current->time, current_tensor->device));
+        }
+    }
+
+    void notify_counter(std::string key, int64_t old_val, int64_t new_val) {}
+};
+
+void dump_memory_flow(std::string filename, Profiler::bundle_t result) {
+    MemoryFlowVisitor visitor;
+    visitor.process_events(std::move(result));
+    debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string());
+}
...
@@ -3,6 +3,9 @@
 #include <set>
 #include <any>
 #include <typeindex>
+#include <sstream>
+
+#include "nlohmann/json.hpp"

 #include "megbrain/tensor.h"

@@ -10,24 +13,16 @@
 namespace mgb::imperative::profiler {

-struct ProfileDeviceState {
-    int64_t index;
-    CompNode device;
-    std::shared_ptr<CompNode::Event> base_event;
-    uint64_t base_time; //in ns
-};
-
-struct ProfileWorkerState {
-};
+using StackManager = interpreter::intl::StackManager;

 struct ProfileTensorState {
-    uint64_t id;
+    uint64_t id = 0;
+    std::optional<uint64_t> source;
     TensorLayout layout;
     CompNode device;
     std::string name;
-    uint64_t produced = 0;
-    uint64_t living_time = 0;
+    profiler::HostTime produced = profiler::HostTime::min();
+    profiler::Duration living_time = profiler::Duration::zero();

     size_t size_in_bytes() const {
         if (!layout.dtype.valid()) {
@@ -35,41 +30,51 @@ struct ProfileTensorState {
         }
         return layout.dtype.size(layout.total_nr_elems());
     }
-};

-struct ProfileStaticsState {
-    size_t op_enqueue_count = 0;
-    size_t op_execute_count = 0;
-    size_t wait_value_count = 0;
-    size_t wait_shape_count = 0;
-    size_t exception_count = 0;
-    size_t infer_shape_valid_count = 0;
-    size_t infer_shape_invalid_count = 0;
-    size_t alive_tensor_count = 0;
-    size_t produce_tensor_count = 0;
-    size_t erase_tensor_count = 0;
-    size_t wait_prop_count = 0;
-    size_t redundant_tensor_count = 0;
+
+    std::string info(HostTime current_time) {
+        std::string shape = layout.TensorShape::to_string();
+        std::string dtype = layout.dtype.name();
+        return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.to_string().c_str());
+    }
+
+    nlohmann::json detail(HostTime current_time) {
+        nlohmann::json args;
+        args["id"] = id;
+        args["name"] = name;
+        args["shape"] = layout.TensorShape::to_string();
+        args["dtype"] = layout.dtype.name();
+        args["nr_elements"] = layout.total_nr_elems();
+        args["device"] = device.to_string();
+        if (produced != produced.min()) {
+            double ms_count = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(current_time - produced + living_time).count();
+            args["living_time"] = ssprintf("%lf ms", ms_count);
+        }
+        return args;
+    }
 };
 struct ProfileOperatorState {
-    uint64_t id;
+    uint64_t id = 0;
     std::string name;
     OpParams params;
     SmallVector<uint64_t> inputs;
     SmallVector<uint64_t> outputs;
     CompNode device;
+    Trace trace;

-    uint64_t host_begin;
-    uint64_t host_end;
-    std::shared_ptr<CompNode::Event> device_begin;
-    std::shared_ptr<CompNode::Event> device_end;
-};
+    profiler::HostTime execute_begin;
+    profiler::HostTime execute_end;

-struct ProfileThreadState {
-    std::thread::id tid;
-    int64_t index;
-    std::vector<std::string> scope_stack;
+    nlohmann::json detail() {
+        nlohmann::json args;
+        for (auto&& [name, value]: params) {
+            args[name] = value;
+        }
+        args["__id__"] = id;
+        args["__name__"] = name;
+        args["__device__"] = device.to_string();
+        return args;
+    }
 };
template <typename TProp> template <typename TProp>
...@@ -93,37 +98,12 @@ struct ProfileTensorPropPair { ...@@ -93,37 +98,12 @@ struct ProfileTensorPropPair {
using ProfileTensorSizePair = ProfileTensorPropPair<size_t>; using ProfileTensorSizePair = ProfileTensorPropPair<size_t>;
using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>; using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>;
struct GeneralTensorEvent {
uint64_t tensor_id;
std::type_index type;
};
struct ProfileState { struct ProfileState {
std::unordered_map<uint64_t, ProfileTensorState> tensors; std::unordered_map<uint64_t, ProfileTensorState> tensors;
std::unordered_map<uint64_t, ProfileOperatorState> operators; std::unordered_map<uint64_t, ProfileOperatorState> operators;
std::unordered_map<std::string, uint64_t> tensor_name_counter; std::unordered_map<std::string, uint64_t> tensor_name_counter;
std::set<ProfileTensorSizePair> tensors_by_size; std::set<ProfileTensorSizePair> tensors_by_size;
std::set<ProfileTensorSizePair> tensors_by_produced; std::set<ProfileTensorSizePair> tensors_by_produced;
ProfileWorkerState worker;
ProfileStaticsState statics;
std::unordered_map<std::thread::id, ProfileThreadState> threads;
CompNode::UnorderedMap<ProfileDeviceState> devices;
ProfileThreadState& operator[](std::thread::id tid) {
if (threads.count(tid) == 0) {
threads[tid].tid = tid;
threads[tid].index = threads.size();
}
return threads[tid];
}
ProfileDeviceState& operator[](CompNode device) {
if (devices.count(device) == 0) {
devices[device].device = device;
devices[device].index = devices.size();
}
return devices[device];
}
std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) { std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) {
std::vector<uint64_t> results; std::vector<uint64_t> results;
...@@ -138,19 +118,233 @@ struct ProfileState { ...@@ -138,19 +118,233 @@ struct ProfileState {
} }
return results; return results;
} }
};
std::string concat_scope(std::thread::id tid) { template<typename T, typename = void>
auto& scope_stack = threads[tid].scope_stack; struct is_op_event : std::false_type { };
if (scope_stack.empty()) {
return {}; template<typename T>
} struct is_op_event<T, decltype(std::declval<T>().op_id, void())> : std::true_type { };
std::string result = scope_stack[0];
for (size_t i = 1; i < scope_stack.size(); ++i) { template<typename T, typename = void>
result += "::"; struct is_tensor_event : std::false_type { };
result += scope_stack[i];
template<typename T>
struct is_tensor_event<T, decltype(std::declval<T>().tensor_id, void())> : std::true_type { };
template<typename T, typename = void>
struct is_trace_event : std::false_type { };
template<typename T>
struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_type { };
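The `decltype(expr, void())` trick above makes each partial specialization viable only when the member exists, so event types classify themselves by shape instead of through an explicit registry. A toy check with stand-in event types (not the real profiler events):

    struct FakeOpEvent { uint64_t op_id; };
    struct FakeTensorEvent { uint64_t tensor_id; };
    static_assert(is_op_event<FakeOpEvent>::value);
    static_assert(!is_op_event<FakeTensorEvent>::value);
    static_assert(is_tensor_event<FakeTensorEvent>::value);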
template <typename... TItems>
class AnyToVariantConverter {
public:
using any_t = std::any;
using variant_t = std::variant<TItems...>;
private:
std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table;
template <typename TItem>
void register_converter() {
m_table[typeid(TItem)] = [](any_t input) {
return variant_t(std::any_cast<TItem>(std::move(input)));
};
}
public:
AnyToVariantConverter() {
(register_converter<TItems>(), ...);
}
variant_t operator()(any_t input) {
return m_table[input.type()](std::move(input));
}
};
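A brief usage sketch of the converter; the two alternatives here are placeholders, while the profiler instantiates it with the full event list below:

    AnyToVariantConverter<int, std::string> converter;
    std::any boxed = std::string("event");
    auto v = converter(std::move(boxed)); // std::variant<int, std::string>
    std::visit([](auto&& value) { /* dispatch on the concrete type */ }, v);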
template <typename TSelf>
class EventVisitor {
private:
std::unordered_map<size_t, ProfileOperatorState> m_operators;
std::unordered_map<size_t, ProfileTensorState> m_tensors;
std::unordered_map<size_t, std::vector<Profiler::Record>> m_duration_stack;
HostTime m_start_time;
CompNode::UnorderedMap<size_t> m_device_tid_table;
std::unordered_map<std::thread::id, size_t> m_host_tid_table;
CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> m_device_timeline;
std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack;
std::unordered_map<std::string, int64_t> m_counter_table;
protected:
Profiler::Record* current;
ProfileOperatorState* current_op;
ProfileTensorState* current_tensor;
protected:
profiler::Duration since_start(profiler::HostTime time) {
return time - m_start_time;
}
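// Device-time correction (summary of to_device_time below): every
// RecordDeviceEvent yields one calibration point host_time -> shift,
// where shift is how far the device clock deviates from the host
// clock at that moment. A query between two calibration points gets
// a linearly interpolated shift:
//   ratio       = (t - lower.host) / (upper.host - lower.host)
//   shift       = lower.shift + ratio * (upper.shift - lower.shift)
//   device_time = t + shift
// Outside the calibrated range the nearest point's shift is reused.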
profiler::HostTime to_device_time(profiler::HostTime time, CompNode device) {
auto& device_timeline = m_device_timeline[device];
auto upper = device_timeline.lower_bound(time);
if (upper == device_timeline.end()) {
if (upper == device_timeline.begin()) {
return time;
} else {
--upper;
return time + std::chrono::duration_cast<profiler::Duration>(upper->second);
}
} else if (upper->first == time) {
return time + std::chrono::duration_cast<profiler::Duration>(upper->second);
} else if (upper == device_timeline.begin()) {
return time + std::chrono::duration_cast<profiler::Duration>(upper->second);
} }
auto lower = upper;
-- lower;
double ratio = ((double)(time - lower->first).count() / (double)(upper->first - lower->first).count());
mgb_assert(ratio > 0 && ratio < 1, "invalid ratio");
mgb_assert(lower->first + lower->second <= upper->first + upper->second, "device timeline must be monotonic");
auto shift = lower->second + ratio * (upper->second - lower->second);
auto result = time + std::chrono::duration_cast<profiler::Duration>(shift);
return result; return result;
} }
size_t to_tid(std::thread::id host_tid) {
return m_host_tid_table.at(host_tid);
}
size_t to_tid(CompNode device) {
return m_device_tid_table.at(device);
}
void inc_counter(const char* key, int64_t delta) {
if (!m_counter_table.count(key)) {
m_counter_table[key] = 0;
}
auto& value = m_counter_table[key];
static_cast<TSelf&>(*this).notify_counter(key, value, value + delta);
value += delta;
}
public:
void process_events(Profiler::bundle_t bundle) {
m_start_time = bundle.start_at;
auto& self = static_cast<TSelf&>(*this);
AnyToVariantConverter<OpDispatchEvent, OpExecuteEvent, OpExecuteFinishEvent,
KernelLaunchEvent, KernelLaunchFinishEvent,
OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent,
TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent,
TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent,
SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent,
StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent,
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent,
CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent> converter;
auto for_each_entry = [&](auto&& handler) {
for (auto& entry: bundle.entries) {
current = &entry;
std::visit(handler, converter(entry.data));
}
current = nullptr;
};
// build device timeline
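// Each record pairs a host timestamp with a device event: the first
// pair per device anchors the device clock, and every later pair adds
// one calibration entry (host_time -> shift) consumed by to_device_time().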
struct DeviceStartPair {
profiler::HostTime host;
std::shared_ptr<CompNode::Event> device;
};
CompNode::UnorderedMap<DeviceStartPair> device_start_table;
for_each_entry([&](auto&& event){
using T = std::decay_t<decltype(event)>;
if constexpr (std::is_same_v<T, RecordDeviceEvent>) {
using namespace std::chrono_literals;
DeviceStartPair& device_start = device_start_table[event.event->comp_node()];
if (!device_start.device) {
device_start = { current->time, event.event };
}
event.event->host_wait();
auto device_time = (device_start.host - current->time) + std::chrono::duration_cast<profiler::RealDuration>(device_start.device->elapsed_time_until(*event.event) * 1s);
m_device_timeline[event.event->comp_node()][current->time] = device_time;
}
});
// register host threads
for_each_entry([&](auto&& event){
if (!m_host_tid_table.count(current->tid)) {
m_host_tid_table[current->tid] = {m_device_tid_table.size() + m_host_tid_table.size()};
}
});
for_each_entry([&](auto&& event){
using T = std::decay_t<decltype(event)>;
if constexpr (std::is_same_v<T, OpDispatchEvent>) {
auto& op = m_operators[event.op_id];
mgb_assert(op.id == 0, "duplicate operator id");
op.id = event.op_id;
op.name = event.op_name;
op.params = event.op_params();
op.inputs = event.inputs;
op.outputs = event.outputs;
op.trace = event.trace;
for (auto&& output: event.outputs) {
m_tensors.at(output).source = op.id;
}
} else if constexpr (std::is_same_v<T, TensorDeclareEvent>) {
auto& tensor = m_tensors[event.tensor_id];
mgb_assert(tensor.id == 0, "duplicated tensor id");
tensor.id = event.tensor_id;
tensor.name = event.name;
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
auto& tensor = m_tensors.at(event.tensor_id);
if (!m_device_tid_table.count(event.device)) {
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
}
tensor.device = event.device;
}
});
// replay execution
using namespace std::placeholders;
for_each_entry([&](auto&& event){
using T = std::decay_t<decltype(event)>;
// update current_op/tensor
if constexpr (is_op_event<T>::value) {
current_op = &m_operators.at(event.op_id);
} else if constexpr (is_tensor_event<T>::value) {
current_tensor = &m_tensors.at(event.tensor_id);
}
if constexpr (std::is_same_v<T, OpExecuteEvent>) {
current_op->execute_begin = current->time;
} else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) {
current_op->execute_end = current->time;
}
// update counters
if constexpr (std::is_same_v<T, OpDispatchEvent>) {
inc_counter("nr_op_pending", 1);
} else if constexpr (std::is_same_v<T, OpExecuteEvent>) {
inc_counter("nr_op_pending", -1);
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
inc_counter("nr_alive_tensor", 1);
} else if constexpr (std::is_same_v<T, TensorReleaseEvent>) {
inc_counter("nr_alive_tensor", -1);
} else if constexpr (std::is_same_v<T, TensorEraseEvent>) {
if (event.use_count == 0) {
inc_counter("nr_redunant_tensor", 1);
}
} else if constexpr (std::is_same_v<T, ShapeInferEvent>) {
if (!event.success) {
inc_counter("nr_shape_infer_failure", 1);
}
} else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) {
inc_counter("nr_exception", 1);
}
// visit_event_impl
self.visit_event(event);
// reset current_op/tensor
if constexpr (is_op_event<T>::value) {
current_op = nullptr;
} else if constexpr (is_tensor_event<T>::value) {
current_tensor = nullptr;
}
});
}
}; };
} }
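To illustrate the CRTP contract of EventVisitor, a minimal concrete visitor might look like the following sketch (not part of this diff; the handled event subset is arbitrary):

    // Counts op executions; visit_event/notify_counter are the two hooks
    // the EventVisitor base invokes from process_events().
    class CountingVisitor : public EventVisitor<CountingVisitor> {
    public:
        size_t nr_op_executed = 0;

        template <typename TEvent>
        void visit_event(const TEvent&) {
            if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) {
                ++nr_op_executed;
            }
        }

        void notify_counter(std::string key, int64_t old_val, int64_t new_val) {
            // called by inc_counter() before the stored value is updated
        }
    };

    // CountingVisitor visitor;
    // visitor.process_events(Profiler::collect());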
...@@ -25,6 +25,9 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { ...@@ -25,6 +25,9 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) {
auto on_seq_start = [this](CompSeqExecBeforeStart const& event) { auto on_seq_start = [this](CompSeqExecBeforeStart const& event) {
// reset // reset
mgb_assert(!event.graph->options().imperative_proxy_graph); mgb_assert(!event.graph->options().imperative_proxy_graph);
CompNode::foreach([](CompNode device){
Profiler::record<RecordDeviceEvent>(Timer::record_device(device));
});
if (m_opr_dict.empty() && m_var_dict.empty()) { if (m_opr_dict.empty() && m_var_dict.empty()) {
init_seq(event.exec); init_seq(event.exec);
} }
...@@ -122,11 +125,13 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { ...@@ -122,11 +125,13 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) {
}; };
auto on_before_kern = [this](BeforeKernel const& event) { auto on_before_kern = [this](BeforeKernel const& event) {
OperatorNodeBase* opr = event.opr; OperatorNodeBase* opr = event.opr;
Profiler::record<KernelExecuteEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); Profiler::record<KernelLaunchEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node);
Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node));
}; };
auto on_after_kern = [this](AfterKernel const& event) { auto on_after_kern = [this](AfterKernel const& event) {
OperatorNodeBase* opr = event.opr; OperatorNodeBase* opr = event.opr;
Profiler::record<KernelExecuteFinishEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node));
Profiler::record<KernelLaunchEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node);
}; };
auto on_graph_compile = [this](const CompSeqOrderDetermined&) { auto on_graph_compile = [this](const CompSeqOrderDetermined&) {
m_opr_dict.clear(); m_opr_dict.clear();
......
...@@ -32,15 +32,22 @@ ...@@ -32,15 +32,22 @@
namespace mgb { namespace mgb {
namespace imperative { namespace imperative {
namespace profiler {
using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>;
using Duration = std::chrono::nanoseconds;
using RealDuration = std::chrono::duration<double, std::nano>;
using Time = HostTime;
} // namespace profiler
class Timer { class Timer {
public: public:
void reset(); using Time = profiler::Time;
uint64_t get_nsecs(); static profiler::Time record_host();
uint64_t get_started_at(); static std::shared_ptr<CompNode::Event> record_device(CompNode device);
static std::shared_ptr<CompNode::Event> record_event(CompNode device);
private:
decltype(std::chrono::steady_clock::now()) m_start;
uint64_t m_started_at;
}; };
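A brief sketch of how the two helpers above pair up (`cn` is assumed to be a valid CompNode):

    profiler::HostTime t0 = Timer::record_host(); // wall-clock timestamp
    auto ev = Timer::record_device(cn);           // device event marker
    // After ev->host_wait(), ev->elapsed_time_until(*other) relates two
    // device markers, as done when building the device timeline.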
...@@ -48,7 +55,8 @@ class Profiler { ...@@ -48,7 +55,8 @@ class Profiler {
public: public:
struct Record { struct Record {
uint64_t id; uint64_t id;
uint64_t time; //in ns std::thread::id tid;
profiler::Time time;
std::any data; std::any data;
}; };
enum Status: uint8_t { enum Status: uint8_t {
...@@ -56,23 +64,32 @@ public: ...@@ -56,23 +64,32 @@ public:
Recording = 1, Recording = 1,
Collecting = 2, Collecting = 2,
}; };
using ProfileCollector = std::function<void(std::thread::id, Record)>; struct ResultBundle;
using ProfileCollector = std::function<void(Record)>;
using option_t = uint64_t; using option_t = uint64_t;
using options_t = std::unordered_map<std::string, option_t>; using options_t = std::unordered_map<std::string, option_t>;
using result_t = std::pair<std::thread::id, Record>; using entry_t = Record;
using results_t = std::vector<result_t>; using bundle_t = ResultBundle;
using thread_dict_t = std::unordered_map<std::thread::id, std::string>; using thread_dict_t = std::unordered_map<std::thread::id, std::string>;
struct ResultBundle {
profiler::HostTime start_at;
thread_dict_t thread_dict;
options_t options;
std::vector<entry_t> entries;
};
private: private:
std::thread::id m_thread_id; std::thread::id m_thread_id;
std::vector<Record> m_records; std::vector<Record> m_records;
std::vector<std::any> m_duration_stack;
std::atomic<Status> m_status = Running; std::atomic<Status> m_status = Running;
uint64_t m_last_time = 0;
std::string m_thread_name; std::string m_thread_name;
static options_t sm_profile_options; static options_t sm_profile_options;
static std::mutex sm_mutex; static std::mutex sm_mutex;
static std::unordered_map<std::thread::id, Profiler*> sm_profilers; static std::unordered_map<std::thread::id, Profiler*> sm_profilers;
static Timer sm_timer; static Timer sm_timer;
static profiler::HostTime sm_start_at;
static std::atomic_uint64_t sm_last_id; static std::atomic_uint64_t sm_last_id;
static std::atomic_size_t sm_preferred_capacity; static std::atomic_size_t sm_preferred_capacity;
static bool sm_profiling; static bool sm_profiling;
...@@ -100,7 +117,7 @@ public: ...@@ -100,7 +117,7 @@ public:
static void reset() { static void reset() {
mgb_assert(sm_profilers.size() == 0, "profiler already running"); mgb_assert(sm_profilers.size() == 0, "profiler already running");
sm_timer.reset(); sm_start_at = profiler::HostTime::min();
} }
static uint64_t next_id() { static uint64_t next_id() {
...@@ -110,16 +127,13 @@ public: ...@@ -110,16 +127,13 @@ public:
template <typename T, typename... TArgs> template <typename T, typename... TArgs>
static uint64_t record(TArgs&&... args) { static uint64_t record(TArgs&&... args) {
auto& profiler = get_instance(); auto& profiler = get_instance();
auto last_time = profiler.m_last_time;
if constexpr (sm_debug) { if constexpr (sm_debug) {
Status expected = Running; Status expected = Running;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording));
} }
uint64_t id = next_id(); uint64_t id = next_id();
uint64_t time = sm_timer.get_nsecs(); profiler::Time time = sm_timer.record_host();
time = std::max(time, last_time + 2000); profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}});
profiler.m_last_time = time;
profiler.m_records.push_back({id, time, T{std::forward<TArgs>(args)...}});
if constexpr (sm_debug) { if constexpr (sm_debug) {
Status expected = Recording; Status expected = Recording;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running));
...@@ -127,7 +141,8 @@ public: ...@@ -127,7 +141,8 @@ public:
return id; return id;
} }
static results_t collect() { static bundle_t collect() {
bundle_t bundle;
MGB_LOCK_GUARD(sm_mutex); MGB_LOCK_GUARD(sm_mutex);
if constexpr (sm_debug) { if constexpr (sm_debug) {
for (auto&& [tid, profiler]: sm_profilers) { for (auto&& [tid, profiler]: sm_profilers) {
...@@ -136,17 +151,17 @@ public: ...@@ -136,17 +151,17 @@ public:
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting)); mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting));
} }
} }
std::vector<std::pair<std::thread::id, Record>> profile_data; std::vector<entry_t> profile_data;
for (auto&& [tid, profiler]: sm_profilers) { for (auto&& [tid, profiler]: sm_profilers) {
sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size());
for (auto& record: profiler->m_records) { for (auto& record: profiler->m_records) {
profile_data.push_back({tid, std::move(record)}); profile_data.push_back(std::move(record));
} }
profiler->m_records.clear(); profiler->m_records.clear();
profiler->m_records.reserve(sm_preferred_capacity); profiler->m_records.reserve(sm_preferred_capacity);
} }
std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){ std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){
return lhs.second.id < rhs.second.id; return lhs.id < rhs.id;
}); });
if constexpr (sm_debug) { if constexpr (sm_debug) {
for (auto&& [tid, profiler]: sm_profilers) { for (auto&& [tid, profiler]: sm_profilers) {
...@@ -155,7 +170,11 @@ public: ...@@ -155,7 +170,11 @@ public:
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running));
} }
} }
return profile_data; bundle.entries = std::move(profile_data);
bundle.options = get_options();
bundle.start_at = sm_start_at;
bundle.thread_dict = get_thread_dict();
return bundle;
} }
static option_t get_option(std::string key, option_t default_val) { static option_t get_option(std::string key, option_t default_val) {
...@@ -179,6 +198,7 @@ public: ...@@ -179,6 +198,7 @@ public:
static void start_profile() { static void start_profile() {
mgb_assert(!sm_profiling); mgb_assert(!sm_profiling);
sm_start_at = Timer::record_host();
sm_profiling = true; sm_profiling = true;
} }
...@@ -189,7 +209,7 @@ public: ...@@ -189,7 +209,7 @@ public:
static thread_dict_t get_thread_dict(); static thread_dict_t get_thread_dict();
static void dump_profile(std::string basename, std::string format, results_t results, options_t options); static void dump_profile(std::string basename, std::string format, bundle_t result);
}; };
......
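Taken together, the bundle-based flow in this header can be driven roughly as follows (a sketch; "SomeEvent" and the dump format string are illustrative):

    Profiler::start_profile();                 // stamps sm_start_at
    // ... run the workload; each thread calls Profiler::record<SomeEvent>(...)
    Profiler::stop_profile();
    auto bundle = Profiler::collect();         // entries + options
                                               // + start_at + thread_dict
    Profiler::dump_profile("profile", "chrome_timeline.json", std::move(bundle));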
...@@ -19,7 +19,7 @@ target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHO ...@@ -19,7 +19,7 @@ target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHO
target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C)
target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) target_compile_options(imperative_test PRIVATE -Wno-unused-parameter)
set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3) set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3 nlohmann_json::nlohmann_json)
if(MGE_WITH_CUDA) if(MGE_WITH_CUDA)
list(APPEND LINK_LIBS cudart) list(APPEND LINK_LIBS cudart)
......