提交 defbc20e 编写于 作者: M Megvii Engine Team

fix(profiler): fix profiler in dtr

GitOrigin-RevId: 79b5a6b52615cbda6b4017cc9d34c39d62f09bca
上级 d4c71f92
......@@ -616,7 +616,7 @@ void ChannelImpl::release_tensor(TensorInfo* dest) {
void ChannelImpl::regenerate(TensorInfo* dest) {
if (dest->evict_type == EvictType::DROP) {
auto &&path = dest->producer;
m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest});
m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest, "dtr"});
if (!m_applying) flush_apply_stack();
} else if (dest->evict_type == EvictType::SWAP) {
MGB_RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen);
......@@ -625,7 +625,7 @@ void ChannelImpl::regenerate(TensorInfo* dest) {
}
}
void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
using namespace ranges;
using namespace ranges::views;
auto& state = get_worker_state();
......@@ -689,7 +689,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
}
return outputs;
};
MGB_RECORD_EVENT(OpExecuteEvent, apply_id);
MGB_RECORD_EVENT(OpExecuteEvent, apply_id, {}, reason);
// Begin profiling operator
SmallVector<std::pair<CompNode, uint64_t>> kernels;
if (profiling_device) {
......@@ -769,7 +769,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
}
m_dtr.unpin(cmd.inputs);
}
MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id);
MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id, {}, reason);
// End profiling operator
}
......@@ -777,7 +777,7 @@ void ChannelImpl::flush_apply_stack() {
m_applying = true;
auto& state = get_worker_state();
while (!m_apply_stack.empty()) {
auto& [cmd, idx, recomp] = m_apply_stack.top(); // cmd.inputs[0~idx-1] is in memory
auto& [cmd, idx, recomp, reason] = m_apply_stack.top(); // cmd.inputs[0~idx-1] is in memory
if (idx == 0) {
if (state.options.enable_dtr_auto_drop) {
m_dtr.pin(cmd.inputs);
......@@ -801,10 +801,9 @@ void ChannelImpl::flush_apply_stack() {
}
if (regen) continue;
// the required input tensors are already in memory
auto cmd_backup = cmd;
auto recomp_backup = recomp;
auto [cmd_backup, recomp_backup, reason_backup] = std::make_tuple(cmd, recomp, reason);
m_apply_stack.pop();
do_apply_op(cmd_backup);
do_apply_op(cmd_backup, reason_backup);
if (recomp_backup) {
MGB_RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen);
for (auto o : cmd_backup.outputs) {
......@@ -829,6 +828,7 @@ bool ChannelImpl::auto_evict(size_t force_num) {
sample_on_device(m_dtr.comp_node, false);
auto best = m_dtr.find_best_tensor(state.options.enable_dtr_sqrt_sampling && !force_num);
if (!best) {
MGB_RECORD_EVENT(AutoEvictFinishEvent);
break;
}
if (best->ptr.unique() && best->ptr->blob().unique()) {
......@@ -947,7 +947,9 @@ void ChannelImpl::alloc_tensor_with_evict(Blob* x) {
set_log_level(pre_level);
mgb_log_warn("reallocating all cuda memory to alleviate fragmentation, the performance may be affected");
set_log_level(LogLevel::NO_LOG);
imperative_log_profile_begin("defrag");
BlobManager::inst()->defrag(x->comp_node());
imperative_log_profile_end("defrag");
BlobManager::inst()->alloc_direct(x, x->size());
}
});
......@@ -1025,7 +1027,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
return;
}
}
m_apply_stack.push({cmd, 0, nullptr});
m_apply_stack.push({cmd, 0, nullptr, "cmd"});
flush_apply_stack();
for (size_t i = 0; i < cmd.outputs.size(); ++i) {
auto output = cmd.outputs[i];
......
......@@ -104,8 +104,8 @@ private:
void release_tensor(TensorInfo* dest);
void regenerate(TensorInfo* dest);
void do_apply_op(const ApplyOp& cmd);
void flush_apply_stack();
void do_apply_op(const ApplyOp& cmd, std::string reason);
std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPtr>> init_output_and_workspace(
const OpDef& def,
......@@ -150,7 +150,8 @@ private:
std::exception_ptr m_worker_exc;
std::function<void(std::string, std::string)> m_profile_dump_callback;
size_t m_storage_id = 0;
std::stack<std::tuple<ApplyOp, size_t, TensorInfo*>> m_apply_stack;
// TODO: use explicit struct
std::stack<std::tuple<ApplyOp, size_t, TensorInfo*, std::string>> m_apply_stack;
bool m_applying = false;
bool m_closed = false;
......
......@@ -200,13 +200,13 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
ChromeTraceEvent& new_host_event(std::string name, char ph) {
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time));
};
}
ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) {
using namespace std::literals::chrono_literals;
auto time = since_start(to_device_time(current->time, device));
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time);
};
}
const char* to_cstr(TensorCommandKind kind) {
switch(kind) {
......@@ -241,14 +241,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
new_host_event("OpDispatch", 'E').args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) {
mgb_assert(event.op_id != 0);
current_op->execute_begin = current->time;
new_host_event(current_op->name, 'B');
new_host_event(pid_str, 't')
.cat("OpDispatch")
.id(current_op->id)
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) {
current_op->execute_end = current->time;
new_host_event(current_op->name, 'E')
.args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) {
......
......@@ -84,6 +84,7 @@ DEF_DUR_EVENT(OpOutput, {
DEF_DUR_EVENT(OpExecute, {
uint64_t op_id;
SmallVector<CompNode> device_list;
std::string reason;
});
DEF_DUR_EVENT(KernelLaunch, {
......
......@@ -62,8 +62,13 @@ struct ProfileOperatorState {
CompNode device;
Trace trace;
profiler::HostTime execute_begin;
profiler::HostTime execute_end;
struct Execution {
std::string reason;
profiler::HostTime begin;
profiler::HostTime end;
};
SmallVector<Execution> executions;
nlohmann::json detail() {
nlohmann::json args;
......@@ -285,7 +290,7 @@ public:
op.outputs = event.outputs;
op.trace = event.trace;
for (auto&& output: event.outputs) {
m_tensors.at(output).source = op.id;
m_tensors[output].source = op.id;
}
} else if constexpr (std::is_same_v<T, TensorDeclareEvent>) {
auto& tensor = m_tensors[event.tensor_id];
......@@ -293,7 +298,7 @@ public:
tensor.id = event.tensor_id;
tensor.name = event.name;
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
auto& tensor = m_tensors.at(event.tensor_id);
auto& tensor = m_tensors[event.tensor_id];
if (!m_device_tid_table.count(event.device)) {
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
}
......@@ -308,15 +313,24 @@ public:
using T = std::decay_t<decltype(event)>;
// update current_op/tensor
if constexpr (is_op_event<T>::value) {
current_op = &m_operators.at(event.op_id);
current_op = &m_operators[event.op_id];
if (current_op->id == 0) {
current_op->id = event.op_id;
current_op->name = "UnknownOperator";
}
} else if constexpr (is_tensor_event<T>::value) {
mgb_assert(m_tensors.count(event.tensor_id) != 0, "tensor not found");
current_tensor = &m_tensors.at(event.tensor_id);
current_tensor = &m_tensors[event.tensor_id];
if (current_tensor->id == 0) {
current_tensor->id = event.tensor_id;
current_tensor->name = "UnknownTensor";
}
}
if constexpr (std::is_same_v<T, OpExecuteEvent>) {
current_op->execute_begin = current->time;
current_op->executions.emplace_back();
current_op->executions.back().reason = event.reason;
current_op->executions.back().begin = current->time;
} else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) {
current_op->execute_end = current->time;
current_op->executions.back().end = current->time;
}
// update counters
if constexpr (std::is_same_v<T, OpDispatchEvent>) {
......@@ -337,6 +351,12 @@ public:
}
} else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) {
inc_counter("nr_exception", 1);
} else if constexpr (std::is_same_v<T, KernelLaunchFinishEvent>) {
auto& execution = current_op->executions.back();
if (execution.reason == "dtr") {
auto overhead = to_device_time(current->time, event.device) - to_device_time(execution.begin, event.device);
inc_counter("dtr_overhead_us", std::chrono::duration_cast<std::chrono::microseconds>(overhead).count());
}
}
// visit_event_impl
self.visit_event(event);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册