/** * \file imperative/src/impl/profiler.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ #include "megbrain/imperative/profiler.h" #include "./function_hook.h" #include "megbrain/imperative/ops/opr_attr.h" #include "megbrain/imperative/physical_tensor.h" #include "megbrain/plugin/opr_footprint.h" #include "./event_pool.h" #include "./op_trait.h" namespace mgb { namespace imperative { namespace { CompNode::UnorderedSet collect_comp_nodes( const OpDef& def, const SmallVector& inputs) { CompNode::UnorderedSet comp_nodes; SmallVector inp_descs; for (auto&& i : inputs) { comp_nodes.insert(i->comp_node()); inp_descs.push_back({i->layout(), i->comp_node(), {}}); } SmallVector oup_descs = std::get<0>(def.infer_output_attrs_fallible(def, inp_descs)); for (auto&& output_attr : oup_descs) { comp_nodes.insert(output_attr.comp_node); } return comp_nodes; } DeviceTimer::SharedEvent alloc_recorded_event(CompNode device) { auto event = EventPool::with_timer().alloc_shared(device); event->record(); return event; } OprFootprint footprint{}; } // namespace void DeviceTimer::reset(thin_function host_timer) { CompNode::foreach ([this, host_timer](CompNode device) { m_base_event_table[device] = {alloc_recorded_event(device), host_timer()}; }); m_host_timer = host_timer; } thin_function DeviceTimer::get_device_time(CompNode device) { auto event = EventPool::with_timer().alloc_shared(device); event->record(); if(m_base_event_table.count(device) == 0) { m_base_event_table[device] = {alloc_recorded_event(device), m_host_timer()}; } auto base = m_base_event_table[device]; return [base, event] { auto [base_event, host_time] = base; // TODO: sync once for each compnode event->host_wait(); return base_event->elapsed_time_until(*event) * 1000 + host_time; }; } void DeviceTimer::clear() { m_base_event_table.clear(); } size_t TensorRecorder::record_tensor(const TensorPtr& tensor) { if (m_tensor_map.count(tensor.get()) > 0) { auto& [prev, id] = m_tensor_map[tensor.get()]; if (prev.lock() != tensor) { prev = tensor; id = m_next_id++; } return id; } else { auto id = m_next_id++; m_tensor_map.insert( {tensor.get(), {std::weak_ptr{tensor}, id}}); return id; } } void TensorRecorder::clear() { m_next_id = 0; m_tensor_map.clear(); } Profile& Profiler::get_profile() { for (auto& entry : m_profile) { for (auto& [device, device_begin, device_end] : entry.device_list) { MGB_MARK_USED_VAR(device); device_begin = [value = device_begin()] { return value; }; device_end = [value = device_end()] { return value; }; } } return m_profile; } void Profiler::start(uint32_t flags) { m_host_timer.reset(); m_device_timer.reset([&] { return m_host_timer.get_msecs(); }); OpTrait::for_each_trait([this, flags](OpTrait& trait) { auto hook_apply_on_physical_tensor = make_shared_hook(&trait.apply_on_physical_tensor); auto hook_apply_on_var_node = make_shared_hook(&trait.apply_on_var_node); hook_apply_on_physical_tensor->apply_hook([this, flags] (auto&& apply, const OpDef& def, SmallVector inputs) { auto shape2vector = [](const TensorShape& shape) { std::vector vector_shape; for (size_t i = 0; i < shape.ndim; i++) { vector_shape.push_back(shape[i]); } return vector_shape; }; ProfileEntry entry; entry.id = m_entry_count++; // TODO: assign parent entry.parent = 0; // Record apply context and save to m_profile entry.op = const_cast(def).shared_from_this(); for (auto&& input : inputs) { entry.inputs.push_back({m_tensor_recorder.record_tensor(input), shape2vector(input->layout()), input->comp_node()}); } double host_begin = m_host_timer.get_msecs(); auto&& comp_nodes = collect_comp_nodes(def, inputs); for (auto&& comp_node : comp_nodes) { entry.device_list.push_back( {comp_node, m_device_timer.get_device_time(comp_node), {}}); } if (flags & PROFILE_FOOTPRINT) { MGB_LOCK_GUARD(m_lock); m_entry_stack.push({&def, &entry, std::this_thread::get_id()}); } // Do real apply auto outputs = apply(def, inputs); for (auto& [cn, dev_begin, dev_end] : entry.device_list) { MGB_MARK_USED_VAR(cn); MGB_MARK_USED_VAR(dev_begin); dev_end = m_device_timer.get_device_time(cn); } entry.host = {host_begin, m_host_timer.get_msecs()}; for (auto&& output : outputs) { entry.outputs.push_back( {m_tensor_recorder.record_tensor(output), shape2vector(output->layout()), output->comp_node()}); } if (flags & PROFILE_FOOTPRINT) { mgb_assert(std::get<1>(m_entry_stack.top()) == &entry); MGB_LOCK_GUARD(m_lock); m_entry_stack.pop(); } m_profile.push_back(std::move(entry)); return outputs; }); if (flags & PROFILE_FOOTPRINT) { hook_apply_on_var_node->apply_hook( [this](auto&& apply, const OpDef& def, VarNodeArray inputs) -> VarNodeArray { auto vars = apply(def, std::move(inputs)); std::remove_reference_t top; { MGB_LOCK_GUARD(m_lock); if (m_entry_stack.empty()) { return vars; } top = m_entry_stack.top(); } auto [current_op, current_entry, thread_id] = top; if (current_op != &def || thread_id != std::this_thread::get_id()) { return vars; } auto&& footprint_result = footprint.calc_footprint(vars[0]->owner_opr()); current_entry->memory = footprint_result.memory; current_entry->computation = footprint_result.computation; #if MGB_ENABLE_JSON current_entry->param = footprint_result.param; #endif return vars; }); } m_hooker_list.push_back(std::move(hook_apply_on_physical_tensor)); m_hooker_list.push_back(std::move(hook_apply_on_var_node)); }); } void Profiler::stop() { m_hooker_list.clear(); for (auto& entry : m_profile) { entry.wait_device(); } } void Profiler::clear() { mgb_assert(m_entry_stack.empty(), "entry_stack should be empty after profile"); mgb_assert(m_hooker_list.empty(), "hooks should be released"); m_profile.clear(); m_entry_count = 0; m_device_timer.clear(); m_tensor_recorder.clear(); } } // namespace imperative } // namespace mgb