#pragma once #include #include #include #include #include #include #include #include "nlohmann/json.hpp" #include "megbrain/tensor.h" #include "./events.h" namespace mgb::imperative::profiler { using StackManager = interpreter::intl::StackManager; struct ProfileTensorState { uint64_t id = 0; std::optional source; TensorLayout layout; CompNode device; std::string name; profiler::HostTime produced = profiler::HostTime::min(); profiler::Duration living_time = profiler::Duration::zero(); size_t size_in_bytes() const { if (!layout.dtype.valid()) { return 0; } return layout.dtype.size(layout.total_nr_elems()); } std::string info(HostTime current_time) { std::string shape = layout.TensorShape::to_string(); std::string dtype = layout.dtype.name(); return ssprintf( "%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.to_string().c_str()); } nlohmann::json detail(HostTime current_time) { nlohmann::json args; args["id"] = id; args["name"] = name; args["shape"] = layout.TensorShape::to_string(); args["dtype"] = layout.dtype.name(); args["nr_elements"] = layout.total_nr_elems(); args["device"] = device.to_string(); if (produced != produced.min()) { double ms_count = std::chrono::duration_cast< std::chrono::duration>( current_time - produced + living_time) .count(); args["living_time"] = ssprintf("%lf ms", ms_count); } return args; } }; struct ProfileOperatorState { uint64_t id = 0; std::string name; OpParams params; SmallVector inputs; SmallVector outputs; CompNode device; Trace trace; struct Execution { std::string reason; profiler::HostTime begin; profiler::HostTime end; }; SmallVector executions; nlohmann::json detail() { nlohmann::json args; for (auto&& [name, value] : params) { args[name] = value; } args["__id__"] = id; args["__name__"] = name; args["__device__"] = device.to_string(); return args; } }; template struct ProfileTensorPropPair { uint64_t id; TProp value; bool operator<(const ProfileTensorPropPair& lhs) const { return value == lhs.value ? id < lhs.id : value < lhs.value; } bool operator==(const ProfileTensorPropPair& lhs) const { return id == lhs.id && value == lhs.value; } bool operator>(const ProfileTensorPropPair& lhs) const { return value == lhs.value ? id > lhs.id : value > lhs.value; } }; using ProfileTensorSizePair = ProfileTensorPropPair; using ProfileTensorProducedPair = ProfileTensorPropPair; struct ProfileState { std::unordered_map tensors; std::unordered_map operators; std::unordered_map tensor_name_counter; std::set tensors_by_size; std::set tensors_by_produced; std::vector top_k_tensor_in_device(CompNode device, size_t k) { std::vector results; for (auto iter = tensors_by_size.rbegin(); iter != tensors_by_size.rend(); ++iter) { if (!k) { break; } if (tensors[iter->id].device == device) { results.push_back(iter->id); --k; } } return results; } }; template struct is_op_event : std::false_type {}; template struct is_op_event().op_id, void())> : std::true_type {}; template struct is_tensor_event : std::false_type {}; template struct is_tensor_event().tensor_id, void())> : std::true_type {}; template struct is_trace_event : std::false_type {}; template struct is_trace_event().trace, void())> : std::true_type {}; template class AnyToVariantConverter { public: using any_t = AnyPtr; using variant_t = std::variant; private: std::unordered_map> m_table; template void register_converter() { m_table[typeid(TItem)] = [](const any_t& input) { return variant_t(input.cast()); }; } public: AnyToVariantConverter() { (register_converter(), ...); } variant_t operator()(const any_t& input) { return m_table[input.type()](std::move(input)); } }; template class EventVisitor { private: std::unordered_map m_operators; std::unordered_map m_tensors; std::unordered_map> m_duration_stack; HostTime m_start_time; CompNode::UnorderedMap m_device_tid_table; std::unordered_map m_host_tid_table; std::unordered_map m_cupti_tid_table; CompNode::UnorderedMap> m_device_timeline; std::unordered_map> m_trace_stack; std::unordered_map m_counter_table; std::optional> m_cupti_timestamp = {}; // Record the start and end time of all kernels std::vector> m_kernel_start_finish_time; // The sum of all kernel execution times profiler::Duration m_gpu_usage_time = std::chrono::microseconds(0); // The sum of all kernel execution times, including the gap time between kernels // within a step profiler::Duration m_gpu_usage_time_with_gap = std::chrono::microseconds(0); // Record the end time of each step std::vector m_step_finish_time; // Record the start time of the first kernel and the end time of the last kernel in // each step std::vector> m_step_first_last_kernel; protected: Profiler::Record* current; ProfileOperatorState* current_op; ProfileTensorState* current_tensor; protected: size_t next_tid() { return m_host_tid_table.size() + m_device_tid_table.size() + m_cupti_tid_table.size(); } profiler::Duration since_start(profiler::HostTime time) { return time - m_start_time; } profiler::HostTime to_device_time(profiler::HostTime time, CompNode device) { auto& device_timeline = m_device_timeline[device]; auto upper = device_timeline.lower_bound(time); if (upper == device_timeline.end()) { if (upper == device_timeline.begin()) { return time; } else { --upper; return time + std::chrono::duration_cast(upper->second); } } else if (upper->first == time) { return time + std::chrono::duration_cast(upper->second); } else if (upper == device_timeline.begin()) { return time + std::chrono::duration_cast(upper->second); } auto lower = upper; --lower; double ratio = ((double)(time - lower->first).count() / (double)(upper->first - lower->first).count()); mgb_assert(ratio > 0 && ratio < 1, "invalid ratio"); mgb_assert( lower->first + lower->second <= upper->first + upper->second, "device time corr"); auto shift = lower->second + ratio * (upper->second - lower->second); auto result = time + std::chrono::duration_cast(shift); return result; } size_t to_tid(std::thread::id host_tid) { return m_host_tid_table.at(host_tid); } size_t to_tid(CompNode device) { return m_device_tid_table.at(device); } size_t to_tid(cupti::stream_t cupti_stream) { return m_cupti_tid_table.at(cupti_stream); } SmallVector host_threads() { SmallVector host_threads; for (auto&& [host, _] : m_host_tid_table) { host_threads.push_back(host); } return host_threads; } SmallVector devices() { SmallVector devices; for (auto&& [device, _] : m_device_tid_table) { devices.push_back(device); } return devices; } void inc_counter(const char* key, int64_t delta) { if (!m_counter_table.count(key)) { m_counter_table[key] = 0; } auto& value = m_counter_table[key]; static_cast(*this).notify_counter(key, value, value + delta); value += delta; } profiler::HostTime time_from_cupti(cupti::time_point timestamp) { mgb_assert(m_cupti_timestamp.has_value()); return m_cupti_timestamp->first + std::chrono::duration_cast( timestamp - m_cupti_timestamp->second); } public: void process_events(Profiler::bundle_t& bundle) { m_start_time = bundle.start_at; auto& self = static_cast(*this); AnyToVariantConverter< OpDispatchEvent, OpExecuteEvent, OpExecuteFinishEvent, KernelLaunchEvent, KernelLaunchFinishEvent, OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent, TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent, TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent, SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, StopStepEvent, TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent, CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent, CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent, CUPTIRuntimeEvent, CUPTIRuntimeFinishEvent, CUPTIDriverEvent, CUPTIDriverFinishEvent, CUPTIMemsetEvent> converter; auto for_each_entry = [&](auto&& handler) { for (auto& entry : bundle.entries) { current = &entry; std::visit(handler, converter(entry.data)); } current = nullptr; }; // build device timeline struct DeviceStartPair { profiler::HostTime host; std::shared_ptr device; }; CompNode::UnorderedMap device_start_table; std::unordered_map cupti_stream_table; // record device time for_each_entry([&](auto&& event) { using T = std::decay_t; if constexpr (std::is_same_v) { using namespace std::chrono_literals; DeviceStartPair& device_start = device_start_table[event.event->comp_node()]; if (!device_start.device) { device_start = {current->time, event.event}; } event.event->host_wait(); auto device_time = (device_start.host - current->time) + std::chrono::duration_cast( device_start.device->elapsed_time_until(*event.event) * 1s); m_device_timeline[event.event->comp_node()][current->time] = device_time; } }); // record step end time for_each_entry([&](auto&& event) { using T = std::decay_t; if constexpr (std::is_same_v) { auto step_time = current->time; m_step_finish_time.push_back(to_device_time(step_time, event.device)); } }); // register host threads for_each_entry([&](auto&& event) { if (!m_host_tid_table.count(current->tid)) { m_host_tid_table[current->tid] = next_tid(); } }); for_each_entry([&](auto&& event) { using T = std::decay_t; if constexpr (std::is_same_v) { auto& op = m_operators[event.op_id]; mgb_assert(op.id == 0, "duplicate operator id"); op.id = event.op_id; op.name = event.op_name; op.params = event.op_params(); op.inputs = event.inputs; op.outputs = event.outputs; op.trace = event.trace; for (auto&& output : event.outputs) { m_tensors[output].source = op.id; } } else if constexpr (std::is_same_v) { auto& tensor = m_tensors[event.tensor_id]; mgb_assert(tensor.id == 0, "duplicated tensor id"); tensor.id = event.tensor_id; tensor.name = event.name; } else if constexpr (std::is_same_v) { auto& tensor = m_tensors[event.tensor_id]; if (!m_device_tid_table.count(event.device)) { m_device_tid_table[event.device] = next_tid(); } tensor.device = event.device; tensor.layout = event.layout; } }); for_each_entry([&](auto&& event) { using T = std::decay_t; if constexpr (std::is_same_v) { if (!m_cupti_tid_table.count(event.stream)) { m_cupti_tid_table[event.stream] = m_device_tid_table.at(event.device); } } }); // record cupti streams for_each_entry([&](auto&& event) { using T = std::decay_t; if constexpr ( std::is_same_v || std::is_same_v || std::is_same_v) { if (!m_cupti_tid_table.count(event.stream)) { m_cupti_tid_table[event.stream] = next_tid(); } } else if constexpr (std::is_same_v) { mgb_assert(!m_cupti_timestamp.has_value()); m_cupti_timestamp.emplace(current->time, event.timestamp); } }); // replay execution using namespace std::placeholders; for_each_entry([&](auto&& event) { using T = std::decay_t; // update current_op/tensor if constexpr (is_op_event::value) { current_op = &m_operators[event.op_id]; if (current_op->id == 0) { current_op->id = event.op_id; current_op->name = "UnknownOperator"; } } else if constexpr (is_tensor_event::value) { current_tensor = &m_tensors[event.tensor_id]; if (current_tensor->id == 0) { current_tensor->id = event.tensor_id; current_tensor->name = "UnknownTensor"; } } if constexpr (std::is_same_v) { current_op->executions.emplace_back(); current_op->executions.back().reason = event.reason; current_op->executions.back().begin = current->time; } else if constexpr (std::is_same_v) { current_op->executions.back().end = current->time; } // update counters if constexpr (std::is_same_v) { inc_counter("nr_op_pending", 1); } else if constexpr (std::is_same_v) { inc_counter("nr_op_pending", -1); } else if constexpr (std::is_same_v) { inc_counter("nr_alive_tensor", 1); } else if constexpr (std::is_same_v) { inc_counter("nr_alive_tensor", -1); } else if constexpr (std::is_same_v) { if (event.use_count == 0) { inc_counter("nr_redunant_tensor", 1); } } else if constexpr (std::is_same_v) { if (!event.success) { inc_counter("nr_shape_infer_failure", 1); } } else if constexpr (std::is_same_v) { inc_counter("nr_exception", 1); } else if constexpr (std::is_same_v) { auto& execution = current_op->executions.back(); auto overhead = to_device_time(current->time, event.device) - to_device_time(execution.begin, event.device); std::vector current_kernel_start_finish; current_kernel_start_finish.emplace_back( to_device_time(execution.begin, event.device)); current_kernel_start_finish.emplace_back( to_device_time(current->time, event.device)); m_kernel_start_finish_time.emplace_back(current_kernel_start_finish); if (execution.reason == "dtr") { inc_counter( "dtr_overhead_us", std::chrono::duration_cast( overhead) .count()); } } // visit_event_impl self.visit_event(event); // reset current_op/tensor if constexpr (is_op_event::value) { current_op = nullptr; } else if constexpr (is_tensor_event::value) { current_tensor = nullptr; } }); } profiler::Duration last_kernel_finish() { if (m_kernel_start_finish_time.size() == 0) { return profiler::Duration::zero(); } return m_kernel_start_finish_time.back()[1] - m_start_time; } // get GPU busy time (union of calculation time and communication time) profiler::Duration gpu_usage_time() { if (m_kernel_start_finish_time.size() == 0) { return profiler::Duration::zero(); } std::sort( m_kernel_start_finish_time.begin(), m_kernel_start_finish_time.end(), [&](std::vector kernel1, std::vector kernel2) { if (kernel1[0] != kernel2[0]) { return kernel1[0] < kernel2[0]; } return kernel1[1] < kernel2[1]; }); HostTime current_start = profiler::HostTime::min(); HostTime current_end = profiler::HostTime::min(); for (size_t i = 0; i < m_kernel_start_finish_time.size(); ++i) { if (current_start == profiler::HostTime::min()) { current_start = m_kernel_start_finish_time[i][0]; current_end = m_kernel_start_finish_time[i][1]; } else if (current_end < m_kernel_start_finish_time[i][0]) { m_gpu_usage_time += profiler::Duration(current_end - current_start); current_start = m_kernel_start_finish_time[i][0]; current_end = m_kernel_start_finish_time[i][1]; } else if (current_end > m_kernel_start_finish_time[i][0]) { current_end = max(current_end, m_kernel_start_finish_time[i][1]); } } m_gpu_usage_time += profiler::Duration(current_end - current_start); return m_gpu_usage_time; } // compared to gpu_usage_time, this method adds gap time between kernels in the same // step profiler::Duration gpu_usage_time_with_gap() { if (m_step_finish_time.empty()) { return profiler::Duration::zero(); } std::sort( m_step_finish_time.begin(), m_step_finish_time.end(), [&](HostTime time1, HostTime time2) { return time1 < time2; }); std::sort( m_kernel_start_finish_time.begin(), m_kernel_start_finish_time.end(), [&](std::vector kernel1, std::vector kernel2) { if (kernel1[0] != kernel2[0]) { return kernel1[0] < kernel2[0]; } return kernel1[1] < kernel2[1]; }); int cur_step = 0; auto kernel_num = m_kernel_start_finish_time.size(); for (size_t i = 0; i < kernel_num; ++i) { // Record the start time of the first kernel and the end time of the last // kernel of the current step std::vector step_begin_end_time; step_begin_end_time.emplace_back(m_kernel_start_finish_time[i][0]); size_t j = i; while (j < kernel_num && m_kernel_start_finish_time[j][0] < m_step_finish_time[cur_step + 1]) { ++j; } step_begin_end_time.emplace_back(m_kernel_start_finish_time[j - 1][1]); mgb_assert( step_begin_end_time.size() == 2, "step_begin_end_time.size() should be exactly 2!"); m_step_first_last_kernel.emplace_back(step_begin_end_time); i = j - 1; ++cur_step; } for (size_t i = 0; i < m_step_first_last_kernel.size(); ++i) { m_gpu_usage_time_with_gap += profiler::Duration( m_step_first_last_kernel[i][1] - m_step_first_last_kernel[i][0]); } return m_gpu_usage_time_with_gap; } // from the start time of the first kernel of the first step to the end time of the // last kernel of the last step std::chrono::microseconds get_total_train_time() { if (m_kernel_start_finish_time.size() == 0) { return std::chrono::microseconds(1); } return std::chrono::duration_cast( m_kernel_start_finish_time.back()[1] - m_kernel_start_finish_time.front()[0]); } }; } // namespace mgb::imperative::profiler