From d4c71f92dcaf0632eff9e12fd6b12880536c0783 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Fri, 9 Jul 2021 18:17:20 +0800 Subject: [PATCH] perf(profiler): reduce profiler host overhead GitOrigin-RevId: 92cea560f55c23a7de7bbef254cde935a1e91735 --- imperative/python/src/tensor.cpp | 7 +- .../src/impl/interpreter/interpreter_impl.cpp | 8 +- imperative/src/impl/profiler.cpp | 8 +- .../src/impl/profiler/chrome_timeline.cpp | 12 +- imperative/src/impl/profiler/memory_chunk.cpp | 2 +- imperative/src/impl/profiler/states.h | 15 +- .../include/megbrain/imperative/profiler.h | 137 ++++++++++++------ imperative/src/test/profiler.cpp | 10 +- 8 files changed, 126 insertions(+), 73 deletions(-) diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp index 2ec14a372..5247a359b 100644 --- a/imperative/python/src/tensor.cpp +++ b/imperative/python/src/tensor.cpp @@ -1012,9 +1012,10 @@ void init_tensor(py::module m) { interpreter_for_py->stop_profile(); interpreter_for_py->sync(); imperative::Profiler::stop_profile(); - auto results = imperative::Profiler::collect(); - return [results=std::move(results)](std::string basename, std::string format){ - imperative::Profiler::dump_profile(basename, format, results); + auto results = std::make_shared(imperative::Profiler::collect()); + return [results=results](std::string basename, std::string format) mutable { + imperative::Profiler::dump_profile(basename, format, std::move(*results)); + results = nullptr; }; }, py::call_guard()); m.def("sync", diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp index 52e67854b..158e2942e 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.cpp +++ b/imperative/src/impl/interpreter/interpreter_impl.cpp @@ -1133,9 +1133,7 @@ void ChannelImpl::process_one_task(Command& icmd) { } } CompNode::foreach([&](CompNode device){ - if (Profiler::get_option("sample_rate", 0)) { - sample_on_device(device, true); - } + sample_on_device(device, true); MGB_RECORD_EVENT_IF((Profiler::get_option("profile_device", 0)), RecordDeviceEvent, Timer::record_device(device)); }); MGB_RECORD_EVENT(StartProfileFinishEvent); @@ -1149,9 +1147,7 @@ void ChannelImpl::process_one_task(Command& icmd) { MGB_RECORD_EVENT(TensorEraseEvent, info->id); } CompNode::foreach([&](CompNode device){ - if (Profiler::get_option("sample_rate", 0)) { - sample_on_device(device, true); - } + sample_on_device(device, true); }); MGB_RECORD_EVENT(StopProfileFinishEvent); } else if constexpr (std::is_same_v) { diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp index f8c17136f..7941191b8 100644 --- a/imperative/src/impl/profiler.cpp +++ b/imperative/src/impl/profiler.cpp @@ -28,7 +28,7 @@ namespace mgb { namespace imperative { profiler::Time Timer::record_host() { - return std::chrono::high_resolution_clock::now(); + return std::chrono::system_clock::now(); } std::shared_ptr Timer::record_device(CompNode device) { @@ -40,12 +40,12 @@ std::shared_ptr Timer::record_device(CompNode device) { std::vector Profiler::sm_records; Profiler::options_t Profiler::sm_profile_options; std::mutex Profiler::sm_mutex; -std::unordered_map Profiler::sm_profilers; +std::unordered_map> Profiler::sm_profilers; Timer Profiler::sm_timer; profiler::HostTime Profiler::sm_start_at = profiler::HostTime::min(); std::atomic_uint64_t Profiler::sm_last_id = 0; bool Profiler::sm_profiling = false; -thread_local std::unique_ptr Profiler::tm_profiler = std::make_unique(); +thread_local Profiler* Profiler::tm_profiler = nullptr; std::atomic_size_t Profiler::sm_preferred_capacity; auto Profiler::get_thread_dict() -> thread_dict_t { @@ -65,7 +65,7 @@ void Profiler::dump_profile(std::string basename, std::string format, bundle_t r if (iter == format_table.end()) { mgb_log_error("unsupported profiling format %s", format.c_str()); } - return (iter->second)(basename, result); + return (iter->second)(basename, std::move(result)); } } // namespace imperative diff --git a/imperative/src/impl/profiler/chrome_timeline.cpp b/imperative/src/impl/profiler/chrome_timeline.cpp index f1954a967..64c1f43f4 100644 --- a/imperative/src/impl/profiler/chrome_timeline.cpp +++ b/imperative/src/impl/profiler/chrome_timeline.cpp @@ -268,7 +268,7 @@ struct ChromeTimelineEventVisitor: EventVisitor { .cat("Kernel") .args(current_op->detail()); } else if constexpr (std::is_same_v) { - if (current_tensor->living_time != profiler::Duration::zero()) { + if (current_tensor->living_time == profiler::Duration::zero()) { new_host_event(pid_str, 's') .id(event.tensor_id) .cat("TensorLink") @@ -319,8 +319,8 @@ struct ChromeTimelineEventVisitor: EventVisitor { .scope(pid_str); } else if constexpr (std::is_same_v) { std::string device_name = event.device.locator().to_string(); - new_host_event("memory", 'C') - .arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory); + new_host_event(ssprintf("%s_alloc_mem", device_name.c_str()), 'C') + .arg("value", event.total_memory - event.free_memory); } else if constexpr (std::is_same_v) { new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B'); } else if constexpr (std::is_same_v) { @@ -366,6 +366,12 @@ struct ChromeTimelineEventVisitor: EventVisitor { .arg("dtype", event.layout.dtype.name()) .arg("nr_elements", event.layout.total_nr_elems()) .arg("device", event.device.to_string()); + } else if constexpr (std::is_same_v) { + auto current_host_time = current->time; + auto current_device_time = to_device_time(current->time, event.event->comp_node()); + auto device_ahead = std::chrono::duration_cast(current_device_time-current_host_time); + new_host_event("device_ahead_ms", 'C') + .arg("value", device_ahead.count()); } } diff --git a/imperative/src/impl/profiler/memory_chunk.cpp b/imperative/src/impl/profiler/memory_chunk.cpp index a4b666643..69461bf5f 100644 --- a/imperative/src/impl/profiler/memory_chunk.cpp +++ b/imperative/src/impl/profiler/memory_chunk.cpp @@ -261,7 +261,7 @@ struct MemoryFlowVisitor: EventVisitor { void dump_memory_flow(std::string filename, Profiler::bundle_t result) { MemoryFlowVisitor visitor; - visitor.process_events(std::move(result)); + visitor.process_events(result); debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string()); } diff --git a/imperative/src/impl/profiler/states.h b/imperative/src/impl/profiler/states.h index d50898b4d..0c758637f 100644 --- a/imperative/src/impl/profiler/states.h +++ b/imperative/src/impl/profiler/states.h @@ -139,22 +139,22 @@ struct is_trace_event().trace, void())> : std::true_ template class AnyToVariantConverter { public: - using any_t = std::any; + using any_t = AnyPtr; using variant_t = std::variant; private: - std::unordered_map> m_table; + std::unordered_map> m_table; template void register_converter() { - m_table[typeid(TItem)] = [](any_t input) { - return variant_t(std::any_cast(std::move(input))); + m_table[typeid(TItem)] = [](const any_t& input) { + return variant_t(*input.as()); }; } public: AnyToVariantConverter() { (register_converter(), ...); } - variant_t operator()(any_t input) { + variant_t operator()(const any_t& input) { return m_table[input.type()](std::move(input)); } }; @@ -222,7 +222,7 @@ protected: value += delta; } public: - void process_events(Profiler::bundle_t bundle) { + void process_events(Profiler::bundle_t& bundle) { m_start_time = bundle.start_at; auto& self = static_cast(*this); @@ -231,7 +231,7 @@ public: OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent, TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent, TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent, - SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, + SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, @@ -298,6 +298,7 @@ public: m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; } tensor.device = event.device; + tensor.layout = event.layout; } }); diff --git a/imperative/src/include/megbrain/imperative/profiler.h b/imperative/src/include/megbrain/imperative/profiler.h index d897fe7c7..532cb47d0 100644 --- a/imperative/src/include/megbrain/imperative/profiler.h +++ b/imperative/src/include/megbrain/imperative/profiler.h @@ -34,9 +34,10 @@ namespace imperative { namespace profiler { -using HostTime = std::chrono::time_point; +using HostTime = std::chrono::time_point; using Duration = std::chrono::nanoseconds; + using RealDuration = std::chrono::duration; using Time = HostTime; @@ -50,6 +51,52 @@ public: static std::shared_ptr record_device(CompNode device); }; +class AnyPtr { +public: + struct Deleter { + void* object; + void (*method)(void*, void*); + void operator() (void* ptr) { + method(object, ptr); + } + }; +private: + using holder_t = std::unique_ptr; + + const std::type_info* m_type = nullptr; + holder_t m_holder = nullptr; +public: + AnyPtr() = default; + template , AnyPtr>>> + explicit AnyPtr(T* value, Deleter deleter) { + m_type = &typeid(T); + m_holder = {value, deleter}; + } + template + T* as() { + mgb_assert(is_exactly(), "type mismatch"); + return reinterpret_cast(m_holder.get()); + } + template + const T* as() const { + mgb_assert(is_exactly(), "type mismatch"); + return reinterpret_cast(m_holder.get()); + } + template + bool is_exactly() const { + return std::type_index{typeid(T)} == std::type_index{*m_type}; + } + const std::type_info& type() const { + return *m_type; + } + bool operator==(std::nullptr_t nptr) const { + return m_holder == nullptr; + } + operator bool() const { + return m_holder != nullptr; + } +}; + class Profiler { public: @@ -57,7 +104,10 @@ public: uint64_t id; std::thread::id tid; profiler::Time time; - std::any data; + AnyPtr data; + Record() = default; + Record(uint64_t id, std::thread::id tid, profiler::Time time, AnyPtr data): + id{id}, tid{tid}, time{time}, data{std::move(data)} {}; }; enum Status: uint8_t { Running = 0, @@ -82,36 +132,52 @@ private: std::thread::id m_thread_id; std::vector m_records; std::atomic m_status = Running; + std::unordered_map m_mem_pools; static std::vector sm_records; static options_t sm_profile_options; static std::mutex sm_mutex; - static std::unordered_map sm_profilers; + // assume std::thread::id is unique + static std::unordered_map> sm_profilers; static Timer sm_timer; static profiler::HostTime sm_start_at; static std::atomic_uint64_t sm_last_id; static std::atomic_size_t sm_preferred_capacity; static bool sm_profiling; static constexpr bool sm_debug = false; - thread_local static std::unique_ptr tm_profiler; + thread_local static Profiler* tm_profiler; public: - Profiler() { - m_thread_id = std::this_thread::get_id(); - MGB_LOCK_GUARD(sm_mutex); - mgb_assert(sm_profilers.count(m_thread_id) == 0); - sm_profilers[m_thread_id] = this; - } - ~Profiler() { - MGB_LOCK_GUARD(sm_mutex); - mgb_assert(sm_profilers.count(m_thread_id) == 1); - sm_profilers.erase(m_thread_id); - sm_records.insert(sm_records.end(), m_records.begin(), m_records.end()); + explicit Profiler(std::thread::id tid): m_thread_id{tid} { + mgb_assert(tid == std::this_thread::get_id(), "thread id mismatch"); } public: static Profiler& get_instance() { + if (!tm_profiler) { + MGB_LOCK_GUARD(sm_mutex); + auto& profiler = sm_profilers[std::this_thread::get_id()]; + if (!profiler) { + profiler = std::make_unique(std::this_thread::get_id()); + } + tm_profiler = profiler.get(); + } return *tm_profiler; } + template + static MemPool& get_mem_pool() { + thread_local MemPool* t_pool = nullptr; + if (t_pool == nullptr) { + auto& pool = get_instance().m_mem_pools[typeid(MemPool)]; + if (pool == nullptr) { + pool = AnyPtr(new MemPool(), {nullptr, [](void*, void* ptr){ + delete reinterpret_cast*>(ptr); + }}); + } + t_pool = pool.as>(); + } + return *t_pool; + } + static uint64_t next_id() { return sm_last_id++; } @@ -119,13 +185,19 @@ public: template static uint64_t record(TArgs&&... args) { auto& profiler = get_instance(); + auto& mem_pool = get_mem_pool(); if constexpr (sm_debug) { Status expected = Running; mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); } uint64_t id = next_id(); profiler::Time time = sm_timer.record_host(); - profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward(args)...}}); + auto deleter = [](void* obj, void* ptr){ + reinterpret_cast*>(obj)->free(reinterpret_cast(ptr)); + }; + profiler.m_records.emplace_back(id, profiler.m_thread_id, time, AnyPtr{ + mem_pool.alloc(T{std::forward(args)...}), {&mem_pool, deleter} + }); if constexpr (sm_debug) { Status expected = Recording; mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); @@ -146,7 +218,9 @@ public: std::vector profile_data = std::move(sm_records); for (auto&& [tid, profiler]: sm_profilers) { sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); - profile_data.insert(profile_data.end(), profiler->m_records.begin(), profiler->m_records.end()); + profile_data.insert(profile_data.end(), + std::make_move_iterator(profiler->m_records.begin()), + std::make_move_iterator(profiler->m_records.end())); profiler->m_records.clear(); profiler->m_records.reserve(sm_preferred_capacity); } @@ -160,11 +234,11 @@ public: mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); } } - bundle.entries = profile_data; + bundle.entries = std::move(profile_data); bundle.options = get_options(); bundle.start_at = sm_start_at; bundle.thread_dict = get_thread_dict(); - return bundle; + return std::move(bundle); } static option_t get_option(std::string key, option_t default_val) { @@ -203,31 +277,6 @@ public: }; -class ProfileDataCollector { -public: - template - using SubCollector = std::function; -private: - std::unordered_map> m_collectors; -public: - template - ProfileDataCollector& handle(SubCollector collector) { - auto erased = [collector](uint64_t id, std::thread::id tid, uint64_t time, std::any data){ - collector(id, tid, time, std::any_cast(std::move(data))); - }; - m_collectors[typeid(T)] = erased; - return *this; - } - void operator()(uint64_t id, std::thread::id tid, uint64_t time, std::any event) { - std::type_index type = event.type(); - if (m_collectors.count(type) == 0) { - return; - } - auto& handler = m_collectors.at(type); - handler(id, tid, time, std::move(event)); - } -}; - #define MGB_RECORD_EVENT(type, ...) \ if (mgb::imperative::Profiler::is_profiling()) { \ mgb::imperative::Profiler::record(type{__VA_ARGS__}); \ diff --git a/imperative/src/test/profiler.cpp b/imperative/src/test/profiler.cpp index f773b4ee9..c5d2599bf 100644 --- a/imperative/src/test/profiler.cpp +++ b/imperative/src/test/profiler.cpp @@ -25,11 +25,11 @@ TEST(TestProfiler, ImperativeLogProfile) { imperative_log_profile("XXX"); auto results = imperative::Profiler::collect(); imperative::Profiler::stop_profile(); - mgb_assert(results.size() == 2); - auto* event_start = std::any_cast(&results[0].second.data); - auto* event_finish = std::any_cast(&results[1].second.data); + mgb_assert(results.entries.size() == 2); + auto* event_start = results.entries[0].data.as(); + auto* event_finish = results.entries[1].data.as(); mgb_assert(event_start && event_start->title == "XXX"); mgb_assert(event_finish && event_finish->title == "XXX"); - mgb_assert(results[0].second.time < results[1].second.time); - mgb_assert(results[0].second.id < results[1].second.id); + mgb_assert(results.entries[0].time < results.entries[1].time); + mgb_assert(results.entries[0].id < results.entries[1].id); } -- GitLab