From 5b6be4d7585ee1c8f142157b6abf89de88be2c5b Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Thu, 16 Dec 2021 11:18:33 +0800 Subject: [PATCH] Adapt host event recorder to profiler (#37766) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. * add os_info * update * update * update * update * update * update for bugfix * update * update * update Co-authored-by: liutiexing --- paddle/fluid/platform/CMakeLists.txt | 7 +- paddle/fluid/platform/device_tracer.cc | 70 ++++++++++++-- paddle/fluid/platform/device_tracer.h | 18 ++-- paddle/fluid/platform/event.h | 13 ++- paddle/fluid/platform/os_info.cc | 38 ++++++++ paddle/fluid/platform/os_info.h | 99 +++++++++++++++++++ paddle/fluid/platform/profiler.cc | 123 ++++++++++++++++++++++-- paddle/fluid/platform/profiler_helper.h | 4 +- 8 files changed, 333 insertions(+), 39 deletions(-) create mode 100644 paddle/fluid/platform/os_info.cc create mode 100644 paddle/fluid/platform/os_info.h diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index d8d41e9d918..728c6af1812 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -45,6 +45,7 @@ IF(WITH_XBYAK) ENDIF() cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) +cc_library(os_info SRCS os_info.cc DEPS enforce device_tracer) IF(WITH_GPU) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) @@ -165,13 +166,13 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce) + cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 8160a06ddea..28c51251627 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -22,8 +22,14 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/device_tracer.h" +DECLARE_bool(enable_host_event_recorder_hook); + namespace paddle { namespace platform { + +// Used only by DeviceTracer +uint64_t GetThreadIdFromSystemThreadId(uint32_t id); + namespace { // Tracking the nested block stacks of each thread. #ifdef PADDLE_WITH_SW @@ -40,7 +46,8 @@ thread_local std::deque annotation_stack; static std::deque main_thread_annotation_stack{}; static std::deque main_thread_annotation_stack_name{}; -std::map system_thread_id_map; +std::map system_thread_id_map; +std::mutex system_thread_id_map_mutex; std::once_flag tracer_once_flag; DeviceTracer *tracer = nullptr; @@ -299,6 +306,47 @@ class DeviceTracerImpl : public DeviceTracer { local_correlations_pairs->push_front(std::make_pair(id, event)); } + void AddAnnotations(const std::map &thr_events) { + for (auto &tmp : active_kind_records_) { + for (const ActiveKindRecord &r : tmp) { + auto iter = thr_events.find(r.thread_id); + if (iter == thr_events.end()) { + VLOG(10) << __func__ << " " << r.name + << " Missing tid: " << r.thread_id; + continue; + } + const ThreadEvents &evts = iter->second; + auto evt_iter = evts.upper_bound(r.end_ns); + if (evt_iter == evts.end()) { + VLOG(10) << __func__ << " Missing Record " << r.name + << " tid: " << r.thread_id << " end_ns: " << r.end_ns; + continue; + } + if (evt_iter != evts.begin()) { + auto prev_iter = std::prev(evt_iter); + if (prev_iter->first >= r.end_ns) { + evt_iter = prev_iter; + } else { + VLOG(10) << __func__ << " prev end_ns " << prev_iter->first + << " end_ns: " << r.end_ns; + } + } + Event *evt = evt_iter->second.first; + uint64_t start_ns = evt_iter->second.second; + if (start_ns > r.start_ns) { + VLOG(10) << __func__ << " Mismatch Record " << r.name + << " tid: " << r.thread_id << " start_ns: " << r.start_ns + << " end_ns: " << r.end_ns << ", event " << evt->name() + << " start_ns: " << start_ns; + continue; + } + VLOG(10) << __func__ << " tid: " << r.thread_id << " Add correlation " + << r.correlation_id << "<->" << evt->name(); + AddAnnotation(r.correlation_id, evt); + } + } + } + void AddCPURecords(const std::string &anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t thread_id) { if (anno.empty()) { @@ -357,7 +405,7 @@ class DeviceTracerImpl : public DeviceTracer { void AddActiveKindRecords(const std::string &anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, - int64_t thread_id, uint32_t correlation_id) { + uint64_t thread_id, uint32_t correlation_id) { if (anno.empty()) { VLOG(1) << "Empty timeline annotation."; return; @@ -524,7 +572,7 @@ class DeviceTracerImpl : public DeviceTracer { event->set_detail_info(c->second->attr()); find++; } else { - VLOG(10) << "Missing Kernel Event: " + r.name; + VLOG(10) << __func__ << " Missing Kernel Event: " + r.name; miss++; event->set_name(r.name); } @@ -533,7 +581,8 @@ class DeviceTracerImpl : public DeviceTracer { event->set_sub_device_id(r.stream_id); event->set_device_id(r.device_id); } - VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; + VLOG(1) << __func__ << " KernelRecord event miss: " << miss + << " find: " << find; for (auto &tmp : cpu_records_) { for (const CPURecord &r : tmp) { @@ -583,7 +632,8 @@ class DeviceTracerImpl : public DeviceTracer { event->set_device_id(r.device_id); event->mutable_memcopy()->set_bytes(r.bytes); } - VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; + VLOG(1) << __func__ << " MemRecord event miss: " << miss + << " find: " << find; for (auto &tmp : mem_info_record_) { for (const auto &r : tmp) { @@ -633,6 +683,9 @@ class DeviceTracerImpl : public DeviceTracer { #ifdef PADDLE_WITH_CUPTI static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void *cbdata) { + if (LIKELY(FLAGS_enable_host_event_recorder_hook)) { + return; + } auto *cbInfo = reinterpret_cast(cbdata); DeviceTracerImpl *tracer = reinterpret_cast(userdata); if (cbInfo->callbackSite == CUPTI_API_ENTER) { @@ -712,6 +765,7 @@ Event *CurAnnotation() { if (annotation_stack.empty()) return nullptr; return annotation_stack.back(); } + std::string CurAnnotationName() { if (annotation_stack.empty()) return "Unknown"; return annotation_stack.back()->name(); @@ -730,13 +784,13 @@ uint32_t GetCurSystemThreadId() { return id; } -void RecoreCurThreadId(int32_t id) { +void RecoreCurThreadId(uint64_t id) { + std::lock_guard lock(system_thread_id_map_mutex); auto gid = GetCurSystemThreadId(); - VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id; system_thread_id_map[gid] = id; } -int32_t GetThreadIdFromSystemThreadId(uint32_t id) { +uint64_t GetThreadIdFromSystemThreadId(uint32_t id) { auto it = system_thread_id_map.find(id); if (it != system_thread_id_map.end()) return it->second; // return origin id if no event is recorded in this thread. diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index ef06d0d609e..9d6e435c845 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -18,8 +18,8 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.pb.h" namespace paddle { @@ -30,12 +30,6 @@ namespace platform { ////////////////////// class Event; -inline uint64_t PosixInNsec() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); -} - // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 2. Collect cuda statistics: start/end ts, memory, etc. @@ -84,7 +78,7 @@ class DeviceTracer { uint64_t start_ns; uint64_t end_ns; int64_t device_id; - int64_t thread_id; + uint64_t thread_id; uint32_t correlation_id; }; @@ -101,6 +95,9 @@ class DeviceTracer { // human-readable annotations. virtual void AddAnnotation(uint32_t id, Event* event) = 0; + virtual void AddAnnotations( + const std::map& thr_events) = 0; + virtual void AddMemRecords(const std::string& name, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t stream_id, uint32_t correlation_id, @@ -111,7 +108,7 @@ class DeviceTracer { int64_t thread_id) = 0; virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, - int64_t thread_id, + uint64_t thread_id, uint32_t correlation_id) = 0; virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, @@ -154,7 +151,6 @@ void ClearCurBlock(); int BlockDepth(); // Set current thread id, so we can map the system thread id to thread id. -void RecoreCurThreadId(int32_t id); -int32_t GetThreadIdFromSystemThreadId(uint32_t id); +void RecoreCurThreadId(uint64_t id); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 136dc2d7252..0d1eee31684 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include +#include #ifdef PADDLE_WITH_CUDA #include #endif @@ -48,7 +50,7 @@ class Event { void set_parent(Event* parent) { parent_ = parent; } std::string name() const { return name_; } EventRole role() const { return role_; } - uint32_t thread_id() const { return thread_id_; } + uint64_t thread_id() const { return thread_id_; } void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } std::string attr() const { return attr_; } @@ -66,7 +68,7 @@ class Event { EventType type_; std::string name_{}; Event* parent_{nullptr}; - uint32_t thread_id_; + uint64_t thread_id_; EventRole role_{}; int64_t cpu_ns_; bool visited_status_{false}; @@ -88,6 +90,9 @@ class Event { #endif }; +using EventWithStartNs = std::pair; +using ThreadEvents = std::map; + class MemEvent { public: MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes, @@ -105,7 +110,7 @@ class MemEvent { uint64_t end_ns() const { return end_ns_; } size_t bytes() const { return bytes_; } Place place() const { return place_; } - int64_t thread_id() const { return thread_id_; } + uint64_t thread_id() const { return thread_id_; } const std::string& annotation() const { return annotation_; } private: @@ -114,7 +119,7 @@ class MemEvent { uint64_t end_ns_ = 0; size_t bytes_; Place place_; - int64_t thread_id_; + uint64_t thread_id_; std::string annotation_; }; diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc new file mode 100644 index 00000000000..9af89645f54 --- /dev/null +++ b/paddle/fluid/platform/os_info.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/os_info.h" +#include +#include "paddle/fluid/platform/device_tracer.h" + +namespace paddle { +namespace platform { + +ThreadId::ThreadId() { + std_tid_ = std::hash()(std::this_thread::get_id()); + std::stringstream ss; + ss << std::this_thread::get_id(); + cupti_tid_ = static_cast(std::stoull(ss.str())); + RecoreCurThreadId(MainTid()); // For DeviceTracer +} + +ThreadIdRegistry::~ThreadIdRegistry() { + std::lock_guard lock(lock_); + for (auto id_pair : id_map_) { + delete id_pair.second; + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h new file mode 100644 index 00000000000..b243429fd5a --- /dev/null +++ b/paddle/fluid/platform/os_info.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/port.h" +#ifdef _POSIX_C_SOURCE +#include +#endif + +namespace paddle { +namespace platform { + +// Get current time in nanoseconds +inline uint64_t PosixInNsec() { +#ifdef _POSIX_C_SOURCE + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec * 1000 * 1000 * 1000 + tp.tv_nsec; +#else + struct timeval tv; + gettimeofday(&tv, nullptr); + return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); +#endif +} + +// All kinds of Ids for OS thread +class ThreadId { + public: + ThreadId(); + + uint64_t MainTid() const { return std_tid_; } + + uint64_t StdTid() const { return std_tid_; } + + uint32_t CuptiTid() const { return cupti_tid_; } + + uint64_t SysTid() const { return sys_tid_; } + + private: + uint64_t std_tid_ = 0; // std::hash + uint32_t cupti_tid_ = 0; // thread_id used by Nvidia CUPTI + uint64_t sys_tid_ = 0; // OS-specific, Linux: gettid +}; + +class ThreadIdRegistry { + public: + // singleton + static ThreadIdRegistry& GetInstance() { + static ThreadIdRegistry instance; + return instance; + } + + const ThreadId* GetThreadId(uint64_t std_id) { + std::lock_guard lock(lock_); + if (LIKELY(id_map_.find(std_id) != id_map_.end())) { + return id_map_[std_id]; + } + return nullptr; + } + + const ThreadId& CurrentThreadId() { + static thread_local ThreadId* tid_ = nullptr; + if (LIKELY(tid_ != nullptr)) { + return *tid_; + } + tid_ = new ThreadId; + std::lock_guard lock(lock_); + id_map_[tid_->StdTid()] = tid_; + return *tid_; + } + + private: + ThreadIdRegistry() = default; + DISABLE_COPY_AND_ASSIGN(ThreadIdRegistry); + ~ThreadIdRegistry(); + + std::mutex lock_; + std::unordered_map id_map_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 476a1929101..1bedd5b1308 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -25,10 +25,14 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/nvtx.h" #endif +#include "paddle/fluid/platform/os_info.h" PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); +DEFINE_bool(enable_host_event_recorder_hook, false, + "enable HostEventRecorder, hook Profiler"); + namespace paddle { namespace platform { @@ -298,12 +302,8 @@ class HostEventRecorder { std::unordered_map thread_recorders_; }; -static uint64_t GetThreadId() { - return std::hash{}(std::this_thread::get_id()); -} - ThreadEventRecorder::ThreadEventRecorder() { - thread_id_ = GetThreadId(); + thread_id_ = ThreadIdRegistry::GetInstance().CurrentThreadId().MainTid(); HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); } @@ -352,7 +352,7 @@ RecordEvent::RecordEvent(const char *name, const EventRole role) { } #endif #endif - if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { OriginalConstruct(name, role, "none"); return; } @@ -370,7 +370,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { } #endif #endif - if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { OriginalConstruct(name, role, "none"); return; } @@ -389,7 +389,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, } #endif #endif - if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { OriginalConstruct(name, role, attr); return; } @@ -425,7 +425,7 @@ RecordEvent::~RecordEvent() { #endif #endif uint64_t end_ns = PosixInNsec(); - if (LIKELY(g_enable_host_event_recorder_hook)) { + if (LIKELY(FLAGS_enable_host_event_recorder_hook)) { if (LIKELY(shallow_copy_name_ != nullptr)) { HostEventRecorder::GetInstance().RecordEvent(shallow_copy_name_, start_ns_, end_ns, role_); @@ -546,6 +546,11 @@ void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, } void Mark(const std::string &name) { + if (FLAGS_enable_host_event_recorder_hook) { + HostEventRecorder::GetInstance().RecordEvent(name, 0, 0, + EventRole::kOrdinary); + return; + } GetEventList().Record(EventType::kMark, name, g_thread_id); } @@ -598,9 +603,14 @@ void ResetProfiler() { } } +static std::map DockHostEventRecorderHostPart(); +static void DockHostEventRecorderDevicePart( + const std::map &thr_events); + void DisableProfiler(EventSortingKey sorted_key, const std::string &profile_path) { SynchronizeAllDevice(); + auto thr_events = DockHostEventRecorderHostPart(); MemEvenRecorder::Instance().Flush(); std::lock_guard l(profiler_mu); @@ -612,6 +622,7 @@ void DisableProfiler(EventSortingKey sorted_key, DeviceTracer *tracer = GetDeviceTracer(); if (tracer->IsEnabled()) { tracer->Disable(); + DockHostEventRecorderDevicePart(thr_events); tracer->GenEventKernelCudaElapsedTime(); tracer->GenProfile(profile_path); } @@ -634,6 +645,7 @@ void CompleteProfilerEvents(proto::Profile *tracer_profile, std::vector> *time_events, std::vector> *mem_events) { SynchronizeAllDevice(); + auto thr_events = DockHostEventRecorderHostPart(); MemEvenRecorder::Instance().Flush(); std::lock_guard l(profiler_mu); @@ -645,6 +657,7 @@ void CompleteProfilerEvents(proto::Profile *tracer_profile, DeviceTracer *tracer = GetDeviceTracer(); if (tracer->IsEnabled() && tracer_profile != nullptr) { tracer->Disable(); + DockHostEventRecorderDevicePart(thr_events); tracer->GenEventKernelCudaElapsedTime(); *tracer_profile = tracer->GetProfile(); } @@ -719,7 +732,7 @@ void NvprofEnableRecordEvent() { void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; } -void EnableHostEventRecorder() { g_enable_host_event_recorder_hook = true; } +void EnableHostEventRecorder() { FLAGS_enable_host_event_recorder_hook = true; } std::string PrintHostEvents() { std::ostringstream oss; @@ -734,5 +747,95 @@ std::string PrintHostEvents() { return oss.str(); } +static void EmulateEventPushAndPop(const HostEventSection &host_sec, + std::map *out) { + for (const auto &thr_sec : host_sec.thr_sections) { + uint64_t tid = thr_sec.thread_id; + auto cur_thr_list = std::make_shared>(); + g_all_event_lists.emplace_front(cur_thr_list); + // for nesting events + std::stack evt_stk; + std::stack prefix_stk; + std::map start2evt; + for (size_t i = 0; i < thr_sec.events.size(); ++i) { + const auto &evt = thr_sec.events[i]; + start2evt[evt.start_ns] = i; + } + auto iter = start2evt.begin(); + // loop events + for (size_t i = 0; i < thr_sec.events.size(); ++i) { + const auto &thr_evts = thr_sec.events; + const auto &evt = thr_evts[i]; + // For nesting events + while (!evt_stk.empty() && thr_evts[evt_stk.top()].end_ns <= evt.end_ns) { + evt_stk.pop(); + prefix_stk.pop(); + } + while (iter != start2evt.end() && + thr_evts[iter->second].start_ns < evt.start_ns) { + if (thr_evts[iter->second].end_ns > evt.start_ns) { + evt_stk.push(iter->second); + std::string prefix = thr_evts[iter->second].name; + if (!prefix_stk.empty()) { + prefix = prefix_stk.top() + "/" + prefix; + } + prefix_stk.push(prefix); + } + ++iter; + } + // Record orig event pair + std::string name = + prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name; + const char *attr = (evt.attr == nullptr ? "none" : evt.attr); + Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid, + evt.role, attr); + (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns); + cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr); + } + } +} + +static void EmulateCPURecordsAdd(const HostEventSection &host_sec) { + DeviceTracer *tracer = GetDeviceTracer(); + if (tracer == nullptr) { + return; + } + for (const auto &thr_sec : host_sec.thr_sections) { + uint64_t tid = thr_sec.thread_id; + for (const auto &evt : thr_sec.events) { + tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(), + tid); + } + } +} + +static void EmulateCorrelation( + const std::map &thr_events) { + DeviceTracer *tracer = GetDeviceTracer(); + if (tracer == nullptr) { + return; + } + tracer->AddAnnotations(thr_events); +} + +static std::map DockHostEventRecorderHostPart() { + std::map thr_events; + if (FLAGS_enable_host_event_recorder_hook == false) { + return thr_events; + } + auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + EmulateEventPushAndPop(host_evt_sec, &thr_events); + EmulateCPURecordsAdd(host_evt_sec); + return std::move(thr_events); +} + +static void DockHostEventRecorderDevicePart( + const std::map &thr_events) { + if (FLAGS_enable_host_event_recorder_hook == false) { + return; + } + EmulateCorrelation(thr_events); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 4277f7d4dc6..c0b7fd417f2 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -47,11 +47,9 @@ static TracerOption g_tracer_option = TracerOption::kDefault; static ProfilerState g_state = ProfilerState::kDisabled; // To hook RecordEvent's events, use it to nvtx timeline static bool g_enable_nvprof_hook = false; -// To hook RecordEvent, use HostEventRecorder -static bool g_enable_host_event_recorder_hook = false; // The thread local event list only can be accessed by the specific thread // The thread index of each thread -static thread_local int32_t g_thread_id; +static thread_local uint64_t g_thread_id; // The g_next_thread_id is a global counter for threads, by the g_thread_id and // g_next_thread_id, we can know how many threads have created EventList. static uint32_t g_next_thread_id = 0; -- GitLab