未验证 提交 5b6be4d7 编写于 作者: L liutiexing 提交者: GitHub

Adapt host event recorder to profiler (#37766)

* add align for WorkQueue

* add spinlock

* merge develop

* merge

* Add EventsWaiter

* Revert "Add EventsWaiter"

This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2.

* add os_info

* update

* update

* update

* update

* update

* update for bugfix

* update

* update

* update
Co-authored-by: Nliutiexing <liutiexing@google.com>
上级 dd3afc9d
...@@ -45,6 +45,7 @@ IF(WITH_XBYAK) ...@@ -45,6 +45,7 @@ IF(WITH_XBYAK)
ENDIF() ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
cc_library(os_info SRCS os_info.cc DEPS enforce device_tracer)
IF(WITH_GPU) IF(WITH_GPU)
nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph)
...@@ -165,13 +166,13 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri ...@@ -165,13 +166,13 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU) if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda) nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM) elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else() else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce) cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif() endif()
......
...@@ -22,8 +22,14 @@ limitations under the License. */ ...@@ -22,8 +22,14 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
DECLARE_bool(enable_host_event_recorder_hook);
namespace paddle { namespace paddle {
namespace platform { namespace platform {
// Used only by DeviceTracer
uint64_t GetThreadIdFromSystemThreadId(uint32_t id);
namespace { namespace {
// Tracking the nested block stacks of each thread. // Tracking the nested block stacks of each thread.
#ifdef PADDLE_WITH_SW #ifdef PADDLE_WITH_SW
...@@ -40,7 +46,8 @@ thread_local std::deque<Event *> annotation_stack; ...@@ -40,7 +46,8 @@ thread_local std::deque<Event *> annotation_stack;
static std::deque<Event *> main_thread_annotation_stack{}; static std::deque<Event *> main_thread_annotation_stack{};
static std::deque<std::string> main_thread_annotation_stack_name{}; static std::deque<std::string> main_thread_annotation_stack_name{};
std::map<uint32_t, int32_t> system_thread_id_map; std::map<uint32_t, uint64_t> system_thread_id_map;
std::mutex system_thread_id_map_mutex;
std::once_flag tracer_once_flag; std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr; DeviceTracer *tracer = nullptr;
...@@ -299,6 +306,47 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -299,6 +306,47 @@ class DeviceTracerImpl : public DeviceTracer {
local_correlations_pairs->push_front(std::make_pair(id, event)); local_correlations_pairs->push_front(std::make_pair(id, event));
} }
// Correlates every active-kind record with the host event that encloses it on
// the same thread and registers the pairing via
// AddAnnotation(correlation_id, event).
//
// thr_events maps a thread id to that thread's events. Each per-thread map is
// keyed by the event's end timestamp and stores the Event pointer together
// with the event's start timestamp.
//
// A record is skipped (and reported at VLOG(10)) when its thread has no events,
// when no event ends at or after the record, or when the chosen event starts
// after the record does.
void AddAnnotations(const std::map<uint64_t, ThreadEvents> &thr_events) {
  for (auto &tmp : active_kind_records_) {
    for (const ActiveKindRecord &r : tmp) {
      auto iter = thr_events.find(r.thread_id);
      if (iter == thr_events.end()) {
        VLOG(10) << __func__ << " " << r.name
                 << " Missing tid: " << r.thread_id;
        continue;
      }
      const ThreadEvents &evts = iter->second;
      // First event whose end_ns is >= r.end_ns. lower_bound is required here:
      // upper_bound would return end() when the last event ends exactly at
      // r.end_ns and the matching event would be reported as missing.
      auto evt_iter = evts.lower_bound(r.end_ns);
      if (evt_iter == evts.end()) {
        VLOG(10) << __func__ << " Missing Record " << r.name
                 << " tid: " << r.thread_id << " end_ns: " << r.end_ns;
        continue;
      }
      // Debug aid: show the event that ended just before the record when the
      // chosen event is not an exact end_ns match.
      if (evt_iter->first > r.end_ns && evt_iter != evts.begin()) {
        VLOG(10) << __func__ << " prev end_ns " << std::prev(evt_iter)->first
                 << " end_ns: " << r.end_ns;
      }
      Event *evt = evt_iter->second.first;
      uint64_t start_ns = evt_iter->second.second;
      // The event must also have started no later than the record.
      if (start_ns > r.start_ns) {
        VLOG(10) << __func__ << " Mismatch Record " << r.name
                 << " tid: " << r.thread_id << " start_ns: " << r.start_ns
                 << " end_ns: " << r.end_ns << ", event " << evt->name()
                 << " start_ns: " << start_ns;
        continue;
      }
      VLOG(10) << __func__ << " tid: " << r.thread_id << " Add correlation "
               << r.correlation_id << "<->" << evt->name();
      AddAnnotation(r.correlation_id, evt);
    }
  }
}
void AddCPURecords(const std::string &anno, uint64_t start_ns, void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) { uint64_t end_ns, int64_t device_id, int64_t thread_id) {
if (anno.empty()) { if (anno.empty()) {
...@@ -357,7 +405,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -357,7 +405,7 @@ class DeviceTracerImpl : public DeviceTracer {
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns, void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) { uint64_t thread_id, uint32_t correlation_id) {
if (anno.empty()) { if (anno.empty()) {
VLOG(1) << "Empty timeline annotation."; VLOG(1) << "Empty timeline annotation.";
return; return;
...@@ -524,7 +572,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -524,7 +572,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_detail_info(c->second->attr()); event->set_detail_info(c->second->attr());
find++; find++;
} else { } else {
VLOG(10) << "Missing Kernel Event: " + r.name; VLOG(10) << __func__ << " Missing Kernel Event: " + r.name;
miss++; miss++;
event->set_name(r.name); event->set_name(r.name);
} }
...@@ -533,7 +581,8 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -533,7 +581,8 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.stream_id); event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; VLOG(1) << __func__ << " KernelRecord event miss: " << miss
<< " find: " << find;
for (auto &tmp : cpu_records_) { for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) { for (const CPURecord &r : tmp) {
...@@ -583,7 +632,8 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -583,7 +632,8 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; VLOG(1) << __func__ << " MemRecord event miss: " << miss
<< " find: " << find;
for (auto &tmp : mem_info_record_) { for (auto &tmp : mem_info_record_) {
for (const auto &r : tmp) { for (const auto &r : tmp) {
...@@ -633,6 +683,9 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -633,6 +683,9 @@ class DeviceTracerImpl : public DeviceTracer {
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) { CUpti_CallbackId cbid, const void *cbdata) {
if (LIKELY(FLAGS_enable_host_event_recorder_hook)) {
return;
}
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata); auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata); DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
if (cbInfo->callbackSite == CUPTI_API_ENTER) { if (cbInfo->callbackSite == CUPTI_API_ENTER) {
...@@ -712,6 +765,7 @@ Event *CurAnnotation() { ...@@ -712,6 +765,7 @@ Event *CurAnnotation() {
if (annotation_stack.empty()) return nullptr; if (annotation_stack.empty()) return nullptr;
return annotation_stack.back(); return annotation_stack.back();
} }
std::string CurAnnotationName() { std::string CurAnnotationName() {
if (annotation_stack.empty()) return "Unknown"; if (annotation_stack.empty()) return "Unknown";
return annotation_stack.back()->name(); return annotation_stack.back()->name();
...@@ -730,13 +784,13 @@ uint32_t GetCurSystemThreadId() { ...@@ -730,13 +784,13 @@ uint32_t GetCurSystemThreadId() {
return id; return id;
} }
void RecoreCurThreadId(int32_t id) { void RecoreCurThreadId(uint64_t id) {
std::lock_guard<std::mutex> lock(system_thread_id_map_mutex);
auto gid = GetCurSystemThreadId(); auto gid = GetCurSystemThreadId();
VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id;
system_thread_id_map[gid] = id; system_thread_id_map[gid] = id;
} }
int32_t GetThreadIdFromSystemThreadId(uint32_t id) { uint64_t GetThreadIdFromSystemThreadId(uint32_t id) {
auto it = system_thread_id_map.find(id); auto it = system_thread_id_map.find(id);
if (it != system_thread_id_map.end()) return it->second; if (it != system_thread_id_map.end()) return it->second;
// return origin id if no event is recorded in this thread. // return origin id if no event is recorded in this thread.
......
...@@ -18,8 +18,8 @@ limitations under the License. */ ...@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
namespace paddle { namespace paddle {
...@@ -30,12 +30,6 @@ namespace platform { ...@@ -30,12 +30,6 @@ namespace platform {
////////////////////// //////////////////////
class Event; class Event;
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
// DeviceTracer performs the following tasks: // DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc. // 2. Collect cuda statistics: start/end ts, memory, etc.
...@@ -84,7 +78,7 @@ class DeviceTracer { ...@@ -84,7 +78,7 @@ class DeviceTracer {
uint64_t start_ns; uint64_t start_ns;
uint64_t end_ns; uint64_t end_ns;
int64_t device_id; int64_t device_id;
int64_t thread_id; uint64_t thread_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
...@@ -101,6 +95,9 @@ class DeviceTracer { ...@@ -101,6 +95,9 @@ class DeviceTracer {
// human-readable annotations. // human-readable annotations.
virtual void AddAnnotation(uint32_t id, Event* event) = 0; virtual void AddAnnotation(uint32_t id, Event* event) = 0;
virtual void AddAnnotations(
const std::map<uint64_t, ThreadEvents>& thr_events) = 0;
virtual void AddMemRecords(const std::string& name, uint64_t start_ns, virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t stream_id, uint32_t correlation_id, int64_t stream_id, uint32_t correlation_id,
...@@ -111,7 +108,7 @@ class DeviceTracer { ...@@ -111,7 +108,7 @@ class DeviceTracer {
int64_t thread_id) = 0; int64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns, virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint64_t thread_id,
uint32_t correlation_id) = 0; uint32_t correlation_id) = 0;
virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
...@@ -154,7 +151,6 @@ void ClearCurBlock(); ...@@ -154,7 +151,6 @@ void ClearCurBlock();
int BlockDepth(); int BlockDepth();
// Set current thread id, so we can map the system thread id to thread id. // Set current thread id, so we can map the system thread id to thread id.
void RecoreCurThreadId(int32_t id); void RecoreCurThreadId(uint64_t id);
int32_t GetThreadIdFromSystemThreadId(uint32_t id);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <string> #include <string>
#include <utility>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif #endif
...@@ -48,7 +50,7 @@ class Event { ...@@ -48,7 +50,7 @@ class Event {
void set_parent(Event* parent) { parent_ = parent; } void set_parent(Event* parent) { parent_ = parent; }
std::string name() const { return name_; } std::string name() const { return name_; }
EventRole role() const { return role_; } EventRole role() const { return role_; }
uint32_t thread_id() const { return thread_id_; } uint64_t thread_id() const { return thread_id_; }
void set_name(std::string name) { name_ = name; } void set_name(std::string name) { name_ = name; }
void set_role(EventRole role) { role_ = role; } void set_role(EventRole role) { role_ = role; }
std::string attr() const { return attr_; } std::string attr() const { return attr_; }
...@@ -66,7 +68,7 @@ class Event { ...@@ -66,7 +68,7 @@ class Event {
EventType type_; EventType type_;
std::string name_{}; std::string name_{};
Event* parent_{nullptr}; Event* parent_{nullptr};
uint32_t thread_id_; uint64_t thread_id_;
EventRole role_{}; EventRole role_{};
int64_t cpu_ns_; int64_t cpu_ns_;
bool visited_status_{false}; bool visited_status_{false};
...@@ -88,6 +90,9 @@ class Event { ...@@ -88,6 +90,9 @@ class Event {
#endif #endif
}; };
using EventWithStartNs = std::pair<Event*, uint64_t>;
using ThreadEvents = std::map<uint64_t, EventWithStartNs>;
class MemEvent { class MemEvent {
public: public:
MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes, MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
...@@ -105,7 +110,7 @@ class MemEvent { ...@@ -105,7 +110,7 @@ class MemEvent {
uint64_t end_ns() const { return end_ns_; } uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; } size_t bytes() const { return bytes_; }
Place place() const { return place_; } Place place() const { return place_; }
int64_t thread_id() const { return thread_id_; } uint64_t thread_id() const { return thread_id_; }
const std::string& annotation() const { return annotation_; } const std::string& annotation() const { return annotation_; }
private: private:
...@@ -114,7 +119,7 @@ class MemEvent { ...@@ -114,7 +119,7 @@ class MemEvent {
uint64_t end_ns_ = 0; uint64_t end_ns_ = 0;
size_t bytes_; size_t bytes_;
Place place_; Place place_;
int64_t thread_id_; uint64_t thread_id_;
std::string annotation_; std::string annotation_;
}; };
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/os_info.h"
#include <sstream>
#include "paddle/fluid/platform/device_tracer.h"
namespace paddle {
namespace platform {
// Captures the identifiers of the calling thread and reports the main id to
// the device tracer through RecoreCurThreadId().
ThreadId::ThreadId() {
  const std::thread::id self = std::this_thread::get_id();
  // Hash of the std::thread::id; MainTid() and StdTid() both return it.
  std_tid_ = std::hash<std::thread::id>()(self);
  // The CUPTI id is parsed from the textual form of the std::thread::id and
  // narrowed to 32 bits.
  std::ostringstream text;
  text << self;
  cupti_tid_ = static_cast<uint32_t>(std::stoull(text.str()));
  RecoreCurThreadId(MainTid());  // For DeviceTracer
}
// Releases every ThreadId still stored in id_map_ while holding lock_.
ThreadIdRegistry::~ThreadIdRegistry() {
  std::lock_guard<std::mutex> guard(lock_);
  for (auto it = id_map_.begin(); it != id_map_.end(); ++it) {
    delete it->second;
  }
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mutex>
#include <thread>
#include <unordered_map>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
#ifdef _POSIX_C_SOURCE
#include <time.h>
#endif
namespace paddle {
namespace platform {
// Returns the current wall-clock time in nanoseconds.
// Uses clock_gettime(CLOCK_REALTIME) when _POSIX_C_SOURCE is defined and
// falls back to gettimeofday() (microsecond resolution) otherwise.
inline uint64_t PosixInNsec() {
#ifdef _POSIX_C_SOURCE
  struct timespec tp;
  clock_gettime(CLOCK_REALTIME, &tp);
  // Widen to uint64_t before multiplying so the arithmetic cannot overflow a
  // narrower time_t, matching the gettimeofday() branch below.
  return static_cast<uint64_t>(tp.tv_sec) * 1000000000ULL +
         static_cast<uint64_t>(tp.tv_nsec);
#else
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
#endif
}
// Bundle of the different identifiers kept for a single OS thread.
class ThreadId {
 public:
  ThreadId();

  // Id used as the primary key for the thread; same value as StdTid().
  uint64_t MainTid() const { return std_tid_; }

  // std::hash<std::thread::id> of the thread.
  uint64_t StdTid() const { return std_tid_; }

  // Thread id in the form used by Nvidia CUPTI.
  uint32_t CuptiTid() const { return cupti_tid_; }

  // OS-specific thread id (Linux: gettid).
  uint64_t SysTid() const { return sys_tid_; }

 private:
  uint64_t std_tid_{0};
  uint32_t cupti_tid_{0};
  uint64_t sys_tid_{0};
};
// Registry of ThreadId objects keyed by ThreadId::StdTid(). Access to the map
// is guarded by lock_; the stored ThreadId objects are deleted by the
// destructor.
class ThreadIdRegistry {
 public:
  // singleton
  static ThreadIdRegistry& GetInstance() {
    static ThreadIdRegistry instance;
    return instance;
  }

  // Returns the ThreadId registered under std_id, or nullptr when there is
  // none.
  const ThreadId* GetThreadId(uint64_t std_id) {
    std::lock_guard<std::mutex> lock(lock_);
    // Reuse the iterator from find() instead of hashing the key a second time
    // through operator[].
    auto iter = id_map_.find(std_id);
    if (LIKELY(iter != id_map_.end())) {
      return iter->second;
    }
    return nullptr;
  }

  // Returns the ThreadId of the calling thread, creating and registering it on
  // the first call made from that thread.
  const ThreadId& CurrentThreadId() {
    // Per-thread cache so later calls skip the lock and the map entirely.
    static thread_local ThreadId* tid_ = nullptr;
    if (LIKELY(tid_ != nullptr)) {
      return *tid_;
    }
    tid_ = new ThreadId;
    std::lock_guard<std::mutex> lock(lock_);
    id_map_[tid_->StdTid()] = tid_;
    return *tid_;
  }

 private:
  ThreadIdRegistry() = default;
  DISABLE_COPY_AND_ASSIGN(ThreadIdRegistry);
  ~ThreadIdRegistry();

  std::mutex lock_;
  std::unordered_map<uint64_t, ThreadId*> id_map_;
};
} // namespace platform
} // namespace paddle
...@@ -25,10 +25,14 @@ limitations under the License. */ ...@@ -25,10 +25,14 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h" #include "paddle/fluid/platform/dynload/nvtx.h"
#endif #endif
#include "paddle/fluid/platform/os_info.h"
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
"Enable rpc profiler or not."); "Enable rpc profiler or not.");
DEFINE_bool(enable_host_event_recorder_hook, false,
"enable HostEventRecorder, hook Profiler");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -298,12 +302,8 @@ class HostEventRecorder { ...@@ -298,12 +302,8 @@ class HostEventRecorder {
std::unordered_map<uint64_t, ThreadEventRecorder *> thread_recorders_; std::unordered_map<uint64_t, ThreadEventRecorder *> thread_recorders_;
}; };
static uint64_t GetThreadId() {
return std::hash<std::thread::id>{}(std::this_thread::get_id());
}
ThreadEventRecorder::ThreadEventRecorder() { ThreadEventRecorder::ThreadEventRecorder() {
thread_id_ = GetThreadId(); thread_id_ = ThreadIdRegistry::GetInstance().CurrentThreadId().MainTid();
HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this);
} }
...@@ -352,7 +352,7 @@ RecordEvent::RecordEvent(const char *name, const EventRole role) { ...@@ -352,7 +352,7 @@ RecordEvent::RecordEvent(const char *name, const EventRole role) {
} }
#endif #endif
#endif #endif
if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) {
OriginalConstruct(name, role, "none"); OriginalConstruct(name, role, "none");
return; return;
} }
...@@ -370,7 +370,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { ...@@ -370,7 +370,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
} }
#endif #endif
#endif #endif
if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) {
OriginalConstruct(name, role, "none"); OriginalConstruct(name, role, "none");
return; return;
} }
...@@ -389,7 +389,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, ...@@ -389,7 +389,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role,
} }
#endif #endif
#endif #endif
if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) {
OriginalConstruct(name, role, attr); OriginalConstruct(name, role, attr);
return; return;
} }
...@@ -425,7 +425,7 @@ RecordEvent::~RecordEvent() { ...@@ -425,7 +425,7 @@ RecordEvent::~RecordEvent() {
#endif #endif
#endif #endif
uint64_t end_ns = PosixInNsec(); uint64_t end_ns = PosixInNsec();
if (LIKELY(g_enable_host_event_recorder_hook)) { if (LIKELY(FLAGS_enable_host_event_recorder_hook)) {
if (LIKELY(shallow_copy_name_ != nullptr)) { if (LIKELY(shallow_copy_name_ != nullptr)) {
HostEventRecorder::GetInstance().RecordEvent(shallow_copy_name_, HostEventRecorder::GetInstance().RecordEvent(shallow_copy_name_,
start_ns_, end_ns, role_); start_ns_, end_ns, role_);
...@@ -546,6 +546,11 @@ void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, ...@@ -546,6 +546,11 @@ void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
} }
void Mark(const std::string &name) { void Mark(const std::string &name) {
if (FLAGS_enable_host_event_recorder_hook) {
HostEventRecorder::GetInstance().RecordEvent(name, 0, 0,
EventRole::kOrdinary);
return;
}
GetEventList().Record(EventType::kMark, name, g_thread_id); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
...@@ -598,9 +603,14 @@ void ResetProfiler() { ...@@ -598,9 +603,14 @@ void ResetProfiler() {
} }
} }
static std::map<uint64_t, ThreadEvents> DockHostEventRecorderHostPart();
static void DockHostEventRecorderDevicePart(
const std::map<uint64_t, ThreadEvents> &thr_events);
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string &profile_path) { const std::string &profile_path) {
SynchronizeAllDevice(); SynchronizeAllDevice();
auto thr_events = DockHostEventRecorderHostPart();
MemEvenRecorder::Instance().Flush(); MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
...@@ -612,6 +622,7 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -612,6 +622,7 @@ void DisableProfiler(EventSortingKey sorted_key,
DeviceTracer *tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer->IsEnabled()) { if (tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
DockHostEventRecorderDevicePart(thr_events);
tracer->GenEventKernelCudaElapsedTime(); tracer->GenEventKernelCudaElapsedTime();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
} }
...@@ -634,6 +645,7 @@ void CompleteProfilerEvents(proto::Profile *tracer_profile, ...@@ -634,6 +645,7 @@ void CompleteProfilerEvents(proto::Profile *tracer_profile,
std::vector<std::vector<Event>> *time_events, std::vector<std::vector<Event>> *time_events,
std::vector<std::vector<MemEvent>> *mem_events) { std::vector<std::vector<MemEvent>> *mem_events) {
SynchronizeAllDevice(); SynchronizeAllDevice();
auto thr_events = DockHostEventRecorderHostPart();
MemEvenRecorder::Instance().Flush(); MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
...@@ -645,6 +657,7 @@ void CompleteProfilerEvents(proto::Profile *tracer_profile, ...@@ -645,6 +657,7 @@ void CompleteProfilerEvents(proto::Profile *tracer_profile,
DeviceTracer *tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer->IsEnabled() && tracer_profile != nullptr) { if (tracer->IsEnabled() && tracer_profile != nullptr) {
tracer->Disable(); tracer->Disable();
DockHostEventRecorderDevicePart(thr_events);
tracer->GenEventKernelCudaElapsedTime(); tracer->GenEventKernelCudaElapsedTime();
*tracer_profile = tracer->GetProfile(); *tracer_profile = tracer->GetProfile();
} }
...@@ -719,7 +732,7 @@ void NvprofEnableRecordEvent() { ...@@ -719,7 +732,7 @@ void NvprofEnableRecordEvent() {
void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; } void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; }
void EnableHostEventRecorder() { g_enable_host_event_recorder_hook = true; } void EnableHostEventRecorder() { FLAGS_enable_host_event_recorder_hook = true; }
std::string PrintHostEvents() { std::string PrintHostEvents() {
std::ostringstream oss; std::ostringstream oss;
...@@ -734,5 +747,95 @@ std::string PrintHostEvents() { ...@@ -734,5 +747,95 @@ std::string PrintHostEvents() {
return oss.str(); return oss.str();
} }
// Replays the host events in host_sec as kPushRange/kPopRange records, one new
// EventList per thread section, and fills *out so that
// (*out)[thread_id][end_ns] holds the recorded push Event together with the
// event's start_ns. Each replayed event is named with the names of the events
// found to enclose it, joined by "/".
static void EmulateEventPushAndPop(const HostEventSection &host_sec,
std::map<uint64_t, ThreadEvents> *out) {
for (const auto &thr_sec : host_sec.thr_sections) {
uint64_t tid = thr_sec.thread_id;
// Fresh event list for this thread section, added to the front of
// g_all_event_lists.
auto cur_thr_list = std::make_shared<EventList<Event>>();
g_all_event_lists.emplace_front(cur_thr_list);
// State used to rebuild nesting: evt_stk holds indices of the events that
// currently enclose the one being processed, prefix_stk the matching
// "/"-joined name prefixes (the two stacks are pushed and popped together).
std::stack<size_t> evt_stk;
std::stack<std::string> prefix_stk;
// Index of the events by start_ns; when several events share a start_ns only
// the last index is kept.
std::map<uint64_t, size_t> start2evt;
for (size_t i = 0; i < thr_sec.events.size(); ++i) {
const auto &evt = thr_sec.events[i];
start2evt[evt.start_ns] = i;
}
// Cursor over start2evt in ascending start_ns order; it only moves forward
// across the whole loop below.
auto iter = start2evt.begin();
// Walk the events in their stored order.
// NOTE(review): the forward-only cursor presumably relies on the stored order
// being compatible with ascending start_ns -- confirm against the recorder.
for (size_t i = 0; i < thr_sec.events.size(); ++i) {
const auto &thr_evts = thr_sec.events;
const auto &evt = thr_evts[i];
// Drop stacked events that end no later than the current one: they do not
// enclose it.
while (!evt_stk.empty() && thr_evts[evt_stk.top()].end_ns <= evt.end_ns) {
evt_stk.pop();
prefix_stk.pop();
}
// Consume events that started before the current one; those still running at
// evt.start_ns become enclosing events and extend the name prefix.
while (iter != start2evt.end() &&
thr_evts[iter->second].start_ns < evt.start_ns) {
if (thr_evts[iter->second].end_ns > evt.start_ns) {
evt_stk.push(iter->second);
std::string prefix = thr_evts[iter->second].name;
if (!prefix_stk.empty()) {
prefix = prefix_stk.top() + "/" + prefix;
}
prefix_stk.push(prefix);
}
++iter;
}
// Record the push/pop pair under the fully prefixed name and remember the
// push Event, keyed by the event's end_ns (a later event of the same thread
// with an equal end_ns replaces the entry).
std::string name =
prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name;
// A null attr is recorded as the literal "none".
const char *attr = (evt.attr == nullptr ? "none" : evt.attr);
Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid,
evt.role, attr);
(*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns);
cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr);
}
}
}
static void EmulateCPURecordsAdd(const HostEventSection &host_sec) {
DeviceTracer *tracer = GetDeviceTracer();
if (tracer == nullptr) {
return;
}
for (const auto &thr_sec : host_sec.thr_sections) {
uint64_t tid = thr_sec.thread_id;
for (const auto &evt : thr_sec.events) {
tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(),
tid);
}
}
}
static void EmulateCorrelation(
const std::map<uint64_t, ThreadEvents> &thr_events) {
DeviceTracer *tracer = GetDeviceTracer();
if (tracer == nullptr) {
return;
}
tracer->AddAnnotations(thr_events);
}
// Host-side half of docking HostEventRecorder into the profiler: gathers the
// recorded host events, replays them as push/pop events and as CPU records,
// and returns the per-thread event index built by EmulateEventPushAndPop().
// Returns an empty map when FLAGS_enable_host_event_recorder_hook is off.
static std::map<uint64_t, ThreadEvents> DockHostEventRecorderHostPart() {
  std::map<uint64_t, ThreadEvents> thr_events;
  if (!FLAGS_enable_host_event_recorder_hook) {
    return thr_events;
  }
  auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents();
  EmulateEventPushAndPop(host_evt_sec, &thr_events);
  EmulateCPURecordsAdd(host_evt_sec);
  // Return the local by name: wrapping it in std::move would block copy
  // elision (NRVO) and force a move construction.
  return thr_events;
}
// Device-side half of docking HostEventRecorder into the profiler: correlates
// the given per-thread host events with the device tracer's records. Skipped
// entirely when FLAGS_enable_host_event_recorder_hook is off.
static void DockHostEventRecorderDevicePart(
    const std::map<uint64_t, ThreadEvents> &thr_events) {
  if (FLAGS_enable_host_event_recorder_hook) {
    EmulateCorrelation(thr_events);
  }
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -47,11 +47,9 @@ static TracerOption g_tracer_option = TracerOption::kDefault; ...@@ -47,11 +47,9 @@ static TracerOption g_tracer_option = TracerOption::kDefault;
static ProfilerState g_state = ProfilerState::kDisabled; static ProfilerState g_state = ProfilerState::kDisabled;
// To hook RecordEvent's events, use it to nvtx timeline // To hook RecordEvent's events, use it to nvtx timeline
static bool g_enable_nvprof_hook = false; static bool g_enable_nvprof_hook = false;
// To hook RecordEvent, use HostEventRecorder
static bool g_enable_host_event_recorder_hook = false;
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
// The thread index of each thread // The thread index of each thread
static thread_local int32_t g_thread_id; static thread_local uint64_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and // The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList. // g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0; static uint32_t g_next_thread_id = 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册