未验证 提交 dbfe5333 编写于 作者: W wangchaochaohu 提交者: GitHub

Add pe profiler Event (#24611)

上级 55b664a1
...@@ -31,6 +31,7 @@ limitations under the License. */ ...@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DECLARE_double(eager_delete_tensor_gb); DECLARE_double(eager_delete_tensor_gb);
...@@ -820,6 +821,8 @@ void ParallelExecutor::BCastParamsToDevices( ...@@ -820,6 +821,8 @@ void ParallelExecutor::BCastParamsToDevices(
FetchResultType ParallelExecutor::Run( FetchResultType ParallelExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) { const std::vector<std::string> &fetch_tensors, bool return_merged) {
VLOG(3) << "enter ParallelExecutor Run"; VLOG(3) << "enter ParallelExecutor Run";
platform::RecordEvent parallel_executor_event(
"ParallelExecutor::Run", paddle::platform::EventRole::kSpecial);
#ifdef WITH_GPERFTOOLS #ifdef WITH_GPERFTOOLS
if (gProfileStarted) { if (gProfileStarted) {
ProfilerFlush(); ProfilerFlush();
......
...@@ -40,6 +40,9 @@ namespace { ...@@ -40,6 +40,9 @@ namespace {
thread_local std::deque<int> block_id_stack; thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks. // Tracking the nested event stacks.
thread_local std::deque<Event *> annotation_stack; thread_local std::deque<Event *> annotation_stack;
// Stack to store special events such as PE and so on
static std::deque<Event *> main_thread_annotation_stack{};
static std::deque<std::string> main_thread_annotation_stack_name{};
std::map<uint32_t, int32_t> system_thread_id_map; std::map<uint32_t, int32_t> system_thread_id_map;
...@@ -638,15 +641,49 @@ DeviceTracer *GetDeviceTracer() { ...@@ -638,15 +641,49 @@ DeviceTracer *GetDeviceTracer() {
return tracer; return tracer;
} }
void SetCurAnnotation(Event *event) { std::string SetCurAnnotation(Event *event) {
if (!annotation_stack.empty()) { std::string ret;
if (!annotation_stack.empty() && event->role() != EventRole::kSpecial) {
event->set_parent(annotation_stack.back()); event->set_parent(annotation_stack.back());
event->set_name(annotation_stack.back()->name() + "/" + event->name()); event->set_name(annotation_stack.back()->name() + "/" + event->name());
} }
annotation_stack.push_back(event); annotation_stack.push_back(event);
if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() &&
main_thread_annotation_stack.back()->thread_id() !=
annotation_stack.back()->thread_id()) {
ret = main_thread_annotation_stack_name.back() + "/" + event->name();
} else {
ret = event->name();
}
if (event->role() == EventRole::kSpecial) {
std::string name = event->name();
if (!main_thread_annotation_stack_name.empty()) {
name = main_thread_annotation_stack_name.back() + "/" + event->name();
}
main_thread_annotation_stack_name.push_back(name);
main_thread_annotation_stack.push_back(event);
}
return ret;
} }
void ClearCurAnnotation() { annotation_stack.pop_back(); } void ClearCurAnnotation() {
if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() &&
main_thread_annotation_stack.back()->thread_id() !=
annotation_stack.back()->thread_id()) {
annotation_stack.back()->set_name(main_thread_annotation_stack_name.back() +
"/" + annotation_stack.back()->name());
}
if (!main_thread_annotation_stack.empty() &&
main_thread_annotation_stack.back()->name() ==
annotation_stack.back()->name()) {
main_thread_annotation_stack_name.pop_back();
main_thread_annotation_stack.pop_back();
}
annotation_stack.pop_back();
}
Event *CurAnnotation() { Event *CurAnnotation() {
if (annotation_stack.empty()) return nullptr; if (annotation_stack.empty()) return nullptr;
......
...@@ -137,7 +137,7 @@ class DeviceTracer { ...@@ -137,7 +137,7 @@ class DeviceTracer {
DeviceTracer* GetDeviceTracer(); DeviceTracer* GetDeviceTracer();
// Set a name for the cuda kernel operation being launched by the thread. // Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(Event* event); std::string SetCurAnnotation(Event* event);
// Clear the name after the operation is done. // Clear the name after the operation is done.
void ClearCurAnnotation(); void ClearCurAnnotation();
// Current name of the operation being run in the thread. // Current name of the operation being run in the thread.
......
...@@ -29,6 +29,7 @@ enum class EventRole { ...@@ -29,6 +29,7 @@ enum class EventRole {
kOrdinary, // only record op time with op type key kOrdinary, // only record op time with op type key
kInnerOp, // record op detail time with op type key kInnerOp, // record op detail time with op type key
kUniqueOp, // record op detail time with op unique name key kUniqueOp, // record op detail time with op unique name key
kSpecial, // record event such as PE which is outer of thread local
}; };
class Event { class Event {
......
...@@ -73,8 +73,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { ...@@ -73,8 +73,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
Event *e = PushEvent(name, role); Event *e = PushEvent(name, role);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(e); name_ = SetCurAnnotation(e);
name_ = e->name();
} }
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
...@@ -86,7 +85,7 @@ RecordEvent::~RecordEvent() { ...@@ -86,7 +85,7 @@ RecordEvent::~RecordEvent() {
BlockDepth(), g_thread_id); BlockDepth(), g_thread_id);
} }
ClearCurAnnotation(); ClearCurAnnotation();
PopEvent(name_); PopEvent(name_, role_);
} }
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
...@@ -187,8 +186,8 @@ Event *PushEvent(const std::string &name, const EventRole role) { ...@@ -187,8 +186,8 @@ Event *PushEvent(const std::string &name, const EventRole role) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role); return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role);
} }
void PopEvent(const std::string &name) { void PopEvent(const std::string &name, const EventRole role) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id); GetEventList().Record(EventType::kPopRange, name, g_thread_id, role);
} }
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
......
...@@ -197,7 +197,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, ...@@ -197,7 +197,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation); const Place& place, const std::string& annotation);
Event* PushEvent(const std::string& name, const EventRole role); Event* PushEvent(const std::string& name, const EventRole role);
void PopEvent(const std::string& name); void PopEvent(const std::string& name, const EventRole role);
// Return the event list of all threads. Assumed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents(); std::vector<std::vector<Event>> GetAllEvents();
......
...@@ -22,12 +22,12 @@ limitations under the License. */ ...@@ -22,12 +22,12 @@ limitations under the License. */
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <random> #include <random>
#include <set>
#include <stack> #include <stack>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -283,7 +283,8 @@ std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc( ...@@ -283,7 +283,8 @@ std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
void SetEvent(bool merge_thread, const Event &analyze_event, void SetEvent(bool merge_thread, const Event &analyze_event,
size_t *max_name_width, std::list<Event> *pushed_events, size_t *max_name_width, std::list<Event> *pushed_events,
std::vector<EventItem> *event_items, std::vector<EventItem> *event_items,
std::unordered_map<std::string, int> *event_idx) { std::unordered_map<std::string, int> *event_idx,
const std::set<std::string> &main_thread_event_name) {
if (analyze_event.type() == EventType::kPushRange) { if (analyze_event.type() == EventType::kPushRange) {
pushed_events->push_back(analyze_event); pushed_events->push_back(analyze_event);
} else if (analyze_event.type() == EventType::kPopRange) { } else if (analyze_event.type() == EventType::kPopRange) {
...@@ -313,8 +314,35 @@ void SetEvent(bool merge_thread, const Event &analyze_event, ...@@ -313,8 +314,35 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
if (merge_thread) { if (merge_thread) {
event_name = rit->name(); event_name = rit->name();
} else { } else {
event_name = if (!main_thread_event_name.empty()) {
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); auto origin_name = rit->name();
int index = 1;
int split_pos = 0;
while ((split_pos = FindNthReversePos(origin_name, '/', index)) !=
-1) {
auto prefix_str = origin_name.substr(0, split_pos);
if (main_thread_event_name.count(prefix_str)) {
break;
}
index++;
}
if (split_pos == -1 && !main_thread_event_name.count(rit->name())) {
event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
rit->name();
} else {
if (!main_thread_event_name.count(rit->name())) {
event_name =
origin_name.substr(0, split_pos + 1) + "thread" +
std::to_string(rit->thread_id()) + "::" +
origin_name.substr(split_pos + 1, origin_name.length() - 1);
} else {
event_name = rit->name();
}
}
} else {
event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
}
} }
auto print_name_size = event_name.size(); auto print_name_size = event_name.size();
int found_pos = 0; int found_pos = 0;
...@@ -608,6 +636,16 @@ void AnalyzeEvent( ...@@ -608,6 +636,16 @@ void AnalyzeEvent(
std::function<bool(const EventItem &, const EventItem &)> sorted_func, std::function<bool(const EventItem &, const EventItem &)> sorted_func,
EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead, EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead,
bool merge_thread) { bool merge_thread) {
// In order to deal with special events in the main thread
std::set<std::string> main_thread_event_name;
for (size_t i = 0; i < (*analyze_events).size(); i++) {
for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
Event event = (*analyze_events)[i][j];
if (event.role() == EventRole::kSpecial) {
main_thread_event_name.insert(event.name());
}
}
}
for (size_t i = 0; i < (*analyze_events).size(); i++) { for (size_t i = 0; i < (*analyze_events).size(); i++) {
double total = 0.; // the total time in one thread double total = 0.; // the total time in one thread
std::list<Event> pushed_events; std::list<Event> pushed_events;
...@@ -618,8 +656,10 @@ void AnalyzeEvent( ...@@ -618,8 +656,10 @@ void AnalyzeEvent(
for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
Event analyze_event = (*analyze_events)[i][j]; Event analyze_event = (*analyze_events)[i][j];
SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events, if (!(analyze_event.role() == EventRole::kSpecial && !merge_thread)) {
&event_items, &event_idx); SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events,
&event_items, &event_idx, main_thread_event_name);
}
} }
auto table_size = event_items.size(); auto table_size = event_items.size();
......
...@@ -59,7 +59,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -59,7 +59,7 @@ TEST(RecordEvent, RecordEvent) {
PushEvent(name, EventRole::kOrdinary); PushEvent(name, EventRole::kOrdinary);
int counter = 1; int counter = 1;
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
PopEvent(name); PopEvent(name, EventRole::kOrdinary);
} }
} }
...@@ -109,7 +109,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -109,7 +109,7 @@ TEST(RecordEvent, RecordEvent) {
// Bad Usage: // Bad Usage:
PushEvent("event_without_pop", EventRole::kOrdinary); PushEvent("event_without_pop", EventRole::kOrdinary);
PopEvent("event_without_push"); PopEvent("event_without_push", EventRole::kOrdinary);
std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents(); std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
int cuda_startup_count = 0; int cuda_startup_count = 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册