未验证 提交 dbfe5333 编写于 作者: W wangchaochaohu 提交者: GitHub

Add pe profiler Event (#24611)

上级 55b664a1
......@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_double(eager_delete_tensor_gb);
......@@ -820,6 +821,8 @@ void ParallelExecutor::BCastParamsToDevices(
FetchResultType ParallelExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
VLOG(3) << "enter ParallelExecutor Run";
platform::RecordEvent parallel_executor_event(
"ParallelExecutor::Run", paddle::platform::EventRole::kSpecial);
#ifdef WITH_GPERFTOOLS
if (gProfileStarted) {
ProfilerFlush();
......
......@@ -40,6 +40,9 @@ namespace {
thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks.
thread_local std::deque<Event *> annotation_stack;
// Stack that stores special events, such as PE (ParallelExecutor) and so on.
static std::deque<Event *> main_thread_annotation_stack{};
static std::deque<std::string> main_thread_annotation_stack_name{};
std::map<uint32_t, int32_t> system_thread_id_map;
......@@ -638,15 +641,49 @@ DeviceTracer *GetDeviceTracer() {
return tracer;
}
void SetCurAnnotation(Event *event) {
if (!annotation_stack.empty()) {
// Registers `event` as the innermost annotation of the calling thread and
// returns the name under which the event should be recorded.
//
// - A non-special event pushed while this thread already has an open
//   annotation becomes a child of that annotation and its name is prefixed
//   with "<parent name>/".
// - If a special event is currently open and was recorded by a different
//   thread than `event`, the returned name is additionally prefixed with the
//   fully qualified name of that special event; otherwise the event's own
//   (possibly parent-prefixed) name is returned.
// - A special event (EventRole::kSpecial) is also pushed onto the shared
//   main-thread stacks together with its fully qualified name.
std::string SetCurAnnotation(Event *event) {
  const bool is_special = (event->role() == EventRole::kSpecial);

  // Nest ordinary events under whatever is currently open on this thread.
  if (!is_special && !annotation_stack.empty()) {
    Event *parent = annotation_stack.back();
    event->set_parent(parent);
    event->set_name(parent->name() + "/" + event->name());
  }
  annotation_stack.push_back(event);

  // `event` is now the top of this thread's stack, so its thread id is the
  // one to compare against the owner of the innermost special event.
  std::string recorded_name;
  const bool has_special = !main_thread_annotation_stack_name.empty();
  if (has_special &&
      main_thread_annotation_stack.back()->thread_id() != event->thread_id()) {
    recorded_name =
        main_thread_annotation_stack_name.back() + "/" + event->name();
  } else {
    recorded_name = event->name();
  }

  // Special events are tracked in the shared stacks, keyed by their fully
  // qualified name (outer special events joined with '/').
  if (is_special) {
    std::string qualified = event->name();
    if (!main_thread_annotation_stack_name.empty()) {
      qualified =
          main_thread_annotation_stack_name.back() + "/" + event->name();
    }
    main_thread_annotation_stack_name.push_back(qualified);
    main_thread_annotation_stack.push_back(event);
  }
  return recorded_name;
}
void ClearCurAnnotation() { annotation_stack.pop_back(); }
void ClearCurAnnotation() {
if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() &&
main_thread_annotation_stack.back()->thread_id() !=
annotation_stack.back()->thread_id()) {
annotation_stack.back()->set_name(main_thread_annotation_stack_name.back() +
"/" + annotation_stack.back()->name());
}
if (!main_thread_annotation_stack.empty() &&
main_thread_annotation_stack.back()->name() ==
annotation_stack.back()->name()) {
main_thread_annotation_stack_name.pop_back();
main_thread_annotation_stack.pop_back();
}
annotation_stack.pop_back();
}
Event *CurAnnotation() {
if (annotation_stack.empty()) return nullptr;
......
......@@ -137,7 +137,7 @@ class DeviceTracer {
DeviceTracer* GetDeviceTracer();
// Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(Event* event);
std::string SetCurAnnotation(Event* event);
// Clear the name after the operation is done.
void ClearCurAnnotation();
// Current name of the operation being run in the thread.
......
......@@ -29,6 +29,7 @@ enum class EventRole {
kOrdinary, // only record op time with op type key
kInnerOp, // record op detail time with op type key
kUniqueOp, // record op detail time with op unique name key
kSpecial, // record event such as PE which is outer of thread local
};
class Event {
......
......@@ -73,8 +73,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
// lock is not needed, the code below is thread-safe
Event *e = PushEvent(name, role);
// Maybe need the same push/pop behavior.
SetCurAnnotation(e);
name_ = e->name();
name_ = SetCurAnnotation(e);
}
RecordEvent::~RecordEvent() {
......@@ -86,7 +85,7 @@ RecordEvent::~RecordEvent() {
BlockDepth(), g_thread_id);
}
ClearCurAnnotation();
PopEvent(name_);
PopEvent(name_, role_);
}
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
......@@ -187,8 +186,8 @@ Event *PushEvent(const std::string &name, const EventRole role) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role);
}
void PopEvent(const std::string &name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id);
// Records a kPopRange event with the given name and role in the event list
// returned by GetEventList(), tagged with g_thread_id.
void PopEvent(const std::string &name, const EventRole role) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, role);
}
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
......
......@@ -197,7 +197,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
Event* PushEvent(const std::string& name, const EventRole role);
void PopEvent(const std::string& name);
void PopEvent(const std::string& name, const EventRole role);
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents();
......
......@@ -22,12 +22,12 @@ limitations under the License. */
#include <memory>
#include <mutex> // NOLINT
#include <random>
#include <set>
#include <stack>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
......@@ -283,7 +283,8 @@ std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
void SetEvent(bool merge_thread, const Event &analyze_event,
size_t *max_name_width, std::list<Event> *pushed_events,
std::vector<EventItem> *event_items,
std::unordered_map<std::string, int> *event_idx) {
std::unordered_map<std::string, int> *event_idx,
const std::set<std::string> &main_thread_event_name) {
if (analyze_event.type() == EventType::kPushRange) {
pushed_events->push_back(analyze_event);
} else if (analyze_event.type() == EventType::kPopRange) {
......@@ -312,10 +313,37 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
std::string event_name;
if (merge_thread) {
event_name = rit->name();
} else {
if (!main_thread_event_name.empty()) {
auto origin_name = rit->name();
int index = 1;
int split_pos = 0;
while ((split_pos = FindNthReversePos(origin_name, '/', index)) !=
-1) {
auto prefix_str = origin_name.substr(0, split_pos);
if (main_thread_event_name.count(prefix_str)) {
break;
}
index++;
}
if (split_pos == -1 && !main_thread_event_name.count(rit->name())) {
event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
rit->name();
} else {
if (!main_thread_event_name.count(rit->name())) {
event_name =
origin_name.substr(0, split_pos + 1) + "thread" +
std::to_string(rit->thread_id()) + "::" +
origin_name.substr(split_pos + 1, origin_name.length() - 1);
} else {
event_name = rit->name();
}
}
} else {
event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
}
}
auto print_name_size = event_name.size();
int found_pos = 0;
if (rit->role() == EventRole::kInnerOp &&
......@@ -608,6 +636,16 @@ void AnalyzeEvent(
std::function<bool(const EventItem &, const EventItem &)> sorted_func,
EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead,
bool merge_thread) {
// In order to deal with special events in the main thread
std::set<std::string> main_thread_event_name;
for (size_t i = 0; i < (*analyze_events).size(); i++) {
for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
Event event = (*analyze_events)[i][j];
if (event.role() == EventRole::kSpecial) {
main_thread_event_name.insert(event.name());
}
}
}
for (size_t i = 0; i < (*analyze_events).size(); i++) {
double total = 0.; // the total time in one thread
std::list<Event> pushed_events;
......@@ -618,8 +656,10 @@ void AnalyzeEvent(
for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
Event analyze_event = (*analyze_events)[i][j];
if (!(analyze_event.role() == EventRole::kSpecial && !merge_thread)) {
SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events,
&event_items, &event_idx);
&event_items, &event_idx, main_thread_event_name);
}
}
auto table_size = event_items.size();
......
......@@ -59,7 +59,7 @@ TEST(RecordEvent, RecordEvent) {
PushEvent(name, EventRole::kOrdinary);
int counter = 1;
while (counter != i * 1000) counter++;
PopEvent(name);
PopEvent(name, EventRole::kOrdinary);
}
}
......@@ -109,7 +109,7 @@ TEST(RecordEvent, RecordEvent) {
// Bad Usage:
PushEvent("event_without_pop", EventRole::kOrdinary);
PopEvent("event_without_push");
PopEvent("event_without_push", EventRole::kOrdinary);
std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
int cuda_startup_count = 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册