未验证 提交 72ff5a09 编写于 作者: Z Zhang Ting 提交者: GitHub

fix print bug of profile, test=develop (#22804)

上级 4e8bc024
...@@ -173,7 +173,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -173,7 +173,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::RecordEvent op_type_record_event(Type()); platform::RecordEvent op_type_record_event(Type());
auto op_name = platform::OpName(outputs_, Type()); auto op_name = platform::OpName(outputs_, Type());
platform::RecordEvent op_name_record_event( platform::RecordEvent op_name_record_event(
op_name, platform::RecordRole::kUniqueOp); op_name, platform::EventRole::kUniqueOp);
RunImpl(scope, place); RunImpl(scope, place);
} }
...@@ -957,7 +957,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -957,7 +957,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
Scope* transfer_scope = nullptr; Scope* transfer_scope = nullptr;
{ {
platform::RecordEvent record_event("prepare_data", platform::RecordEvent record_event("prepare_data",
platform::RecordRole::kInnerOp); platform::EventRole::kInnerOp);
transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars, transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars,
runtime_ctx); runtime_ctx);
} }
...@@ -971,7 +971,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -971,7 +971,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
if (!all_kernels_must_compute_runtime_shape_) { if (!all_kernels_must_compute_runtime_shape_) {
platform::RecordEvent record_event("infer_shape", platform::RecordEvent record_event("infer_shape",
platform::RecordRole::kInnerOp); platform::EventRole::kInnerOp);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
} }
...@@ -984,7 +984,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -984,7 +984,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// not Scope. Imperative mode only pass inputs and get outputs. // not Scope. Imperative mode only pass inputs and get outputs.
{ {
platform::RecordEvent record_event("compute", platform::RecordEvent record_event("compute",
platform::RecordRole::kInnerOp); platform::EventRole::kInnerOp);
(*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx, (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
kernel_configs)); kernel_configs));
} }
......
...@@ -25,18 +25,27 @@ namespace platform { ...@@ -25,18 +25,27 @@ namespace platform {
enum class EventType { kMark, kPushRange, kPopRange }; enum class EventType { kMark, kPushRange, kPopRange };
enum class EventRole {
kOrdinary, // only record op time with op type key
kInnerOp, // record op detail time with op type key
kUniqueOp, // record op detail time with op unique name key
};
class Event { class Event {
public: public:
// The DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr. // If CPU profiling mode, can pass nullptr.
Event(EventType type, std::string name, uint32_t thread_id); Event(EventType type, std::string name, uint32_t thread_id,
EventRole role = EventRole::kOrdinary);
const EventType& type() const; const EventType& type() const;
Event* parent() const { return parent_; } Event* parent() const { return parent_; }
void set_parent(Event* parent) { parent_ = parent; } void set_parent(Event* parent) { parent_ = parent; }
std::string name() const { return name_; } std::string name() const { return name_; }
EventRole role() const { return role_; }
uint32_t thread_id() const { return thread_id_; } uint32_t thread_id() const { return thread_id_; }
void set_name(std::string name) { name_ = name; } void set_name(std::string name) { name_ = name; }
void set_role(EventRole role) { role_ = role; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI #ifndef PADDLE_WITH_CUPTI
...@@ -53,6 +62,7 @@ class Event { ...@@ -53,6 +62,7 @@ class Event {
std::string name_{}; std::string name_{};
Event* parent_{nullptr}; Event* parent_{nullptr};
uint32_t thread_id_; uint32_t thread_id_;
EventRole role_{};
int64_t cpu_ns_; int64_t cpu_ns_;
bool visited_status_{false}; bool visited_status_{false};
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -42,8 +42,9 @@ namespace platform { ...@@ -42,8 +42,9 @@ namespace platform {
MemEvenRecorder MemEvenRecorder::recorder; MemEvenRecorder MemEvenRecorder::recorder;
Event::Event(EventType type, std::string name, uint32_t thread_id) Event::Event(EventType type, std::string name, uint32_t thread_id,
: type_(type), name_(name), thread_id_(thread_id) { EventRole role)
: type_(type), name_(name), thread_id_(thread_id), role_(role) {
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
...@@ -62,12 +63,12 @@ double Event::CudaElapsedMs(const Event &e) const { ...@@ -62,12 +63,12 @@ double Event::CudaElapsedMs(const Event &e) const {
#endif #endif
} }
RecordEvent::RecordEvent(const std::string &name, const RecordRole role) RecordEvent::RecordEvent(const std::string &name, const EventRole role)
: is_enabled_(false), start_ns_(PosixInNsec()), role_(role) { : is_enabled_(false), start_ns_(PosixInNsec()), role_(role) {
if (g_state == ProfilerState::kDisabled || name.empty()) return; if (g_state == ProfilerState::kDisabled || name.empty()) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
is_enabled_ = true; is_enabled_ = true;
Event *e = PushEvent(name); Event *e = PushEvent(name, role);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(e); SetCurAnnotation(e);
name_ = e->name(); name_ = e->name();
...@@ -179,8 +180,8 @@ void Mark(const std::string &name) { ...@@ -179,8 +180,8 @@ void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
Event *PushEvent(const std::string &name) { Event *PushEvent(const std::string &name, const EventRole role) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id); return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role);
} }
void PopEvent(const std::string &name) { void PopEvent(const std::string &name) {
......
...@@ -43,12 +43,6 @@ enum class ProfilerState { ...@@ -43,12 +43,6 @@ enum class ProfilerState {
kAll, // Profile both CPU and GPU. (Currently experimental). kAll, // Profile both CPU and GPU. (Currently experimental).
}; };
enum class RecordRole {
kOrdinary, // only record op time with op type key
kInnerOp, // record op detail time with op type key
kUniqueOp, // record op detail time with op unique name key
};
// it is the flag to control to print the profiling result // it is the flag to control to print the profiling result
enum class TracerOption { enum class TracerOption {
kDefault, // print the different op type profiling result kDefault, // print the different op type profiling result
...@@ -86,6 +80,7 @@ struct EventItem { ...@@ -86,6 +80,7 @@ struct EventItem {
double cpu_time; double cpu_time;
double gpu_time; double gpu_time;
float ratio; float ratio;
EventRole role;
}; };
struct OverHead { struct OverHead {
...@@ -128,7 +123,7 @@ struct MemEvenRecorder { ...@@ -128,7 +123,7 @@ struct MemEvenRecorder {
struct RecordEvent { struct RecordEvent {
RecordEvent(const std::string& name, RecordEvent(const std::string& name,
const RecordRole role = RecordRole::kOrdinary); const EventRole role = EventRole::kOrdinary);
~RecordEvent(); ~RecordEvent();
...@@ -139,7 +134,7 @@ struct RecordEvent { ...@@ -139,7 +134,7 @@ struct RecordEvent {
// Need to distinguish name by op type, block_id, program_id and perhaps // Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op. // different kernel invocations within an op.
std::string full_name_; std::string full_name_;
RecordRole role_{RecordRole::kOrdinary}; EventRole role_{EventRole::kOrdinary};
}; };
class RecordRPCEvent { class RecordRPCEvent {
...@@ -201,7 +196,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, ...@@ -201,7 +196,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation); const Place& place, const std::string& annotation);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation); const Place& place, const std::string& annotation);
Event* PushEvent(const std::string& name); Event* PushEvent(const std::string& name, const EventRole role);
void PopEvent(const std::string& name); void PopEvent(const std::string& name);
// Return the event list of all threads. Assumed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
...@@ -304,9 +304,9 @@ void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width, ...@@ -304,9 +304,9 @@ void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width,
if (event_idx->find(event_name) == event_idx->end()) { if (event_idx->find(event_name) == event_idx->end()) {
event_idx->insert({event_name, event_items->size()}); event_idx->insert({event_name, event_items->size()});
EventItem event_item = {event_name, 1, event_time, EventItem event_item = {event_name, 1, event_time, event_time,
event_time, event_time, event_time, event_time, event_time, cpu_time, gpu_time,
cpu_time, gpu_time, 0.}; 0., rit->role()};
event_items->push_back(event_item); event_items->push_back(event_item);
} else { } else {
int index = event_idx->at(event_name); int index = event_idx->at(event_name);
...@@ -335,8 +335,10 @@ void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width, ...@@ -335,8 +335,10 @@ void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width,
void ComputeOverhead(const std::multimap<std::string, EventItem> &sub_child_map, void ComputeOverhead(const std::multimap<std::string, EventItem> &sub_child_map,
OverHead *overhead) { OverHead *overhead) {
EventItem memcpy_async = {"GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f}; EventItem memcpy_async = {
EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f}; "GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f, EventRole::kOrdinary};
EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f,
EventRole::kOrdinary};
for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
if (it->second.name.find("compute") != std::string::npos) { if (it->second.name.find("compute") != std::string::npos) {
overhead->compute_ratio += it->second.ratio; overhead->compute_ratio += it->second.ratio;
...@@ -359,6 +361,29 @@ void ComputeOverhead(const std::multimap<std::string, EventItem> &sub_child_map, ...@@ -359,6 +361,29 @@ void ComputeOverhead(const std::multimap<std::string, EventItem> &sub_child_map,
overhead->sub_memcpy_items = {memcpy_async, memcpy_sync}; overhead->sub_memcpy_items = {memcpy_async, memcpy_sync};
} }
std::string FindOrdinaryParent(
const std::multimap<std::string, EventItem> &sub_child_map,
std::string name) {
bool find_name = false;
std::string parent = name;
EventRole role;
for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
if (it->second.name == name) {
role = it->second.role;
parent = it->first;
find_name = true;
break;
}
}
if (find_name && role == EventRole::kOrdinary) {
return name;
} else if (find_name && role != EventRole::kOrdinary) {
return FindOrdinaryParent(sub_child_map, parent);
} else {
return parent;
}
}
// When TracerOption is KDefault, OpDetail will be recorded but only default // When TracerOption is KDefault, OpDetail will be recorded but only default
// profile result will be printed. // profile result will be printed.
// GpuMemcpy should be printed in kDefault setting, however it offten occurs // GpuMemcpy should be printed in kDefault setting, however it offten occurs
...@@ -376,11 +401,7 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map, ...@@ -376,11 +401,7 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map,
} else { } else {
for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
if (it->second.name.find("GpuMemcpy") != std::string::npos) { if (it->second.name.find("GpuMemcpy") != std::string::npos) {
std::string parent_name = it->first; std::string parent_name = FindOrdinaryParent(sub_child_map, it->first);
auto left_pos = it->first.find("/");
if (left_pos != std::string::npos) {
parent_name = it->first.substr(0, left_pos);
}
auto item = it->second; auto item = it->second;
auto right_pos = item.name.rfind("/"); auto right_pos = item.name.rfind("/");
if (right_pos != std::string::npos) { if (right_pos != std::string::npos) {
...@@ -389,6 +410,9 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map, ...@@ -389,6 +410,9 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map,
item.name = parent_name + "/" + child_name; item.name = parent_name + "/" + child_name;
} }
child_map->insert(std::pair<std::string, EventItem>(parent_name, item)); child_map->insert(std::pair<std::string, EventItem>(parent_name, item));
} else if (it->second.role == EventRole::kOrdinary) {
child_map->insert(
std::pair<std::string, EventItem>(it->first, it->second));
} }
} }
} }
......
...@@ -40,6 +40,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -40,6 +40,7 @@ TEST(RecordEvent, RecordEvent) {
using paddle::platform::PopEvent; using paddle::platform::PopEvent;
using paddle::platform::ProfilerState; using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey; using paddle::platform::EventSortingKey;
using paddle::platform::EventRole;
ProfilerState state = ProfilerState::kCPU; ProfilerState state = ProfilerState::kCPU;
EnableProfiler(state); EnableProfiler(state);
...@@ -55,7 +56,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -55,7 +56,7 @@ TEST(RecordEvent, RecordEvent) {
for (int loop = 0; loop < 3; ++loop) { for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i); std::string name = "op_" + std::to_string(i);
PushEvent(name); PushEvent(name, EventRole::kOrdinary);
int counter = 1; int counter = 1;
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
PopEvent(name); PopEvent(name);
...@@ -107,7 +108,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -107,7 +108,7 @@ TEST(RecordEvent, RecordEvent) {
} }
// Bad Usage: // Bad Usage:
PushEvent("event_without_pop"); PushEvent("event_without_pop", EventRole::kOrdinary);
PopEvent("event_without_push"); PopEvent("event_without_push");
std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents(); std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册