未验证 提交 72ff5a09 编写于 作者: Z Zhang Ting 提交者: GitHub

fix print bug of profile, test=develop (#22804)

上级 4e8bc024
......@@ -173,7 +173,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::RecordEvent op_type_record_event(Type());
auto op_name = platform::OpName(outputs_, Type());
platform::RecordEvent op_name_record_event(
op_name, platform::RecordRole::kUniqueOp);
op_name, platform::EventRole::kUniqueOp);
RunImpl(scope, place);
}
......@@ -957,7 +957,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
Scope* transfer_scope = nullptr;
{
platform::RecordEvent record_event("prepare_data",
platform::RecordRole::kInnerOp);
platform::EventRole::kInnerOp);
transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars,
runtime_ctx);
}
......@@ -971,7 +971,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
if (!all_kernels_must_compute_runtime_shape_) {
platform::RecordEvent record_event("infer_shape",
platform::RecordRole::kInnerOp);
platform::EventRole::kInnerOp);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
this->InferShape(&infer_shape_ctx);
}
......@@ -984,7 +984,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// not Scope. Imperative mode only pass inputs and get outputs.
{
platform::RecordEvent record_event("compute",
platform::RecordRole::kInnerOp);
platform::EventRole::kInnerOp);
(*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
kernel_configs));
}
......
......@@ -25,18 +25,27 @@ namespace platform {
enum class EventType { kMark, kPushRange, kPopRange };
enum class EventRole {
kOrdinary, // only record op time with op type key
kInnerOp, // record op detail time with op type key
kUniqueOp, // record op detail time with op unique name key
};
class Event {
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventType type, std::string name, uint32_t thread_id);
Event(EventType type, std::string name, uint32_t thread_id,
EventRole role = EventRole::kOrdinary);
const EventType& type() const;
Event* parent() const { return parent_; }
void set_parent(Event* parent) { parent_ = parent; }
std::string name() const { return name_; }
EventRole role() const { return role_; }
uint32_t thread_id() const { return thread_id_; }
void set_name(std::string name) { name_ = name; }
void set_role(EventRole role) { role_ = role; }
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
......@@ -53,6 +62,7 @@ class Event {
std::string name_{};
Event* parent_{nullptr};
uint32_t thread_id_;
EventRole role_{};
int64_t cpu_ns_;
bool visited_status_{false};
#ifdef PADDLE_WITH_CUDA
......
......@@ -42,8 +42,9 @@ namespace platform {
MemEvenRecorder MemEvenRecorder::recorder;
Event::Event(EventType type, std::string name, uint32_t thread_id)
: type_(type), name_(name), thread_id_(thread_id) {
Event::Event(EventType type, std::string name, uint32_t thread_id,
EventRole role)
: type_(type), name_(name), thread_id_(thread_id), role_(role) {
cpu_ns_ = GetTimeInNsec();
}
......@@ -62,12 +63,12 @@ double Event::CudaElapsedMs(const Event &e) const {
#endif
}
RecordEvent::RecordEvent(const std::string &name, const RecordRole role)
RecordEvent::RecordEvent(const std::string &name, const EventRole role)
: is_enabled_(false), start_ns_(PosixInNsec()), role_(role) {
if (g_state == ProfilerState::kDisabled || name.empty()) return;
// lock is not needed, the code below is thread-safe
is_enabled_ = true;
Event *e = PushEvent(name);
Event *e = PushEvent(name, role);
// Maybe need the same push/pop behavior.
SetCurAnnotation(e);
name_ = e->name();
......@@ -179,8 +180,8 @@ void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id);
}
Event *PushEvent(const std::string &name) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
Event *PushEvent(const std::string &name, const EventRole role) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role);
}
void PopEvent(const std::string &name) {
......
......@@ -43,12 +43,6 @@ enum class ProfilerState {
kAll, // Profile both CPU and GPU. (Currently experimental).
};
enum class RecordRole {
kOrdinary, // only record op time with op type key
kInnerOp, // record op detail time with op type key
kUniqueOp, // record op detail time with op unique name key
};
// it is the flag to control to print the profiling result
enum class TracerOption {
kDefault, // print the different op type profiling result
......@@ -86,6 +80,7 @@ struct EventItem {
double cpu_time;
double gpu_time;
float ratio;
EventRole role;
};
struct OverHead {
......@@ -128,7 +123,7 @@ struct MemEvenRecorder {
struct RecordEvent {
RecordEvent(const std::string& name,
const RecordRole role = RecordRole::kOrdinary);
const EventRole role = EventRole::kOrdinary);
~RecordEvent();
......@@ -139,7 +134,7 @@ struct RecordEvent {
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
std::string full_name_;
RecordRole role_{RecordRole::kOrdinary};
EventRole role_{EventRole::kOrdinary};
};
class RecordRPCEvent {
......@@ -201,7 +196,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
Event* PushEvent(const std::string& name);
Event* PushEvent(const std::string& name, const EventRole role);
void PopEvent(const std::string& name);
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
......@@ -304,9 +304,9 @@ void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width,
if (event_idx->find(event_name) == event_idx->end()) {
event_idx->insert({event_name, event_items->size()});
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time,
cpu_time, gpu_time, 0.};
EventItem event_item = {event_name, 1, event_time, event_time,
event_time, event_time, cpu_time, gpu_time,
0., rit->role()};
event_items->push_back(event_item);
} else {
int index = event_idx->at(event_name);
......@@ -335,8 +335,10 @@ void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width,
void ComputeOverhead(const std::multimap<std::string, EventItem> &sub_child_map,
OverHead *overhead) {
EventItem memcpy_async = {"GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f};
EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f};
EventItem memcpy_async = {
"GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f, EventRole::kOrdinary};
EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f,
EventRole::kOrdinary};
for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
if (it->second.name.find("compute") != std::string::npos) {
overhead->compute_ratio += it->second.ratio;
......@@ -359,6 +361,29 @@ void ComputeOverhead(const std::multimap<std::string, EventItem> &sub_child_map,
overhead->sub_memcpy_items = {memcpy_async, memcpy_sync};
}
std::string FindOrdinaryParent(
const std::multimap<std::string, EventItem> &sub_child_map,
std::string name) {
bool find_name = false;
std::string parent = name;
EventRole role;
for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
if (it->second.name == name) {
role = it->second.role;
parent = it->first;
find_name = true;
break;
}
}
if (find_name && role == EventRole::kOrdinary) {
return name;
} else if (find_name && role != EventRole::kOrdinary) {
return FindOrdinaryParent(sub_child_map, parent);
} else {
return parent;
}
}
// When TracerOption is KDefault, OpDetail will be recorded but only default
// profile result will be printed.
// GpuMemcpy should be printed in kDefault setting, however it offten occurs
......@@ -376,11 +401,7 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map,
} else {
for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
if (it->second.name.find("GpuMemcpy") != std::string::npos) {
std::string parent_name = it->first;
auto left_pos = it->first.find("/");
if (left_pos != std::string::npos) {
parent_name = it->first.substr(0, left_pos);
}
std::string parent_name = FindOrdinaryParent(sub_child_map, it->first);
auto item = it->second;
auto right_pos = item.name.rfind("/");
if (right_pos != std::string::npos) {
......@@ -389,6 +410,9 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map,
item.name = parent_name + "/" + child_name;
}
child_map->insert(std::pair<std::string, EventItem>(parent_name, item));
} else if (it->second.role == EventRole::kOrdinary) {
child_map->insert(
std::pair<std::string, EventItem>(it->first, it->second));
}
}
}
......
......@@ -40,6 +40,7 @@ TEST(RecordEvent, RecordEvent) {
using paddle::platform::PopEvent;
using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey;
using paddle::platform::EventRole;
ProfilerState state = ProfilerState::kCPU;
EnableProfiler(state);
......@@ -55,7 +56,7 @@ TEST(RecordEvent, RecordEvent) {
for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
PushEvent(name);
PushEvent(name, EventRole::kOrdinary);
int counter = 1;
while (counter != i * 1000) counter++;
PopEvent(name);
......@@ -107,7 +108,7 @@ TEST(RecordEvent, RecordEvent) {
}
// Bad Usage:
PushEvent("event_without_pop");
PushEvent("event_without_pop", EventRole::kOrdinary);
PopEvent("event_without_push");
std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册