未验证 提交 c3876cf8 编写于 作者: W wangchaochaohu 提交者: GitHub

add support for nested profiling event and printing in different level (#22061)

* add support for nested profiling event and printing in different level
上级 019e2019
......@@ -166,16 +166,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#endif
}
// The profile has a process-wide mutex, results in serious performance
// issue
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) {
platform::RecordEvent record_event(Type());
RunImpl(scope, place);
} else {
{
platform::RecordEvent record_event(Type() + "_op");
RunImpl(scope, place);
}
VLOG(3) << place << " " << DebugStringEx(&scope);
} catch (platform::EnforceNotMet& exception) {
framework::InsertCallStackInfo(Type(), Attrs(), &exception);
......@@ -953,9 +948,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// do data transformScope &transfer_scope;
std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope =
PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
Scope* transfer_scope = nullptr;
{
platform::RecordEvent record_event("prepare_data");
transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars,
runtime_ctx);
}
// exec scope is the scope that kernel actually executed on.
const Scope& exec_scope =
(transfer_scope == nullptr ? scope : *transfer_scope);
......@@ -965,6 +963,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
if (!all_kernels_must_compute_runtime_shape_) {
platform::RecordEvent record_event("infer_shape");
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
this->InferShape(&infer_shape_ctx);
}
......@@ -975,8 +974,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
(*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
kernel_configs));
{
platform::RecordEvent record_event("compute");
(*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
kernel_configs));
}
if (!transfered_inplace_vars.empty()) {
// there is inplace variable has been transfered.
......
......@@ -446,6 +446,11 @@ class DeviceTracerImpl : public DeviceTracer {
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
Event *e = c->second;
Event *parent = e->parent();
while (parent) {
parent->AddCudaElapsedTime(r.start_ns, r.end_ns);
parent = parent->parent();
}
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
}
}
......@@ -453,6 +458,11 @@ class DeviceTracerImpl : public DeviceTracer {
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
Event *e = c->second;
Event *parent = e->parent();
while (parent) {
parent->AddCudaElapsedTime(r.start_ns, r.end_ns);
parent = parent->parent();
}
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
}
}
......@@ -622,7 +632,13 @@ DeviceTracer *GetDeviceTracer() {
return tracer;
}
void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
void SetCurAnnotation(Event *event) {
if (!annotation_stack.empty()) {
event->set_parent(annotation_stack.back());
event->set_name(annotation_stack.back()->name() + "/" + event->name());
}
annotation_stack.push_back(event);
}
void ClearCurAnnotation() { annotation_stack.pop_back(); }
......
......@@ -32,8 +32,11 @@ class Event {
Event(EventType type, std::string name, uint32_t thread_id);
const EventType& type() const;
Event* parent() const { return parent_; }
void set_parent(Event* parent) { parent_ = parent; }
std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; }
void set_name(std::string name) { name_ = name; }
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
......@@ -47,9 +50,11 @@ class Event {
private:
EventType type_;
std::string name_;
std::string name_{};
Event* parent_{nullptr};
uint32_t thread_id_;
int64_t cpu_ns_;
bool visited_status_{false};
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t gpu_ns_ = 0;
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <map>
#include <mutex> // NOLINT
#include <random>
#include <stack>
#include <string>
#include <vector>
......@@ -138,14 +139,14 @@ void PopEvent(const std::string &name) {
RecordEvent::RecordEvent(const std::string &name)
: is_enabled_(false), start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return;
if (g_state == ProfilerState::kDisabled || name.empty()) return;
// lock is not needed, the code below is thread-safe
is_enabled_ = true;
name_ = name;
Event *e = PushEvent(name_);
Event *e = PushEvent(name);
// Maybe need the same push/pop behavior.
SetCurAnnotation(e);
name_ = e->name();
}
RecordEvent::~RecordEvent() {
......@@ -320,48 +321,86 @@ struct EventItem {
// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
const std::multimap<std::string, EventItem> &child_map,
const std::string &sorted_domain, const size_t name_width,
const size_t data_width, bool merge_thread) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::string place;
if (g_state == ProfilerState::kCPU) {
place = "CPU";
} else if (g_state == ProfilerState::kCUDA) {
place = "CUDA";
} else if (g_state == ProfilerState::kAll) {
place = "All";
} else {
PADDLE_THROW("Invalid profiler state", g_state);
}
const size_t data_width, bool merge_thread, int print_depth,
int remove_len) {
if (print_depth == 0) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::string place;
if (g_state == ProfilerState::kCPU) {
place = "CPU";
} else if (g_state == ProfilerState::kCUDA) {
place = "CUDA";
} else if (g_state == ProfilerState::kAll) {
place = "All";
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Except profiler state must to be one of ['CPU', 'GPU' 'ALL'], but "
"received Invalid profiler state %s",
g_state));
}
if (merge_thread) {
std::cout << "Note! This Report merge all thread info into one."
<< std::endl;
}
std::cout << "Place: " << place << std::endl;
std::cout << "Time unit: ms" << std::endl;
std::cout << "Sorted by " << sorted_domain
<< " in descending order in the same thread\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total";
if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
<< std::setw(data_width * 2) << "GPU Time (Ratio)";
if (merge_thread) {
std::cout << "Note! This Report merge all thread info into one."
<< std::endl;
}
std::cout << "Place: " << place << std::endl;
std::cout << "Time unit: ms" << std::endl;
std::cout << "Sorted by " << sorted_domain
<< " in descending order in the same thread\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total";
if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
<< std::setw(data_width * 2) << "GPU Time (Ratio)";
}
std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Ratio." << std::endl;
}
std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Ratio." << std::endl;
if (print_depth >= 100) return;
int print_depth_next = print_depth;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem &event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time;
auto event_item = events_table[i][j];
std::vector<std::vector<EventItem>> child_table;
std::vector<EventItem> table;
bool do_next = false;
std::string op_end_str = "_op";
for (auto it = child_map.begin(); it != child_map.end(); it++) {
if (it->first == event_item.name) {
table.push_back(it->second);
do_next = it->second.name.rfind(op_end_str) ==
(it->second.name.length() - op_end_str.length());
}
}
child_table.push_back(table);
if (do_next)
print_depth_next = print_depth + 1;
else
print_depth_next = print_depth + 100;
auto name_len = event_item.name.length();
std::string print_name = event_item.name.substr(remove_len, name_len);
std::string delimiter;
for (int i = 0; i < print_depth; i++) {
delimiter = " " + delimiter;
}
print_name = delimiter + print_name;
size_t pos = 0;
while ((pos = print_name.find(op_end_str)) != std::string::npos) {
print_name.erase(pos, op_end_str.length());
}
std::cout << std::setw(name_width) << print_name << std::setw(data_width)
<< event_item.calls << std::setw(data_width)
<< event_item.total_time;
if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2)
<< string::Sprintf(
......@@ -376,19 +415,15 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.ratio << std::endl;
PrintProfiler(child_table, child_map, sorted_domain, name_width,
data_width, merge_thread, print_depth_next, 0);
}
}
std::cout << std::endl;
}
// Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>> &events,
bool merge_thread,
EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return;
if (merge_thread && events.size() < 2) return;
std::string sorted_domain;
std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
EventSortingKey sorted_by, std::string *domain) {
auto sorted_domain = *domain;
std::function<bool(const EventItem &, const EventItem &)> sorted_func;
switch (sorted_by) {
case EventSortingKey::kCalls:
......@@ -436,6 +471,88 @@ void ParseEvents(const std::vector<std::vector<Event>> &events,
default:
sorted_domain = "event first end time";
}
return sorted_func;
}
void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width,
std::list<Event> *pushed_events,
std::vector<EventItem> *event_items,
std::unordered_map<std::string, int> *event_idx) {
if (analyze_event.type() == EventType::kPushRange) {
pushed_events->push_back(analyze_event);
} else if (analyze_event.type() == EventType::kPopRange) {
std::list<Event>::reverse_iterator rit = pushed_events->rbegin();
while (rit != pushed_events->rend() &&
rit->name() != analyze_event.name()) {
++rit;
}
// to find the father name event name
if (rit != pushed_events->rend()) {
double event_time = 0;
double gpu_time = 0.0f;
#ifdef PADDLE_WITH_CUDA
gpu_time = rit->CudaElapsedMs(analyze_event);
#endif
double cpu_time = rit->CpuElapsedMs(analyze_event);
if (g_state == ProfilerState::kCUDA) {
event_time = gpu_time;
} else if (g_state == ProfilerState::kCPU) {
event_time = cpu_time;
} else {
event_time = gpu_time + cpu_time;
}
std::string event_name;
if (merge_thread) {
event_name = rit->name();
*max_name_width = std::max(*max_name_width, event_name.size());
} else {
event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
*max_name_width = std::max(*max_name_width, event_name.size());
}
if (event_idx->find(event_name) == event_idx->end()) {
event_idx->insert({event_name, event_items->size()});
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time,
cpu_time, gpu_time, 0.};
event_items->push_back(event_item);
} else {
int index = event_idx->at(event_name);
event_items->at(index).calls += 1;
// total time
event_items->at(index).total_time += event_time;
// min time
event_items->at(index).min_time =
std::min(event_time, event_items->at(index).min_time);
// max time
event_items->at(index).max_time =
std::max(event_time, event_items->at(index).max_time);
event_items->at(index).gpu_time += gpu_time;
event_items->at(index).cpu_time += cpu_time;
}
// remove the push marker from the list
pushed_events->erase((++rit).base());
} else {
LOG(WARNING) << "Cannot find the push marker of event \'"
<< analyze_event.name()
<< "\', which will be ignored in profiling report.";
}
}
}
// Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>> &events,
bool merge_thread,
EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return;
if (merge_thread && events.size() < 2) return;
std::string sorted_domain;
std::function<bool(const EventItem &, const EventItem &)> sorted_func;
sorted_func = SetSortedFunc(sorted_by, &sorted_domain);
const std::vector<std::vector<Event>> *analyze_events;
std::vector<std::vector<Event>> merged_events_list;
......@@ -453,91 +570,65 @@ void ParseEvents(const std::vector<std::vector<Event>> &events,
}
std::vector<std::vector<EventItem>> events_table;
std::multimap<std::string, EventItem> child_map;
size_t max_name_width = 0;
for (size_t i = 0; i < (*analyze_events).size(); i++) {
double total = 0.; // the total time in one thread
std::list<Event> pushed_events;
std::vector<EventItem> event_items;
std::vector<EventItem> main_event_items;
std::unordered_map<std::string, int> event_idx;
for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
if ((*analyze_events)[i][j].type() == EventType::kPushRange) {
pushed_events.push_back((*analyze_events)[i][j]);
} else if ((*analyze_events)[i][j].type() == EventType::kPopRange) {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend() &&
rit->name() != (*analyze_events)[i][j].name()) {
++rit;
}
Event analyze_event = (*analyze_events)[i][j];
SetEvent(merge_thread, analyze_event, &max_name_width, &pushed_events,
&event_items, &event_idx);
}
if (rit != pushed_events.rend()) {
double event_time = 0;
double gpu_time = 0.0f;
#ifdef PADDLE_WITH_CUDA
gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
#endif
double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
if (g_state == ProfilerState::kCUDA) {
event_time = gpu_time;
} else if (g_state == ProfilerState::kCPU) {
event_time = cpu_time;
} else {
event_time = gpu_time + cpu_time;
}
total += event_time;
std::string event_name;
if (merge_thread) {
event_name = rit->name();
max_name_width = std::max(max_name_width, event_name.size());
} else {
event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
rit->name();
max_name_width = std::max(max_name_width, event_name.size());
}
if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time,
cpu_time, gpu_time, 0.};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
event_items[index].calls += 1;
// total time
event_items[index].total_time += event_time;
// min time
event_items[index].min_time =
std::min(event_time, event_items[index].min_time);
// max time
event_items[index].max_time =
std::max(event_time, event_items[index].max_time);
event_items[index].gpu_time += gpu_time;
event_items[index].cpu_time += cpu_time;
}
// remove the push marker from the list
pushed_events.erase((++rit).base());
} else {
LOG(WARNING) << "Cannot find the push marker of event \'"
<< (*analyze_events)[i][j].name()
<< "\', which will be ignored in profiling report.";
auto table_size = event_items.size();
std::vector<int> child_index(table_size, 0);
for (size_t j = 0; j < table_size; ++j) {
std::string fname = event_items[j].name;
std::string grad_name = event_items[j].name + "_grad";
for (size_t k = 0; k < table_size; ++k) {
std::string cname = event_items[k].name;
bool condition = cname.length() > fname.length() &&
cname.rfind(fname, 0) == 0 &&
!cname.rfind(grad_name, 0) == 0 &&
(cname[fname.length()] == '/' &&
cname.rfind('/') == fname.length());
if (condition) {
child_map.insert(
std::pair<std::string, EventItem>(fname, event_items[k]));
child_index[k] = 1;
}
}
}
for (size_t j = 0; j < table_size; ++j) {
if (child_index[j] == 0) {
main_event_items.push_back(event_items[j]);
total += event_items[j].total_time;
}
}
// average time
for (auto &item : event_items) {
for (auto &item : main_event_items) {
item.ave_time = item.total_time / item.calls;
item.ratio = item.total_time / total;
}
for (auto it = child_map.begin(); it != child_map.end(); it++) {
it->second.ratio = it->second.total_time / total;
it->second.ave_time = it->second.ave_time / it->second.calls;
}
// sort
if (sorted_by != EventSortingKey::kDefault) {
std::sort(event_items.begin(), event_items.end(), sorted_func);
std::sort(main_event_items.begin(), main_event_items.end(), sorted_func);
}
events_table.push_back(event_items);
events_table.push_back(main_event_items);
// log warning if there are events with `push` but without `pop`
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend()) {
......@@ -548,8 +639,8 @@ void ParseEvents(const std::vector<std::vector<Event>> &events,
}
// Print report
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
merge_thread);
PrintProfiler(events_table, child_map, sorted_domain, max_name_width + 8, 12,
merge_thread, 0, 0);
}
struct MemoryProfierReport {
......@@ -630,6 +721,28 @@ void DisableProfiler(EventSortingKey sorted_key,
}
std::vector<std::vector<Event>> all_events = GetAllEvents();
std::vector<std::string> op_name;
for (size_t i = 0; i < (all_events).size(); i++) {
for (size_t j = 0; j < (all_events)[i].size(); j++) {
std::string event_name = (all_events)[i][j].name();
size_t start = event_name.find('%', 0);
size_t end = event_name.rfind('%', event_name.length() - 1);
if (start == std::string::npos || end == std::string::npos) continue;
std::string search_str = event_name.substr(start, end - start + 1);
auto it = find(op_name.begin(), op_name.end(), search_str);
std::string replace_str;
if (it != op_name.end()) {
replace_str = std::to_string(std::distance(op_name.begin(), it));
} else {
replace_str = std::to_string(op_name.size());
op_name.push_back(search_str);
}
event_name.replace(start, end - start + 1, replace_str);
(all_events)[i][j].set_name(event_name);
}
}
ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
if (VLOG_IS_ON(5)) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册