Unverified commit 5e90f5e1 authored by Yibing Liu, committed by GitHub

Merge pull request #7043 from kuke/profiler_tool

Add the parsing part for the profiling tool
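In short, this adds a ParseEvents / PrintProfilingReport pair that aggregates the raw event lists returned by DisableProfiler into per-event statistics (calls, total/min/max/average time) and prints them sorted by a chosen key. A minimal sketch of how the entry points touched by this PR fit together, based on the API in the diff below; the wrapper function, the event name, and the profiled code are placeholders:

#include <vector>
#include "paddle/platform/profiler.h"

void ProfileOnCpu() {
  using namespace paddle::platform;
  // For CPU-only profiling the test below passes a null device context.
  DeviceContext* dev_ctx = nullptr;
  EnableProfiler(ProfilerState::kCPU);
  {
    // RAII helper: pushes a range on construction, pops it on destruction.
    RecordEvent record_event("my_op", dev_ctx);
    // ... code to be analyzed ...
  }
  std::vector<std::vector<Event>> events = DisableProfiler();
  // Aggregate the events and print the report, sorted by total time.
  ParseEvents(events, EventSortingKey::kTotal);
}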
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/platform/profiler.h"
#include <iomanip>
#include <map>
#include "glog/logging.h"
namespace paddle {
namespace platform {
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// To record which timer the profiler used, CUDA or CPU.
static std::string g_profiler_place = "";
// The thread-local event list can only be accessed by its owning thread
// The thread index of each thread
static thread_local int32_t g_thread_id;
@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() {
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx)
: kind_(kind),
name_(std::move(name)),
thread_id_(thread_id),
has_cuda_(false) {
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) {
@@ -72,11 +74,11 @@ std::string Event::kind() const {
PADDLE_THROW("Unknown EventKind.");
}
double Event::CpuElapsedUs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000.0);
double Event::CpuElapsedMs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
}
double Event::CudaElapsedUs(const Event& e) const {
double Event::CudaElapsedMs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
@@ -84,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const {
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms * 1000.0;
return ms;
#else
PADDLE_THROW("CUDA is not enabled");
#endif
@@ -113,21 +115,27 @@ inline EventList& GetEventList() {
}
void Mark(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
dev_ctx);
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
}
void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
}
void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
}
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
dev_ctx_);
name_ = name;
PushEvent(name_, dev_ctx_);
}
RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
dev_ctx_);
PopEvent(name_, dev_ctx_);
}
void EnableProfiler(ProfilerState state) {
@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling ",
"EnableProfiler.");
g_state = state;
g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
#ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) {
// Generate some dummy events first to reduce the startup overhead.
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() {
return result;
}
void ParseEvents(std::vector<std::vector<Event>>& events,
EventSortingKey sorted_by) {
if (g_profiler_place == "") return;
std::string sorted_domain;
std::function<bool(EventItem&, EventItem&)> sorted_func;
switch (sorted_by) {
case EventSortingKey::kCalls:
sorted_domain = "number of calls";
sorted_func = [](EventItem& a, EventItem& b) {
return a.calls > b.calls;
};
break;
case EventSortingKey::kTotal:
sorted_domain = "total time";
sorted_func = [](EventItem& a, EventItem& b) {
return a.total_time > b.total_time;
};
break;
case EventSortingKey::kMin:
sorted_domain = "minimum time";
sorted_func = [](EventItem& a, EventItem& b) {
return a.min_time > b.min_time;
};
break;
case EventSortingKey::kMax:
sorted_domain = "maximum time";
sorted_func = [](EventItem& a, EventItem& b) {
return a.max_time > b.max_time;
};
break;
case EventSortingKey::kAve:
sorted_domain = "average time";
sorted_func = [](EventItem& a, EventItem& b) {
return a.ave_time > b.ave_time;
};
break;
default:
sorted_domain = "event end time";
}
std::vector<std::vector<EventItem>> events_table;
size_t max_name_width = 0;
for (size_t i = 0; i < events.size(); i++) {
std::list<Event> pushed_events;
std::vector<EventItem> event_items;
std::unordered_map<std::string, int> event_idx;
for (size_t j = 0; j < events[i].size(); j++) {
if (events[i][j].kind() == "push") {
pushed_events.push_back(events[i][j]);
} else if (events[i][j].kind() == "pop") {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend() &&
rit->name() != events[i][j].name()) {
++rit;
}
if (rit != pushed_events.rend()) {
double event_time = (g_profiler_place == "CUDA")
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);
std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
max_name_width = std::max(max_name_width, event_name.size());
if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
event_items[index].calls += 1;
// total time
event_items[index].total_time += event_time;
// min time
event_items[index].min_time =
std::min(event_time, event_items[index].min_time);
// max time
event_items[index].max_time =
std::max(event_time, event_items[index].max_time);
}
// remove the push marker from the list
pushed_events.erase((++rit).base());
} else {
LOG(WARNING) << "Cannot find the push marker of event \'"
<< events[i][j].name()
<< "\', which will be ignored in profiling report.";
}
}
}
// average time
for (auto& item : event_items) {
item.ave_time = item.total_time / item.calls;
}
// sort
if (sorted_by != EventSortingKey::kDefault) {
std::sort(event_items.begin(), event_items.end(), sorted_func);
}
events_table.push_back(event_items);
// log warning if there are events with `push` but without `pop`
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend()) {
LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
<< "\', which will be ignored in profiling report.";
++rit;
}
}
// Print report
PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
}
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::cout << "Place: " << g_profiler_place << std::endl;
std::cout << "Time unit: ms" << std::endl;
std::cout << "Sorted by " << sorted_domain
<< " in descending order in the same thread\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
}
std::cout << std::endl;
}
} // namespace platform
} // namespace paddle
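One subtlety in ParseEvents above: the matching push marker is found with a reverse iterator, but std::list::erase takes a forward iterator, and reverse_iterator::base() refers to the element one past the one the reverse iterator designates. Incrementing the reverse iterator before calling base(), as in pushed_events.erase((++rit).base()), makes the erased element the one that was found. A small self-contained illustration of the idiom, with arbitrary values:

#include <cassert>
#include <list>

int main() {
  std::list<int> values = {1, 2, 3, 2};
  // Search from the back for the last occurrence of 2.
  auto rit = values.rbegin();
  while (rit != values.rend() && *rit != 2) ++rit;
  // base() designates the element one past *rit, so advance rit first;
  // the erase then removes exactly the element that was found.
  values.erase((++rit).base());
  assert((values == std::list<int>{1, 2, 3}));
  return 0;
}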
@@ -33,6 +33,7 @@ class Event {
std::string kind() const;
std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; }
#ifdef PADDLE_WITH_CUDA
@@ -40,8 +41,8 @@ class Event {
int device() const { return device_; }
#endif
double CpuElapsedUs(const Event& e) const;
double CudaElapsedUs(const Event& e) const;
double CpuElapsedMs(const Event& e) const;
double CudaElapsedMs(const Event& e) const;
private:
EventKind kind_;
@@ -94,6 +95,10 @@ enum ProfilerState {
void Mark(const std::string& name, DeviceContext* dev_ctx);
void PushEvent(const std::string& name, DeviceContext* dev_ctx);
void PopEvent(const std::string& name, DeviceContext* dev_ctx);
struct RecordEvent {
explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
@@ -101,6 +106,8 @@ struct RecordEvent {
// The device context is used by Event to get the current cuda stream.
DeviceContext* dev_ctx_;
// Event name
std::string name_;
};
// Enable the profiling function.
@@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state);
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> DisableProfiler();
// The information of each event given in the profiling report
struct EventItem {
std::string name;
int calls;
double total_time;
double min_time;
double max_time;
double ave_time;
};
// Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
EventSortingKey sorted_by = EventSortingKey::kDefault);
// Print results
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width);
} // namespace platform
} // namespace paddle
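For reference, each row of the printed report is one EventItem, keyed by "threadN::name" and aggregated over all matched push/pop pairs in that thread. A hypothetical example of the arithmetic, assuming one event is closed three times taking 1.0, 2.0 and 3.0 ms:

#include <algorithm>
#include <vector>
#include "paddle/platform/profiler.h"

int main() {
  // Hypothetical timings (ms) of three matched push/pop pairs of one event.
  std::vector<double> times = {1.0, 2.0, 3.0};
  paddle::platform::EventItem item = {"thread0::op_1", 0, 0.0, times[0], 0.0, 0.0};
  for (double t : times) {
    item.calls += 1;
    item.total_time += t;
    item.min_time = std::min(item.min_time, t);
    item.max_time = std::max(item.max_time, t);
  }
  item.ave_time = item.total_time / item.calls;
  // Resulting report row: Calls 3, Total 6, Min. 1, Max. 3, Ave. 2
  return 0;
}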
@@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) {
counter++;
}
Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
}
#ifdef PADDLE_WITH_CUDA
@@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) {
counter++;
}
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
}
#endif
@@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) {
using paddle::platform::EventKind;
using paddle::platform::RecordEvent;
using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey;
ProfilerState state = ProfilerState::kCPU;
DeviceContext* dev_ctx = nullptr;
@@ -67,13 +68,45 @@
#endif
EnableProfiler(state);
/* Usage 1:
* PushEvent(evt_name, dev_ctx);
* ...
* code to be analyzed
* ...
* PopEvent(evt_name, dev_ctx);
*/
for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
PushEvent(name, dev_ctx);
int counter = 1;
while (counter != i * 1000) counter++;
PopEvent(name, dev_ctx);
}
}
/* Usage 2:
* {
* RecordEvent record_event(name, dev_ctx);
* ...
* code to be analyzed
* ...
* }
*/
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx);
int counter = 1;
while (counter != i * 1000) counter++;
}
// Bad Usage:
PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx);
std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
// Will remove parsing-related code from test later
ParseEvents(events, EventSortingKey::kTotal);
int cuda_startup_count = 0;
int start_profiler_count = 0;
int stop_profiler_count = 0;
@@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) {
if (events[i][j].name() == "push") {
EXPECT_EQ(events[i][j + 1].name(), "pop");
#ifdef PADDLE_WITH_CUDA
EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0);
EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
#else
EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0);
EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
#endif
}
}
......