提交 add367c3 编写于 作者: Q qingqing01 提交者: Yi Wang

Code cleanup in the profiler code. (#9782)

上级 326b434a
...@@ -15,8 +15,11 @@ limitations under the License. */ ...@@ -15,8 +15,11 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <algorithm>
#include <iomanip> #include <iomanip>
#include <map> #include <map>
#include <mutex> // NOLINT
#include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -28,10 +31,10 @@ limitations under the License. */ ...@@ -28,10 +31,10 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct EventList;
// The profiler state, the initial value is ProfilerState::kDisabled // The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled; static ProfilerState g_state = ProfilerState::kDisabled;
// To record which timer the profiler used, CUDA or CPU.
static std::string g_profiler_place = "";
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
// The thread index of each thread // The thread index of each thread
static thread_local int32_t g_thread_id; static thread_local int32_t g_thread_id;
...@@ -45,6 +48,39 @@ static std::list<std::shared_ptr<EventList>> g_all_event_lists; ...@@ -45,6 +48,39 @@ static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list; static thread_local std::shared_ptr<EventList> g_event_list;
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
void Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks;
};
inline uint64_t GetTimeInNsec() { inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady, using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock, std::chrono::high_resolution_clock,
...@@ -60,9 +96,9 @@ inline uint64_t PosixInNsec() { ...@@ -60,9 +96,9 @@ inline uint64_t PosixInNsec() {
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
} }
Event::Event(EventKind kind, std::string name, uint32_t thread_id, Event::Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx) const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false; has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) { if (has_cuda_) {
...@@ -76,17 +112,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id, ...@@ -76,17 +112,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id,
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
std::string Event::kind() const { const EventType& Event::type() const { return type_; }
switch (kind_) {
case EventKind::kMark:
return "mark";
case EventKind::kPushRange:
return "push";
case EventKind::kPopRange:
return "pop";
}
PADDLE_THROW("Unknown EventKind.");
}
double Event::CpuElapsedMs(const Event& e) const { double Event::CpuElapsedMs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0); return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
...@@ -129,15 +155,15 @@ inline EventList& GetEventList() { ...@@ -129,15 +155,15 @@ inline EventList& GetEventList() {
} }
void Mark(const std::string& name, const DeviceContext* dev_ctx) { void Mark(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
} }
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
} }
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
...@@ -197,12 +223,7 @@ void EnableProfiler(ProfilerState state) { ...@@ -197,12 +223,7 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling ", "The profiling state should be disabled when calling ",
"EnableProfiler."); "EnableProfiler.");
g_state = state; g_state = state;
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kAll) {
g_profiler_place = "CUDA";
} else if (g_state == ProfilerState::kCPU) {
g_profiler_place = "CPU";
} else {
g_profiler_place = "All";
GetDeviceTracer()->Enable(); GetDeviceTracer()->Enable();
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -240,27 +261,63 @@ std::vector<std::vector<Event>> GetAllEvents() { ...@@ -240,27 +261,63 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result; return result;
} }
void DisableProfiler(EventSortingKey sorted_key, // The information of each event given in the profiling report
const std::string& profile_path) { struct EventItem {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, std::string name;
"Can't disable profiling, since it's not starting."); int calls;
// Mark the profiling stop. double total_time;
Mark("_stop_profiler_", nullptr); double min_time;
g_state = ProfilerState::kDisabled; double max_time;
double ave_time;
};
// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::string place;
if (g_state == ProfilerState::kCPU) {
place = "CPU";
} else if (g_state == ProfilerState::kCUDA) {
place = "CUDA";
} else if (g_state == ProfilerState::kAll) {
place = "All";
} else {
PADDLE_THROW("Invalid profiler state");
}
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::cout << "Place: " << place << std::endl;
ParseEvents(all_events, sorted_key); std::cout << "Time unit: ms" << std::endl;
ResetProfiler(); std::cout << "Sorted by " << sorted_domain
DeviceTracer* tracer = GetDeviceTracer(); << " in descending order in the same thread\n\n";
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { // Output events table
tracer->Disable(); std::cout.setf(std::ios::left);
tracer->GenProfile(profile_path); std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
} }
std::cout << std::endl;
} }
void ParseEvents(std::vector<std::vector<Event>>& events, // Parse the event list and output the profiling report
EventSortingKey sorted_by) { void ParseEvents(const std::vector<std::vector<Event>>& events,
if (g_profiler_place == "") return; EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return;
std::string sorted_domain; std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func; std::function<bool(const EventItem&, const EventItem&)> sorted_func;
...@@ -307,9 +364,9 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -307,9 +364,9 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
std::unordered_map<std::string, int> event_idx; std::unordered_map<std::string, int> event_idx;
for (size_t j = 0; j < events[i].size(); j++) { for (size_t j = 0; j < events[i].size(); j++) {
if (events[i][j].kind() == "push") { if (events[i][j].type() == EventType::kPushRange) {
pushed_events.push_back(events[i][j]); pushed_events.push_back(events[i][j]);
} else if (events[i][j].kind() == "pop") { } else if (events[i][j].type() == EventType::kPopRange) {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin(); std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend() && while (rit != pushed_events.rend() &&
rit->name() != events[i][j].name()) { rit->name() != events[i][j].name()) {
...@@ -317,8 +374,8 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -317,8 +374,8 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
} }
if (rit != pushed_events.rend()) { if (rit != pushed_events.rend()) {
double event_time = double event_time = (g_state == ProfilerState::kCUDA ||
(g_profiler_place == "CUDA" || g_profiler_place == "All") g_state == ProfilerState::kAll)
? rit->CudaElapsedMs(events[i][j]) ? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]); : rit->CpuElapsedMs(events[i][j]);
...@@ -376,35 +433,22 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -376,35 +433,22 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
} }
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table, void DisableProfiler(EventSortingKey sorted_key,
std::string& sorted_domain, const size_t name_width, const std::string& profile_path) {
const size_t data_width) { PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
// Output header information "Can't disable profiling, since it's not starting.");
std::cout << "\n------------------------->" // Mark the profiling stop.
<< " Profiling Report " Mark("_stop_profiler_", nullptr);
<< "<-------------------------\n\n";
std::cout << "Place: " << g_profiler_place << std::endl; std::vector<std::vector<Event>> all_events = GetAllEvents();
std::cout << "Time unit: ms" << std::endl; ParseEvents(all_events, sorted_key);
std::cout << "Sorted by " << sorted_domain ResetProfiler();
<< " in descending order in the same thread\n\n"; DeviceTracer* tracer = GetDeviceTracer();
// Output events table if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) {
std::cout.setf(std::ios::left); tracer->Disable();
std::cout << std::setw(name_width) << "Event" << std::setw(data_width) tracer->GenProfile(profile_path);
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
} }
std::cout << std::endl; g_state = ProfilerState::kDisabled;
} }
} // namespace platform } // namespace platform
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <forward_list> #include <forward_list>
#include <list> #include <list>
#include <mutex> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
...@@ -23,16 +23,16 @@ limitations under the License. */ ...@@ -23,16 +23,16 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
enum EventKind { kMark, kPushRange, kPopRange }; enum EventType { kMark, kPushRange, kPopRange };
class Event { class Event {
public: public:
// The DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr. // If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id, Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx); const DeviceContext* dev_ctx);
std::string kind() const; const EventType& type() const;
std::string name() const { return name_; } std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; } uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; } bool has_cuda() const { return has_cuda_; }
...@@ -46,7 +46,7 @@ class Event { ...@@ -46,7 +46,7 @@ class Event {
double CudaElapsedMs(const Event& e) const; double CudaElapsedMs(const Event& e) const;
private: private:
EventKind kind_; EventType type_;
std::string name_; std::string name_;
uint32_t thread_id_; uint32_t thread_id_;
int64_t cpu_ns_; int64_t cpu_ns_;
...@@ -57,39 +57,6 @@ class Event { ...@@ -57,39 +57,6 @@ class Event {
#endif #endif
}; };
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
void Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks;
};
enum ProfilerState { enum ProfilerState {
kDisabled, // disabled state kDisabled, // disabled state
kCPU, // CPU profiling state kCPU, // CPU profiling state
...@@ -136,16 +103,6 @@ struct RecordThread { ...@@ -136,16 +103,6 @@ struct RecordThread {
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents(); std::vector<std::vector<Event>> GetAllEvents();
// The information of each event given in the profiling report
struct EventItem {
std::string name;
int calls;
double total_time;
double min_time;
double max_time;
double ave_time;
};
// Candidate keys to sort the profiling report // Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
...@@ -158,14 +115,5 @@ void ResetProfiler(); ...@@ -158,14 +115,5 @@ void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path); const std::string& profile_path);
// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
EventSortingKey sorted_by = EventSortingKey::kDefault);
// Print results
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,22 +13,23 @@ See the License for the specific language governing permissions and ...@@ -13,22 +13,23 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "cuda_runtime.h" #include <cuda_runtime.h>
#endif #endif
#include "gtest/gtest.h" #include "gtest/gtest.h"
TEST(Event, CpuElapsedTime) { TEST(Event, CpuElapsedTime) {
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
Event start_event(EventKind::kPushRange, "test", 0, nullptr); Event start_event(EventType::kPushRange, "test", 0, nullptr);
EXPECT_TRUE(start_event.has_cuda() == false); EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0, nullptr); Event stop_event(EventType::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
} }
...@@ -38,16 +39,16 @@ TEST(Event, CudaElapsedTime) { ...@@ -38,16 +39,16 @@ TEST(Event, CudaElapsedTime) {
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true); EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
} }
#endif #endif
...@@ -55,7 +56,7 @@ TEST(Event, CudaElapsedTime) { ...@@ -55,7 +56,7 @@ TEST(Event, CudaElapsedTime) {
TEST(RecordEvent, RecordEvent) { TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
using paddle::platform::RecordEvent; using paddle::platform::RecordEvent;
using paddle::platform::ProfilerState; using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey; using paddle::platform::EventSortingKey;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册