From add367c3f4b77103cda50b9359492f34a366477c Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 10 Apr 2018 02:25:26 +0800 Subject: [PATCH] Code cleanup in the profiler code. (#9782) --- paddle/fluid/platform/profiler.cc | 194 +++++++++++++++---------- paddle/fluid/platform/profiler.h | 62 +------- paddle/fluid/platform/profiler_test.cc | 17 ++- 3 files changed, 133 insertions(+), 140 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index b25206ff35c..412cdda286c 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -15,8 +15,11 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include #include +#include #include #include +#include // NOLINT +#include #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA @@ -28,10 +31,10 @@ limitations under the License. */ namespace paddle { namespace platform { +struct EventList; + // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; -// To record which timer the profiler used, CUDA or CPU. -static std::string g_profiler_place = ""; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; @@ -45,6 +48,39 @@ static std::list> g_all_event_lists; // The thread local event list only can be accessed by the specific thread static thread_local std::shared_ptr g_event_list; +struct EventList { + constexpr static size_t kMB = 1024 * 1024; + constexpr static size_t kEventBlockSize = 16 * kMB; + constexpr static size_t kEventSize = sizeof(Event); + constexpr static size_t kEventAlign = alignof(Event); + constexpr static size_t kNumBlock = + kEventBlockSize / + ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); + + template + void Record(Args&&... args) { + if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { + event_blocks.emplace_front(); + event_blocks.front().reserve(kNumBlock); + } + event_blocks.front().emplace_back(std::forward(args)...); + } + + std::vector Reduce() { + std::vector result; + for (auto& block : event_blocks) { + result.insert(result.begin(), std::make_move_iterator(block.begin()), + std::make_move_iterator(block.end())); + } + event_blocks.clear(); + return result; + } + + void Clear() { event_blocks.clear(); } + + std::forward_list> event_blocks; +}; + inline uint64_t GetTimeInNsec() { using clock = std::conditional(tv.tv_sec) * 1000000 + tv.tv_usec); } -Event::Event(EventKind kind, std::string name, uint32_t thread_id, +Event::Event(EventType type, std::string name, uint32_t thread_id, const DeviceContext* dev_ctx) - : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { + : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) { #ifdef PADDLE_WITH_CUDA has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false; if (has_cuda_) { @@ -76,17 +112,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id, cpu_ns_ = GetTimeInNsec(); } -std::string Event::kind() const { - switch (kind_) { - case EventKind::kMark: - return "mark"; - case EventKind::kPushRange: - return "push"; - case EventKind::kPopRange: - return "pop"; - } - PADDLE_THROW("Unknown EventKind."); -} +const EventType& Event::type() const { return type_; } double Event::CpuElapsedMs(const Event& e) const { return (e.cpu_ns_ - cpu_ns_) / (1000000.0); @@ -129,15 +155,15 @@ inline EventList& GetEventList() { } void Mark(const std::string& name, const DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); + GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx); } void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); + GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx); } void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); + GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx); } RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) @@ -197,12 +223,7 @@ void EnableProfiler(ProfilerState state) { "The profiling state should be disabled when calling ", "EnableProfiler."); g_state = state; - if (g_state == ProfilerState::kCUDA) { - g_profiler_place = "CUDA"; - } else if (g_state == ProfilerState::kCPU) { - g_profiler_place = "CPU"; - } else { - g_profiler_place = "All"; + if (g_state == ProfilerState::kAll) { GetDeviceTracer()->Enable(); } #ifdef PADDLE_WITH_CUDA @@ -240,27 +261,63 @@ std::vector> GetAllEvents() { return result; } -void DisableProfiler(EventSortingKey sorted_key, - const std::string& profile_path) { - PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, - "Can't disable profiling, since it's not starting."); - // Mark the profiling stop. - Mark("_stop_profiler_", nullptr); - g_state = ProfilerState::kDisabled; +// The information of each event given in the profiling report +struct EventItem { + std::string name; + int calls; + double total_time; + double min_time; + double max_time; + double ave_time; +}; + +// Print results +void PrintProfiler(const std::vector>& events_table, + const std::string& sorted_domain, const size_t name_width, + const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::string place; + if (g_state == ProfilerState::kCPU) { + place = "CPU"; + } else if (g_state == ProfilerState::kCUDA) { + place = "CUDA"; + } else if (g_state == ProfilerState::kAll) { + place = "All"; + } else { + PADDLE_THROW("Invalid profiler state"); + } - std::vector> all_events = GetAllEvents(); - ParseEvents(all_events, sorted_key); - ResetProfiler(); - DeviceTracer* tracer = GetDeviceTracer(); - if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { - tracer->Disable(); - tracer->GenProfile(profile_path); + std::cout << "Place: " << place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." << std::endl; + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + const EventItem& event_item = events_table[i][j]; + std::cout << std::setw(name_width) << event_item.name + << std::setw(data_width) << event_item.calls + << std::setw(data_width) << event_item.total_time + << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time << std::endl; + } } + std::cout << std::endl; } -void ParseEvents(std::vector>& events, - EventSortingKey sorted_by) { - if (g_profiler_place == "") return; +// Parse the event list and output the profiling report +void ParseEvents(const std::vector>& events, + EventSortingKey sorted_by = EventSortingKey::kDefault) { + if (g_state == ProfilerState::kDisabled) return; std::string sorted_domain; std::function sorted_func; @@ -307,9 +364,9 @@ void ParseEvents(std::vector>& events, std::unordered_map event_idx; for (size_t j = 0; j < events[i].size(); j++) { - if (events[i][j].kind() == "push") { + if (events[i][j].type() == EventType::kPushRange) { pushed_events.push_back(events[i][j]); - } else if (events[i][j].kind() == "pop") { + } else if (events[i][j].type() == EventType::kPopRange) { std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit != pushed_events.rend() && rit->name() != events[i][j].name()) { @@ -317,10 +374,10 @@ void ParseEvents(std::vector>& events, } if (rit != pushed_events.rend()) { - double event_time = - (g_profiler_place == "CUDA" || g_profiler_place == "All") - ? rit->CudaElapsedMs(events[i][j]) - : rit->CpuElapsedMs(events[i][j]); + double event_time = (g_state == ProfilerState::kCUDA || + g_state == ProfilerState::kAll) + ? rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); @@ -376,35 +433,22 @@ void ParseEvents(std::vector>& events, PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); } -void PrintProfiler(std::vector>& events_table, - std::string& sorted_domain, const size_t name_width, - const size_t data_width) { - // Output header information - std::cout << "\n------------------------->" - << " Profiling Report " - << "<-------------------------\n\n"; - std::cout << "Place: " << g_profiler_place << std::endl; - std::cout << "Time unit: ms" << std::endl; - std::cout << "Sorted by " << sorted_domain - << " in descending order in the same thread\n\n"; - // Output events table - std::cout.setf(std::ios::left); - std::cout << std::setw(name_width) << "Event" << std::setw(data_width) - << "Calls" << std::setw(data_width) << "Total" - << std::setw(data_width) << "Min." << std::setw(data_width) - << "Max." << std::setw(data_width) << "Ave." << std::endl; - for (size_t i = 0; i < events_table.size(); ++i) { - for (size_t j = 0; j < events_table[i].size(); ++j) { - EventItem& event_item = events_table[i][j]; - std::cout << std::setw(name_width) << event_item.name - << std::setw(data_width) << event_item.calls - << std::setw(data_width) << event_item.total_time - << std::setw(data_width) << event_item.min_time - << std::setw(data_width) << event_item.max_time - << std::setw(data_width) << event_item.ave_time << std::endl; - } +void DisableProfiler(EventSortingKey sorted_key, + const std::string& profile_path) { + PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, + "Can't disable profiling, since it's not starting."); + // Mark the profiling stop. + Mark("_stop_profiler_", nullptr); + + std::vector> all_events = GetAllEvents(); + ParseEvents(all_events, sorted_key); + ResetProfiler(); + DeviceTracer* tracer = GetDeviceTracer(); + if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) { + tracer->Disable(); + tracer->GenProfile(profile_path); } - std::cout << std::endl; + g_state = ProfilerState::kDisabled; } } // namespace platform diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index de9a5cc20d7..b07427c8f69 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include -#include +#include #include #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/profiler.pb.h" @@ -23,16 +23,16 @@ limitations under the License. */ namespace paddle { namespace platform { -enum EventKind { kMark, kPushRange, kPopRange }; +enum EventType { kMark, kPushRange, kPopRange }; class Event { public: // The DeviceContext is used to get the cuda stream. // If CPU profiling mode, can pass nullptr. - Event(EventKind kind, std::string name, uint32_t thread_id, + Event(EventType type, std::string name, uint32_t thread_id, const DeviceContext* dev_ctx); - std::string kind() const; + const EventType& type() const; std::string name() const { return name_; } uint32_t thread_id() const { return thread_id_; } bool has_cuda() const { return has_cuda_; } @@ -46,7 +46,7 @@ class Event { double CudaElapsedMs(const Event& e) const; private: - EventKind kind_; + EventType type_; std::string name_; uint32_t thread_id_; int64_t cpu_ns_; @@ -57,39 +57,6 @@ class Event { #endif }; -struct EventList { - constexpr static size_t kMB = 1024 * 1024; - constexpr static size_t kEventBlockSize = 16 * kMB; - constexpr static size_t kEventSize = sizeof(Event); - constexpr static size_t kEventAlign = alignof(Event); - constexpr static size_t kNumBlock = - kEventBlockSize / - ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); - - template - void Record(Args&&... args) { - if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { - event_blocks.emplace_front(); - event_blocks.front().reserve(kNumBlock); - } - event_blocks.front().emplace_back(std::forward(args)...); - } - - std::vector Reduce() { - std::vector result; - for (auto& block : event_blocks) { - result.insert(result.begin(), std::make_move_iterator(block.begin()), - std::make_move_iterator(block.end())); - } - event_blocks.clear(); - return result; - } - - void Clear() { event_blocks.clear(); } - - std::forward_list> event_blocks; -}; - enum ProfilerState { kDisabled, // disabled state kCPU, // CPU profiling state @@ -136,16 +103,6 @@ struct RecordThread { // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); -// The information of each event given in the profiling report -struct EventItem { - std::string name; - int calls; - double total_time; - double min_time; - double max_time; - double ave_time; -}; - // Candidate keys to sort the profiling report enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; @@ -158,14 +115,5 @@ void ResetProfiler(); void DisableProfiler(EventSortingKey sorted_key, const std::string& profile_path); -// Parse the event list and output the profiling report -void ParseEvents(std::vector>&, - EventSortingKey sorted_by = EventSortingKey::kDefault); - -// Print results -void PrintProfiler(std::vector>& events_table, - std::string& sorted_domain, const size_t name_width, - const size_t data_width); - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 45cc271bb88..61f467814ba 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -13,22 +13,23 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" +#include #ifdef PADDLE_WITH_CUDA -#include "cuda_runtime.h" +#include #endif #include "gtest/gtest.h" TEST(Event, CpuElapsedTime) { using paddle::platform::Event; - using paddle::platform::EventKind; + using paddle::platform::EventType; - Event start_event(EventKind::kPushRange, "test", 0, nullptr); + Event start_event(EventType::kPushRange, "test", 0, nullptr); EXPECT_TRUE(start_event.has_cuda() == false); int counter = 0; while (counter != 1000) { counter++; } - Event stop_event(EventKind::kPopRange, "test", 0, nullptr); + Event stop_event(EventType::kPopRange, "test", 0, nullptr); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); } @@ -38,16 +39,16 @@ TEST(Event, CudaElapsedTime) { using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; using paddle::platform::Event; - using paddle::platform::EventKind; + using paddle::platform::EventType; DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); - Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); + Event start_event(EventType::kPushRange, "test", 0, dev_ctx); EXPECT_TRUE(start_event.has_cuda() == true); int counter = 0; while (counter != 1000) { counter++; } - Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); + Event stop_event(EventType::kPopRange, "test", 0, dev_ctx); EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); } #endif @@ -55,7 +56,7 @@ TEST(Event, CudaElapsedTime) { TEST(RecordEvent, RecordEvent) { using paddle::platform::DeviceContext; using paddle::platform::Event; - using paddle::platform::EventKind; + using paddle::platform::EventType; using paddle::platform::RecordEvent; using paddle::platform::ProfilerState; using paddle::platform::EventSortingKey; -- GitLab