From 0a5fbb06508731aa55ffda3e4a68a9fabff2a72a Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Fri, 29 Dec 2017 18:04:03 +0800
Subject: [PATCH] Refine code struct.

---
 paddle/platform/device_context.h |  12 ---
 paddle/platform/profiler.cc      | 149 +++++++++++++++++++++++++------
 paddle/platform/profiler.h       | 131 +++++----------------------
 paddle/platform/profiler_test.cc |  12 +--
 4 files changed, 154 insertions(+), 150 deletions(-)

diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 07e197ba0b..2b366e6383 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
   cudnnHandle_t cudnn_handle_;
 };
 
-class DeviceGuard {
- public:
-  explicit DeviceGuard(int device) {
-    original_device_ = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(device);
-  }
-  ~DeviceGuard() { platform::SetDeviceId(original_device_); }
-
- private:
-  int original_device_;
-};
-
 #endif
 
 /*! \brief device context pool singleton */
diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
index 40b34b732c..4e89e5c600 100644
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -17,34 +17,133 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-ProfilerState kState = ProfilerState::kDisabled;
-uint32_t kNextThreadId = 0;
-std::mutex kAllEventListsMutex;
-std::list<std::shared_ptr<EventList>> kAllEventLists;
-thread_local std::shared_ptr<EventList> kEventList;
-thread_local int32_t kThreadId;
+// The profiler state, the initial value is ProfilerState::kDisabled
+static ProfilerState g_state = ProfilerState::kDisabled;
+// The thread local event list only can be accessed by the specific thread
+// The thread index of each thread
+static thread_local int32_t g_thread_id;
+// g_next_thread_id is a global counter for threads; from g_thread_id and
+// g_next_thread_id we can know how many threads have created an EventList.
+static uint32_t g_next_thread_id = 0;
+// The global mutex
+static std::mutex g_all_event_lists_mutex;
+// The total event lists of all threads
+static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+// The thread local event list only can be accessed by the specific thread
+static thread_local std::shared_ptr<EventList> g_event_list;
+
+inline uint64_t GetTimeInNsec() {
+  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                 std::chrono::high_resolution_clock,
+                                 std::chrono::steady_clock>::type;
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             clock::now().time_since_epoch())
+      .count();
+}
+
+Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+             DeviceContext* dev_ctx)
+    : kind_(kind),
+      name_(std::move(name)),
+      thread_id_(thread_id),
+      has_cuda_(false) {
+#ifdef PADDLE_WITH_CUDA
+  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+  if (cuda_dev_ctx) {
+    PADDLE_ENFORCE(cudaGetDevice(&device_));
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
+    has_cuda_ = true;
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();
+}
+
+std::string Event::kind() const {
+  switch (kind_) {
+    case EventKind::kMark:
+      return "mark";
+    case EventKind::kPushRange:
+      return "push";
+    case EventKind::kPopRange:
+      return "pop";
+  }
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+double Event::CpuElapsedUs(const Event& e) const {
+  return (e.cpu_ns_ - cpu_ns_) / (1000.0);
+}
+
+double Event::CudaElapsedUs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
+  PADDLE_ENFORCE(e.device() == device());
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
+  return ms * 1000.0;
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+#endif
+
+inline EventList& GetEventList() {
+  if (!g_event_list) {
+    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+void Mark(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
+                        dev_ctx);
+}
+
+RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  dev_ctx_ = dev_ctx;
+  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
+                        dev_ctx_);
+}
+
+RecordEvent::~RecordEvent() {
+  if (g_state == ProfilerState::kDisabled) return;
+  GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
+                        dev_ctx_);
+}
 
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(kState == ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
                  "The profiling state should be disabled when calling ",
                  "EnableProfiler.");
-  kState = state;
+  g_state = state;
 #ifdef PADDLE_WITH_CUDA
-  auto ForEachDevice = [](std::function<void(int)> op) {
-    int count = GetCUDADeviceCount();
-    for (int i = 0; i < count; i++) {
-      DeviceGuard dev_guard(i);
-      op(i);
-    }
-  };
-  if (kState == ProfilerState::kCUDA) {
+  if (g_state == ProfilerState::kCUDA) {
     // Generate some dummy evenets first to reduce the startup overhead.
     for (int i = 0; i < 5; i++) {
       ForEachDevice([](int d) {
-        DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d));
+        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
         Mark("_cuda_startup_", dev_ctx);
         dev_ctx->Wait();
       });
@@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) {
   }
 #endif
   // Mark the profiling start.
-  Mark("_start_profiler_");
+  Mark("_start_profiler_", nullptr);
 }
 
 std::vector<std::vector<Event>> DisableProfiler() {
-  PADDLE_ENFORCE(kState != ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
                  "Can't disable profiling, since it's not starting.");
   // Mark the profiling stop.
-  Mark("_stop_profiler_");
-  kState = ProfilerState::kDisabled;
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
   std::vector<std::vector<Event>> result;
-  std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-  for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) {
-    auto& list = *it;
-    result.emplace_back(list->Reduce());
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    result.emplace_back((*it)->Reduce());
   }
   return result;
 }
diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h
index 2242635024..47104ea9d0 100644
--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -24,76 +24,24 @@ namespace platform {
 
 enum EventKind { kMark, kPushRange, kPopRange };
 
-inline uint64_t GetTimeInNsec() {
-  // using std::chrono;
-  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
-                                 std::chrono::high_resolution_clock,
-                                 std::chrono::steady_clock>::type;
-  return std::chrono::duration_cast<std::chrono::nanoseconds>(
-             clock::now().time_since_epoch())
-      .count();
-}
-
 class Event {
  public:
-  // the DeviceContext is used to get the cuda stream.
+  // The DeviceContext is used to get the cuda stream.
+  // In CPU profiling mode, dev_ctx can be nullptr.
   Event(EventKind kind, std::string name, uint32_t thread_id,
-        const platform::DeviceContext* dev_ctx = nullptr)
-      : kind_(kind), name_(std::move(name)), thread_id_(thread_id) {
-    has_cuda_ = false;
-#ifdef PADDLE_WITH_CUDA
-    auto* cuda_dev_ctx =
-        static_cast<const platform::CUDADeviceContext*>(dev_ctx);
-    if (cuda_dev_ctx) {
-      PADDLE_ENFORCE(cudaGetDevice(&device_));
-      PADDLE_ENFORCE(cudaEventCreate(&event_));
-      auto stream = cuda_dev_ctx->stream();
-      PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-      has_cuda_ = true;
-    }
-#endif
-    cpu_ns_ = GetTimeInNsec();
-  }
-
-  std::string kind() const {
-    switch (kind_) {
-      case EventKind::kMark:
-        return "mark";
-      case EventKind::kPushRange:
-        return "push";
-      case EventKind::kPopRange:
-        return "pop";
-    }
-    PADDLE_THROW("Unknown EventKind.");
-  }
+        DeviceContext* dev_ctx);
 
+  std::string kind() const;
   std::string name() const { return name_; }
-
   bool has_cuda() const { return has_cuda_; }
 
 #ifdef PADDLE_WITH_CUDA
   cudaEvent_t event() const { return event_; }
-
   int device() const { return device_; }
 #endif
 
-  double CpuElapsedUs(const Event& e) const {
-    return (e.cpu_ns_ - cpu_ns_) / (1000.0);
-  }
-
-  double CudaElapsedUs(const Event& e) const {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(e.has_cuda() && has_cuda());
-    PADDLE_ENFORCE(e.device() == device());
-    PADDLE_ENFORCE(cudaEventSynchronize(event_));
-    PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
-    float ms;
-    PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
-    return ms * 1000.0;
-#else
-    PADDLE_THROW("CUDA is not enabled");
-#endif
-  }
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
 
  private:
   EventKind kind_;
@@ -108,11 +56,11 @@ class Event {
 };
 
 struct EventList {
-  constexpr static std::size_t kMB = 1024 * 1024;
-  constexpr static std::size_t kEventBlockSize = 16 * kMB;
-  constexpr static std::size_t kEventSize = sizeof(Event);
-  constexpr static std::size_t kEventAlign = alignof(Event);
-  constexpr static std::size_t kNumBlock =
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
       kEventBlockSize /
       ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
 
@@ -139,58 +87,27 @@ struct EventList {
 };
 
 enum ProfilerState {
-  kDisabled,
-  kCPU,
-  kCUDA,
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
 };
 
-// The profiler state, the initial value is ProfilerState::kDisabled
-extern ProfilerState kState;
-// The global mutex
-extern std::mutex kAllEventListsMutex;
-// The total event lists of all threads
-extern std::list<std::shared_ptr<EventList>> kAllEventLists;
-// The thread local event list only can be accessed by the specific thread
-extern thread_local std::shared_ptr<EventList> kEventList;
-// The thread index of each thread
-extern thread_local int32_t kThreadId;
-// The kNextThreadId is a global counter for threads, by the kThreadId and
-// kNextThreadId, we can know how many threads have created EventList.
-extern uint32_t kNextThreadId;
-
-inline EventList& GetEventList() {
-  if (!kEventList) {
-    std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-    kEventList = std::make_shared<EventList>();
-    kThreadId = kNextThreadId++;
-    kAllEventLists.emplace_front(kEventList);
-  }
-  return *kEventList;
-}
-
-inline void Mark(const std::string name,
-                 const platform::DeviceContext* dev_ctx = nullptr) {
-  GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
-}
+void Mark(const std::string& name, DeviceContext* dev_ctx);
 
 struct RecordEvent {
-  explicit RecordEvent(const std::string name,
-                       platform::DeviceContext* dev_ctx = nullptr) {
-    if (kState == ProfilerState::kDisabled) return;
-    dev_ctx_ = dev_ctx;
-    GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
-                          dev_ctx_);
-  }
+  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
 
-  ~RecordEvent() {
-    if (kState == ProfilerState::kDisabled) return;
-    GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId,
-                          dev_ctx_);
-  }
-  platform::DeviceContext* dev_ctx_;
+  ~RecordEvent();
+
+  // The device context is used by Event to get the current cuda stream.
+  DeviceContext* dev_ctx_;
 };
 
+// Enable the profiling function.
 void EnableProfiler(ProfilerState state);
+
+// Return the event list of all threads. Assuming the returned value is named
+// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
 std::vector<std::vector<Event>> DisableProfiler();
 
 }  // namespace platform
diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc
index 5bd0a9d859..47cf7be146 100644
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
   using paddle::platform::Event;
   using paddle::platform::EventKind;
 
-  Event start_event(EventKind::kPushRange, "test", 0);
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
   EXPECT_TRUE(start_event.has_cuda() == false);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventKind::kPopRange, "test", 0);
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
   EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
 }
 
@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
 TEST(Event, CudaElapsedTime) {
   using paddle::platform::DeviceContext;
   using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
   using paddle::platform::Event;
   using paddle::platform::EventKind;
 
-  DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0));
+  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
   Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
   EXPECT_TRUE(start_event.has_cuda() == true);
   int counter = 0;
@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
   DeviceContext* dev_ctx = nullptr;
 #ifdef PADDLE_WITH_CUDA
   using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
   state = ProfilerState::kCUDA;
   dev_ctx =
-      new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0));
+      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
 #endif
   EnableProfiler(state);
-- 
GitLab
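
Note: the following is a minimal, hypothetical usage sketch of the profiler API as refactored by this patch; it is not part of the patch itself. It assumes a CPU-only build (hence the nullptr DeviceContext) and a program that links against paddle/platform. Since pop events are recorded with an empty name, the sketch pairs each "push" with the next "pop", which is sufficient only for non-nested ranges.

// Hypothetical standalone example (not part of this patch): profile a small
// CPU-side loop and print the elapsed time of each recorded range.
#include <cstddef>
#include <iostream>
#include <vector>

#include "paddle/platform/profiler.h"

int main() {
  using namespace paddle::platform;  // NOLINT

  // Move the global profiler state from kDisabled to kCPU.
  EnableProfiler(ProfilerState::kCPU);

  {
    // RecordEvent pushes a range event on construction and pops it on
    // destruction. In CPU profiling mode the DeviceContext can be nullptr.
    RecordEvent record("compute", nullptr);
    volatile int counter = 0;
    while (counter != 1000) counter = counter + 1;
  }

  // DisableProfiler() returns one vector of Events per thread.
  std::vector<std::vector<Event>> events = DisableProfiler();
  for (auto& thread_events : events) {
    for (size_t i = 0; i < thread_events.size(); ++i) {
      if (thread_events[i].kind() != "push") continue;
      // Pop events carry an empty name, so pair each push with the next pop;
      // this is enough here because the ranges are not nested.
      for (size_t j = i + 1; j < thread_events.size(); ++j) {
        if (thread_events[j].kind() == "pop") {
          std::cout << thread_events[i].name() << ": "
                    << thread_events[i].CpuElapsedUs(thread_events[j])
                    << " us" << std::endl;
          break;
        }
      }
    }
  }
  return 0;
}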