From 0a5fbb06508731aa55ffda3e4a68a9fabff2a72a Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Fri, 29 Dec 2017 18:04:03 +0800
Subject: [PATCH] Refine code struct.

---
 paddle/platform/device_context.h |  12 ---
 paddle/platform/profiler.cc      | 149 +++++++++++++++++++++++++------
 paddle/platform/profiler.h       | 131 +++++----------------------
 paddle/platform/profiler_test.cc |  12 +--
 4 files changed, 154 insertions(+), 150 deletions(-)
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 07e197ba0b3..2b366e6383d 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
   cudnnHandle_t cudnn_handle_;
 };
 
-class DeviceGuard {
- public:
-  explicit DeviceGuard(int device) {
-    original_device_ = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(device);
-  }
-  ~DeviceGuard() { platform::SetDeviceId(original_device_); }
-
- private:
-  int original_device_;
-};
-
 #endif
 
 /*! \brief device context pool singleton */
diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
index 40b34b732c9..4e89e5c600b 100644
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -17,34 +17,133 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-ProfilerState kState = ProfilerState::kDisabled;
-uint32_t kNextThreadId = 0;
-std::mutex kAllEventListsMutex;
-std::list<std::shared_ptr<EventList>> kAllEventLists;
-thread_local std::shared_ptr<EventList> kEventList;
-thread_local int32_t kThreadId;
+// The profiler state, the initial value is ProfilerState::kDisabled
+static ProfilerState g_state = ProfilerState::kDisabled;
+// The index of the current thread; it is assigned from g_next_thread_id the
+// first time the thread calls GetEventList().
+static thread_local int32_t g_thread_id;
+// g_next_thread_id is a global counter of threads. Together with g_thread_id,
+// it tells us how many threads have created an EventList.
+static uint32_t g_next_thread_id = 0;
+// The global mutex guarding g_all_event_lists and g_next_thread_id.
+static std::mutex g_all_event_lists_mutex;
+// The total event lists of all threads
+static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+// The thread-local event list; it can only be accessed by the owning thread.
+static thread_local std::shared_ptr<EventList> g_event_list;
+
+inline uint64_t GetTimeInNsec() {
+  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                 std::chrono::high_resolution_clock,
+                                 std::chrono::steady_clock>::type;
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             clock::now().time_since_epoch())
+      .count();
+}
+
+Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+             DeviceContext* dev_ctx)
+    : kind_(kind),
+      name_(std::move(name)),
+      thread_id_(thread_id),
+      has_cuda_(false) {
+#ifdef PADDLE_WITH_CUDA
+  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+  if (cuda_dev_ctx) {
+    PADDLE_ENFORCE(cudaGetDevice(&device_));
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
+    has_cuda_ = true;
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();
+}
+
+std::string Event::kind() const {
+  switch (kind_) {
+    case EventKind::kMark:
+      return "mark";
+    case EventKind::kPushRange:
+      return "push";
+    case EventKind::kPopRange:
+      return "pop";
+  }
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+double Event::CpuElapsedUs(const Event& e) const {
+  return (e.cpu_ns_ - cpu_ns_) / (1000.0);
+}
+
+double Event::CudaElapsedUs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
+  PADDLE_ENFORCE(e.device() == device());
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
+  return ms * 1000.0;
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+#endif
+
+inline EventList& GetEventList() {
+  if (!g_event_list) {
+    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+void Mark(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
+                        dev_ctx);
+}
+
+RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  dev_ctx_ = dev_ctx;
+  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
+                        dev_ctx_);
+}
+
+RecordEvent::~RecordEvent() {
+  if (g_state == ProfilerState::kDisabled) return;
+  GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
+                        dev_ctx_);
+}
 
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(kState == ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
                  "The profiling state should be disabled when calling ",
                  "EnableProfiler.");
-  kState = state;
+  g_state = state;
 #ifdef PADDLE_WITH_CUDA
-  auto ForEachDevice = [](std::function<void(int)> op) {
-    int count = GetCUDADeviceCount();
-    for (int i = 0; i < count; i++) {
-      DeviceGuard dev_guard(i);
-      op(i);
-    }
-  };
-  if (kState == ProfilerState::kCUDA) {
+  if (g_state == ProfilerState::kCUDA) {
     // Generate some dummy evenets first to reduce the startup overhead.
     for (int i = 0; i < 5; i++) {
       ForEachDevice([](int d) {
-        DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d));
+        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
         Mark("_cuda_startup_", dev_ctx);
         dev_ctx->Wait();
       });
@@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) {
   }
 #endif
   // Mark the profiling start.
-  Mark("_start_profiler_");
+  Mark("_start_profiler_", nullptr);
 }
 
 std::vector<std::vector<Event>> DisableProfiler() {
-  PADDLE_ENFORCE(kState != ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
                  "Can't disable profiling, since it's not starting.");
   // Mark the profiling stop.
-  Mark("_stop_profiler_");
-  kState = ProfilerState::kDisabled;
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
   std::vector<std::vector<Event>> result;
-  std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-  for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) {
-    auto& list = *it;
-    result.emplace_back(list->Reduce());
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    result.emplace_back((*it)->Reduce());
   }
   return result;
 }
diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h
index 2242635024c..47104ea9d08 100644
--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -24,76 +24,24 @@ namespace platform {
 
 enum EventKind { kMark, kPushRange, kPopRange };
 
-inline uint64_t GetTimeInNsec() {
-  // using std::chrono;
-  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
-                                 std::chrono::high_resolution_clock,
-                                 std::chrono::steady_clock>::type;
-  return std::chrono::duration_cast<std::chrono::nanoseconds>(
-             clock::now().time_since_epoch())
-      .count();
-}
-
 class Event {
  public:
-  // the DeviceContext is used to get the cuda stream.
+  // The DeviceContext is used to get the cuda stream.
+  // In CPU profiling mode, nullptr can be passed.
   Event(EventKind kind, std::string name, uint32_t thread_id,
-        const platform::DeviceContext* dev_ctx = nullptr)
-      : kind_(kind), name_(std::move(name)), thread_id_(thread_id) {
-    has_cuda_ = false;
-#ifdef PADDLE_WITH_CUDA
-    auto* cuda_dev_ctx =
-        static_cast<const platform::CUDADeviceContext*>(dev_ctx);
-    if (cuda_dev_ctx) {
-      PADDLE_ENFORCE(cudaGetDevice(&device_));
-      PADDLE_ENFORCE(cudaEventCreate(&event_));
-      auto stream = cuda_dev_ctx->stream();
-      PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-      has_cuda_ = true;
-    }
-#endif
-    cpu_ns_ = GetTimeInNsec();
-  }
-
-  std::string kind() const {
-    switch (kind_) {
-      case EventKind::kMark:
-        return "mark";
-      case EventKind::kPushRange:
-        return "push";
-      case EventKind::kPopRange:
-        return "pop";
-    }
-    PADDLE_THROW("Unknown EventKind.");
-  }
+        DeviceContext* dev_ctx);
 
+  std::string kind() const;
   std::string name() const { return name_; }
-
   bool has_cuda() const { return has_cuda_; }
 
 #ifdef PADDLE_WITH_CUDA
   cudaEvent_t event() const { return event_; }
-
   int device() const { return device_; }
 #endif
 
-  double CpuElapsedUs(const Event& e) const {
-    return (e.cpu_ns_ - cpu_ns_) / (1000.0);
-  }
-
-  double CudaElapsedUs(const Event& e) const {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(e.has_cuda() && has_cuda());
-    PADDLE_ENFORCE(e.device() == device());
-    PADDLE_ENFORCE(cudaEventSynchronize(event_));
-    PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
-    float ms;
-    PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
-    return ms * 1000.0;
-#else
-    PADDLE_THROW("CUDA is not enabled");
-#endif
-  }
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
 
  private:
   EventKind kind_;
@@ -108,11 +56,11 @@ class Event {
 };
 
 struct EventList {
-  constexpr static std::size_t kMB = 1024 * 1024;
-  constexpr static std::size_t kEventBlockSize = 16 * kMB;
-  constexpr static std::size_t kEventSize = sizeof(Event);
-  constexpr static std::size_t kEventAlign = alignof(Event);
-  constexpr static std::size_t kNumBlock =
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
       kEventBlockSize /
       ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
 
@@ -139,58 +87,27 @@ struct EventList {
 };
 
 enum ProfilerState {
-  kDisabled,
-  kCPU,
-  kCUDA,
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
 };
 
-// The profiler state, the initial value is ProfilerState::kDisabled
-extern ProfilerState kState;
-// The global mutex
-extern std::mutex kAllEventListsMutex;
-// The total event lists of all threads
-extern std::list<std::shared_ptr<EventList>> kAllEventLists;
-// The thread local event list only can be accessed by the specific thread
-extern thread_local std::shared_ptr<EventList> kEventList;
-// The thread index of each thread
-extern thread_local int32_t kThreadId;
-// The kNextThreadId is a global counter for threads, by the kThreadId and
-// kNextThreadId, we can know how many threads have created EventList.
-extern uint32_t kNextThreadId;
-
-inline EventList& GetEventList() {
-  if (!kEventList) {
-    std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-    kEventList = std::make_shared<EventList>();
-    kThreadId = kNextThreadId++;
-    kAllEventLists.emplace_front(kEventList);
-  }
-  return *kEventList;
-}
-
-inline void Mark(const std::string name,
-                 const platform::DeviceContext* dev_ctx = nullptr) {
-  GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
-}
+void Mark(const std::string& name, DeviceContext* dev_ctx);
 
 struct RecordEvent {
-  explicit RecordEvent(const std::string name,
-                       platform::DeviceContext* dev_ctx = nullptr) {
-    if (kState == ProfilerState::kDisabled) return;
-    dev_ctx_ = dev_ctx;
-    GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
-                          dev_ctx_);
-  }
+  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
 
-  ~RecordEvent() {
-    if (kState == ProfilerState::kDisabled) return;
-    GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId,
-                          dev_ctx_);
-  }
-  platform::DeviceContext* dev_ctx_;
+  ~RecordEvent();
+
+  // The device context is used by Event to get the current cuda stream.
+  DeviceContext* dev_ctx_;
 };
 
+// Enable the profiling function.
 void EnableProfiler(ProfilerState state);
+
+// Return the event lists of all threads. Assuming the returned value is called
+// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
 std::vector<std::vector<Event>> DisableProfiler();
 
 }  // namespace platform
diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc
index 5bd0a9d8599..47cf7be1461 100644
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
   using paddle::platform::Event;
   using paddle::platform::EventKind;
 
-  Event start_event(EventKind::kPushRange, "test", 0);
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
   EXPECT_TRUE(start_event.has_cuda() == false);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventKind::kPopRange, "test", 0);
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
   EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
 }
 
@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
 TEST(Event, CudaElapsedTime) {
   using paddle::platform::DeviceContext;
   using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
   using paddle::platform::Event;
   using paddle::platform::EventKind;
 
-  DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0));
+  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
   Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
   EXPECT_TRUE(start_event.has_cuda() == true);
   int counter = 0;
@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
   DeviceContext* dev_ctx = nullptr;
 #ifdef PADDLE_WITH_CUDA
   using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
   state = ProfilerState::kCUDA;
   dev_ctx =
-      new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0));
+      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
 #endif
   EnableProfiler(state);
 
-- 
GitLab