Refine code structure.

0a5fbb06 · dangqingqing · f03e73c8 · 0a5fbb06 · 0a5fbb06 · 0a5fbb06
4 changed files
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
  cudnnHandle_t cudnn_handle_;
 };

-class DeviceGuard {
- public:
-  explicit DeviceGuard(int device) {
-    original_device_ = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(device);
-  }
-  ~DeviceGuard() { platform::SetDeviceId(original_device_); }
-
- private:
-  int original_device_;
-};
-
 #endif

 /*! \brief device context pool singleton */

--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -17,34 +17,133 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

-ProfilerState kState = ProfilerState::kDisabled;
-uint32_t kNextThreadId = 0;
-std::mutex kAllEventListsMutex;
-std::list<std::shared_ptr<EventList>> kAllEventLists;
-thread_local std::shared_ptr<EventList> kEventList;
-thread_local int32_t kThreadId;
+// The profiler state, the initial value is ProfilerState::kDisabled
+static ProfilerState g_state = ProfilerState::kDisabled;
+// The index of the current thread. It is assigned from g_next_thread_id when
+// the thread first creates its EventList.
+static thread_local int32_t g_thread_id;
+// g_next_thread_id is a global counter of threads. Together with g_thread_id,
+// it tells us how many threads have created an EventList.
+static uint32_t g_next_thread_id = 0;
+// The global mutex
+static std::mutex g_all_event_lists_mutex;
+// The total event lists of all threads
+static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+// The thread-local event list, which can only be accessed by its own thread
+static thread_local std::shared_ptr<EventList> g_event_list;
+
+inline uint64_t GetTimeInNsec() {
+  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                 std::chrono::high_resolution_clock,
+                                 std::chrono::steady_clock>::type;
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             clock::now().time_since_epoch())
+      .count();
+}
+
+Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+             DeviceContext* dev_ctx)
+    : kind_(kind),
+      name_(std::move(name)),
+      thread_id_(thread_id),
+      has_cuda_(false) {
+#ifdef PADDLE_WITH_CUDA
+  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+  if (cuda_dev_ctx) {
+    PADDLE_ENFORCE(cudaGetDevice(&device_));
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
+    has_cuda_ = true;
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();
+}
+
+std::string Event::kind() const {
+  switch (kind_) {
+    case EventKind::kMark:
+      return "mark";
+    case EventKind::kPushRange:
+      return "push";
+    case EventKind::kPopRange:
+      return "pop";
+  }
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+double Event::CpuElapsedUs(const Event& e) const {
+  return (e.cpu_ns_ - cpu_ns_) / (1000.0);
+}
+
+double Event::CudaElapsedUs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
+  PADDLE_ENFORCE(e.device() == device());
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
+  return ms * 1000.0;
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+#endif
+
+inline EventList& GetEventList() {
+  if (!g_event_list) {
+    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+void Mark(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
+                        dev_ctx);
+}
+
+RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  dev_ctx_ = dev_ctx;
+  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
+                        dev_ctx_);
+}
+
+RecordEvent::~RecordEvent() {
+  if (g_state == ProfilerState::kDisabled) return;
+  GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
+                        dev_ctx_);
+}

 void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                 "Can't enbale profling, since the input state is ",
                 "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(kState == ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
                 "The profiling state should be disabled when calling ",
                 "EnableProfiler.");
-  kState = state;
+  g_state = state;
 #ifdef PADDLE_WITH_CUDA
-  auto ForEachDevice = [](std::function<void(int)> op) {
-    int count = GetCUDADeviceCount();
-    for (int i = 0; i < count; i++) {
-      DeviceGuard dev_guard(i);
-      op(i);
-    }
-  };
-  if (kState == ProfilerState::kCUDA) {
+  if (g_state == ProfilerState::kCUDA) {
    // Generate some dummy evenets first to reduce the startup overhead.
    for (int i = 0; i < 5; i++) {
      ForEachDevice([](int d) {
-        DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d));
+        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
        Mark("_cuda_startup_", dev_ctx);
        dev_ctx->Wait();
      });
@@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) {
  }
 #endif
  // Mark the profiling start.
-  Mark("_start_profiler_");
+  Mark("_start_profiler_", nullptr);
 }

 std::vector<std::vector<Event>> DisableProfiler() {
-  PADDLE_ENFORCE(kState != ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
                 "Can't disable profiling, since it's not starting.");
  // Mark the profiling stop.
-  Mark("_stop_profiler_");
-  kState = ProfilerState::kDisabled;
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
  std::vector<std::vector<Event>> result;
-  std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-  for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) {
-    auto& list = *it;
-    result.emplace_back(list->Reduce());
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    result.emplace_back((*it)->Reduce());
  }
  return result;
 }

--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -24,76 +24,24 @@ namespace platform {

 enum EventKind { kMark, kPushRange, kPopRange };

-inline uint64_t GetTimeInNsec() {
-  // using std::chrono;
-  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
-                                 std::chrono::high_resolution_clock,
-                                 std::chrono::steady_clock>::type;
-  return std::chrono::duration_cast<std::chrono::nanoseconds>(
-             clock::now().time_since_epoch())
-      .count();
-}
-
 class Event {
 public:
-  // the DeviceContext is used to get the cuda stream.
+  // The DeviceContext is used to get the cuda stream.
+  // In CPU profiling mode, nullptr can be passed.
  Event(EventKind kind, std::string name, uint32_t thread_id,
-        const platform::DeviceContext* dev_ctx = nullptr)
-      : kind_(kind), name_(std::move(name)), thread_id_(thread_id) {
-    has_cuda_ = false;
-#ifdef PADDLE_WITH_CUDA
-    auto* cuda_dev_ctx =
-        static_cast<const platform::CUDADeviceContext*>(dev_ctx);
-    if (cuda_dev_ctx) {
-      PADDLE_ENFORCE(cudaGetDevice(&device_));
-      PADDLE_ENFORCE(cudaEventCreate(&event_));
-      auto stream = cuda_dev_ctx->stream();
-      PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-      has_cuda_ = true;
-    }
-#endif
-    cpu_ns_ = GetTimeInNsec();
-  }
-
-  std::string kind() const {
-    switch (kind_) {
-      case EventKind::kMark:
-        return "mark";
-      case EventKind::kPushRange:
-        return "push";
-      case EventKind::kPopRange:
-        return "pop";
-    }
-    PADDLE_THROW("Unknown EventKind.");
-  }
+        DeviceContext* dev_ctx);

+  std::string kind() const;
  std::string name() const { return name_; }
-
  bool has_cuda() const { return has_cuda_; }

 #ifdef PADDLE_WITH_CUDA
  cudaEvent_t event() const { return event_; }
-
  int device() const { return device_; }
 #endif

-  double CpuElapsedUs(const Event& e) const {
-    return (e.cpu_ns_ - cpu_ns_) / (1000.0);
-  }
-
-  double CudaElapsedUs(const Event& e) const {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(e.has_cuda() && has_cuda());
-    PADDLE_ENFORCE(e.device() == device());
-    PADDLE_ENFORCE(cudaEventSynchronize(event_));
-    PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
-    float ms;
-    PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
-    return ms * 1000.0;
-#else
-    PADDLE_THROW("CUDA is not enabled");
-#endif
-  }
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;

 private:
  EventKind kind_;
@@ -108,11 +56,11 @@ class Event {
 };

 struct EventList {
-  constexpr static std::size_t kMB = 1024 * 1024;
-  constexpr static std::size_t kEventBlockSize = 16 * kMB;
-  constexpr static std::size_t kEventSize = sizeof(Event);
-  constexpr static std::size_t kEventAlign = alignof(Event);
-  constexpr static std::size_t kNumBlock =
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

@@ -139,58 +87,27 @@ struct EventList {
 };

 enum ProfilerState {
-  kDisabled,
-  kCPU,
-  kCUDA,
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
 };

-// The profiler state, the initial value is ProfilerState::kDisabled
-extern ProfilerState kState;
-// The global mutex
-extern std::mutex kAllEventListsMutex;
-// The total event lists of all threads
-extern std::list<std::shared_ptr<EventList>> kAllEventLists;
-// The thread local event list only can be accessed by the specific thread
-extern thread_local std::shared_ptr<EventList> kEventList;
-// The thread index of each thread
-extern thread_local int32_t kThreadId;
-// The kNextThreadId is a global counter for threads, by the kThreadId and
-// kNextThreadId, we can know how many threads have created EventList.
-extern uint32_t kNextThreadId;
-
-inline EventList& GetEventList() {
-  if (!kEventList) {
-    std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-    kEventList = std::make_shared<EventList>();
-    kThreadId = kNextThreadId++;
-    kAllEventLists.emplace_front(kEventList);
-  }
-  return *kEventList;
-}
-
-inline void Mark(const std::string name,
-                 const platform::DeviceContext* dev_ctx = nullptr) {
-  GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
-}
+void Mark(const std::string& name, DeviceContext* dev_ctx);

 struct RecordEvent {
-  explicit RecordEvent(const std::string name,
-                       platform::DeviceContext* dev_ctx = nullptr) {
-    if (kState == ProfilerState::kDisabled) return;
-    dev_ctx_ = dev_ctx;
-    GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
-                          dev_ctx_);
-  }
+  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);

-  ~RecordEvent() {
-    if (kState == ProfilerState::kDisabled) return;
-    GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId,
-                          dev_ctx_);
-  }
-  platform::DeviceContext* dev_ctx_;
+  ~RecordEvent();
+
+  // The device context is used by Event to get the current cuda stream.
+  DeviceContext* dev_ctx_;
 };

+// Enable the profiling function.
 void EnableProfiler(ProfilerState state);
+
+// Return the event lists of all threads. Assuming the returned value is called
+// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
 std::vector<std::vector<Event>> DisableProfiler();

 }  // namespace platform

--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
  using paddle::platform::Event;
  using paddle::platform::EventKind;

-  Event start_event(EventKind::kPushRange, "test", 0);
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
  EXPECT_TRUE(start_event.has_cuda() == false);
  int counter = 0;
  while (counter != 1000) {
    counter++;
  }
-  Event stop_event(EventKind::kPopRange, "test", 0);
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
  EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
 }

@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
 TEST(Event, CudaElapsedTime) {
  using paddle::platform::DeviceContext;
  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
  using paddle::platform::Event;
  using paddle::platform::EventKind;

-  DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0));
+  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
  EXPECT_TRUE(start_event.has_cuda() == true);
  int counter = 0;
@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
  DeviceContext* dev_ctx = nullptr;
 #ifdef PADDLE_WITH_CUDA
  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
  state = ProfilerState::kCUDA;
  dev_ctx =
-      new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0));
+      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
 #endif
  EnableProfiler(state);