Merge pull request #7576 from qingqing01/profiling_py

Refine profiler and expose to Python.

Merge pull request #7576 from qingqing01/profiling_py
Refine profiler and expose to Python.
750299f5 · qingqing01 · GitHub · 42549fb7 · f18016b9 · 750299f5
10 changed file
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -74,7 +74,8 @@ cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
+framework_proto backward glog lod_rank_table profiler)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/place.h"
+#include "paddle/platform/profiler.h"
 DECLARE_bool(do_memory_benchmark);
 DEFINE_bool(check_nan_inf, false,
@@ -117,6 +118,10 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
    VLOG(4) << op->DebugStringEx(local_scope);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
    op->Run(*local_scope, place_);
    VLOG(3) << op->DebugStringEx(local_scope);
    if (FLAGS_do_memory_benchmark) {

--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() {
 }
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
-             DeviceContext* dev_ctx)
+             const DeviceContext* dev_ctx)
    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
 #ifdef PADDLE_WITH_CUDA
+  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
+  if (has_cuda_) {
    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
-  if (cuda_dev_ctx) {
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    auto stream = cuda_dev_ctx->stream();
    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-    has_cuda_ = true;
  }
 #endif
  cpu_ns_ = GetTimeInNsec();
@@ -114,19 +114,20 @@ inline EventList& GetEventList() {
  return *g_event_list;
 }
-void Mark(const std::string& name, DeviceContext* dev_ctx) {
+void Mark(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
 }
-void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
 }
-void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }
-RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+RecordEvent::RecordEvent(const std::string& name,
+                         const DeviceContext* dev_ctx) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
  name_ = name;
@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) {
        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
        Mark("_cuda_startup_", dev_ctx);
        dev_ctx->Wait();
+        delete dev_ctx;
      });
    }
  }
@@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) {
  Mark("_start_profiler_", nullptr);
 }
-std::vector<std::vector<Event>> DisableProfiler() {
+void ResetProfiler() {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
-  // Mark the profiling stop.
-  Mark("_stop_profiler_", nullptr);
-  g_state = ProfilerState::kDisabled;
-  std::vector<std::vector<Event>> result;
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    (*it)->Clear();
+  }
+}
+std::vector<std::vector<Event>> GetAllEvents() {
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  std::vector<std::vector<Event>> result;
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    result.emplace_back((*it)->Reduce());
@@ -178,6 +183,18 @@ std::vector<std::vector<Event>> DisableProfiler() {
  return result;
 }
+void DisableProfiler(EventSortingKey sorted_key) {
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
+                 "Can't disable profiling, since it's not starting.");
+  // Mark the profiling stop.
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
+}
 void ParseEvents(std::vector<std::vector<Event>>& events,
                 EventSortingKey sorted_by) {
  if (g_profiler_place == "") return;
@@ -291,10 +308,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
  }
  // Print report
-  PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
+  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
 }
-void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
+void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
                   std::string& sorted_domain, const size_t name_width,
                   const size_t data_width) {
  // Output header information

--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -29,7 +29,7 @@ class Event {
  // The DeviceContext is used to get the cuda stream.
  // If CPU profiling mode, can pass nullptr.
  Event(EventKind kind, std::string name, uint32_t thread_id,
-        DeviceContext* dev_ctx);
+        const DeviceContext* dev_ctx);
  std::string kind() const;
  std::string name() const { return name_; }
@@ -84,6 +84,8 @@ struct EventList {
    return result;
  }
+  void Clear() { event_blocks.clear(); }
  std::forward_list<std::vector<Event>> event_blocks;
 };
@@ -93,29 +95,26 @@ enum ProfilerState {
  kCUDA,      // GPU profiling state
 };
-void Mark(const std::string& name, DeviceContext* dev_ctx);
+void Mark(const std::string& name, const DeviceContext* dev_ctx);
-void PushEvent(const std::string& name, DeviceContext* dev_ctx);
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
-void PopEvent(const std::string& name, DeviceContext* dev_ctx);
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
 struct RecordEvent {
-  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
+  explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
  ~RecordEvent();
  // The device context is used by Event to get the current cuda stream.
-  DeviceContext* dev_ctx_;
+  const DeviceContext* dev_ctx_;
  // Event name
  std::string name_;
 };
-// Enable the profiling function.
-void EnableProfiler(ProfilerState state);
 // Return the event list of all threads. Asummed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
-std::vector<std::vector<Event>> DisableProfiler();
+std::vector<std::vector<Event>> GetAllEvents();
 // The information of each event given in the profiling report
 struct EventItem {
@@ -130,13 +129,22 @@ struct EventItem {
 // Candidate keys to sort the profiling report
 enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
+// Enable the profiling function.
+void EnableProfiler(ProfilerState state);
+// Clear the g_all_event_lists, which is total event lists of all threads.
+void ResetProfiler();
+void DisableProfiler(EventSortingKey sorted_key);
 // Parse the event list and output the profiling report
 void ParseEvents(std::vector<std::vector<Event>>&,
                 EventSortingKey sorted_by = EventSortingKey::kDefault);
 // Print results
-void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
+void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
                   std::string& sorted_domain, const size_t name_width,
                   const size_t data_width);
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) {
  // Bad Usage:
  PushEvent("event_without_pop", dev_ctx);
  PopEvent("event_without_push", dev_ctx);
-  std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
+  std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
-  // Will remove parsing-related code from test later
-  ParseEvents(events, EventSortingKey::kTotal);
  int cuda_startup_count = 0;
  int start_profiler_count = 0;
-  int stop_profiler_count = 0;
  for (size_t i = 0; i < events.size(); ++i) {
    for (size_t j = 0; j < events[i].size(); ++j) {
      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
      if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
-      if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
      if (events[i][j].name() == "push") {
        EXPECT_EQ(events[i][j + 1].name(), "pop");
 #ifdef PADDLE_WITH_CUDA
@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) {
  }
  EXPECT_EQ(cuda_startup_count % 5, 0);
  EXPECT_EQ(start_profiler_count, 1);
-  EXPECT_EQ(stop_profiler_count, 1);
+  // Will remove parsing-related code from test later
+  DisableProfiler(EventSortingKey::kTotal);
 }
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
 if(WITH_PYTHON)
  cc_library(paddle_pybind SHARED
    SRCS pybind.cc exception.cc protobuf.cc const_value.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune init
+    DEPS pybind python backward proto_desc paddle_memory executor prune init profiler
    ${GLOB_OP_LIB})
  if(NOT APPLE AND NOT ANDROID)
    target_link_libraries(paddle_pybind rt)

--- a/paddle/pybind/protobuf.h
+++ b/paddle/pybind/protobuf.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <Python.h>
 #include <fstream>
 #include <vector>
+#include "paddle/platform/variant.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/platform/profiler.h"
 #include "paddle/pybind/const_value.h"
 #include "paddle/pybind/exception.h"
 #include "paddle/pybind/pybind.h"
@@ -476,6 +477,24 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("nvprof_stop", platform::CudaProfilerStop);
 #endif
+  py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
+      .value("kDisabled", platform::ProfilerState::kDisabled)
+      .value("kCPU", platform::ProfilerState::kCPU)
+      .value("kCUDA", platform::ProfilerState::kCUDA)
+      .export_values();
+  py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
+      .value("kDefault", platform::EventSortingKey::kDefault)
+      .value("kCalls", platform::EventSortingKey::kCalls)
+      .value("kTotal", platform::EventSortingKey::kTotal)
+      .value("kMin", platform::EventSortingKey::kMin)
+      .value("kMax", platform::EventSortingKey::kMax)
+      .value("kAve", platform::EventSortingKey::kAve)
+      .export_values();
+  m.def("enable_profiler", platform::EnableProfiler);
+  m.def("disable_profiler", platform::DisableProfiler);
+  m.def("reset_profiler", platform::ResetProfiler);
  return m.ptr();
 }
 }  // namespace pybind

--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None):
    # Disables profiler collection.
    core.nvprof_stop()
    os.remove(config_file)
+def reset_profiler():
+    """The profiler clear interface.
+    reset_profiler will clear the previous time record.
+    """
+    core.reset_profiler()
+@contextmanager
+def profiler(state, sorted_key=None):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both CPU
+    and GPU program. By defalut, it records the CPU and GPU operator kernels,
+    if you want to profile other program, you can refer the profiling tutorial
+    to add more records.
+    Args:
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
+            would not inherit this place.
+        sorted_key (string) : If None, the profiling results will be printed
+            in the order of first end time of events. Otherwise, the profiling
+            results will be sorted by the this flag. This flag should be one
+            of 'calls', 'total', 'max', 'min' or 'ave'.
+            The `calls` means sorting by the number of calls.
+            The `total` means sorting by the total execution time.
+            The `max` means sorting by the maximum execution time.
+            The `min` means sorting by the minimum execution time.
+            The `ave` means sorting by the average execution time.
+    """
+    if state not in ['CPU', 'GPU']:
+        raise ValueError("The state must be 'CPU' or 'GPU'.")
+    prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU
+    core.enable_profiler(prof_state)
+    yield
+    if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The state must be in 'calls', 'total', "
+                         "'max', 'min', 'ave'")
+    sorted_key = 'default' if sorted_key is None else sorted_key
+    key_map = {
+        'default': core.EventSortingKey.kDefault,
+        'calls': core.EventSortingKey.kCalls,
+        'total': core.EventSortingKey.kTotal,
+        'max': core.EventSortingKey.kMax,
+        'min': core.EventSortingKey.kMin,
+        'ave': core.EventSortingKey.kAve,
+    }
+    # TODO(qingqing) : redirect C++ ostream to Python stream.
+    # with core.ostream_redirect(stdout=True, stderr=True):
+    core.disable_profiler(key_map[sorted_key])
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -13,11 +13,12 @@
 # limitations under the License.
 import unittest
+import os
 import numpy as np
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.profiler as profiler
 import paddle.v2.fluid.layers as layers
-import os
+import paddle.v2.fluid.core as core
 class TestProfiler(unittest.TestCase):
@@ -40,6 +41,50 @@ class TestProfiler(unittest.TestCase):
                exe.run(fluid.default_main_program(), feed={'data': input})
        os.remove(output_file)
+    def net_profiler(self, state):
+        if state == 'GPU' and not core.is_compile_gpu():
+            return
+        startup_program = fluid.Program()
+        main_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
+            hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
+            hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+            predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
+            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+        optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+        opts = optimizer.minimize(avg_cost, startup_program=startup_program)
+        place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(startup_program)
+        accuracy.reset(exe)
+        with profiler.profiler(state, 'total') as prof:
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                x = np.random.random((32, 784)).astype("float32")
+                y = np.random.randint(0, 10, (32, 1)).astype("int64")
+                outs = exe.run(main_program,
+                               feed={'x': x,
+                                     'y': y},
+                               fetch_list=[avg_cost] + accuracy.metrics)
+                acc = np.array(outs[1])
+                pass_acc = accuracy.eval(exe)
+    def test_cpu_profiler(self):
+        self.net_profiler('CPU')
+    def test_cuda_profiler(self):
+        self.net_profiler('GPU')
 if __name__ == '__main__':
    unittest.main()