Unverified · Commit c65c6ae5 · authored by wangchaochaohu · committed by GitHub

add flag to control profile level in python API (#22319)

* add python flag to control profile level test=develop
Parent: fce37bc5
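A minimal usage sketch of the new flag, based on the Python API added below (run_model is a placeholder for the user's own training or inference loop):

    import paddle.fluid.profiler as profiler

    # 'Default' prints per-op-type results; 'OpDetail' adds per-stage detail
    # (data transform, compute); 'AllOpDetail' breaks results down per op name.
    with profiler.profiler('CPU', sorted_key='total',
                           profile_path='/tmp/profile',
                           tracer_option='OpDetail'):
        run_model()  # hypothetical; profile whatever executes here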
paddle/fluid/framework/operator.cc
@@ -167,7 +167,13 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   }
   {
-    platform::RecordEvent record_event(Type());
+    // TODO(wangchaochaohu): refine the code to use only one RecordEvent.
+    // In order to record both the cost per op type and the cost per op
+    // name, we set up two events.
+    platform::RecordEvent op_type_record_event(Type());
+    auto op_name = platform::OpName(outputs_, Type());
+    platform::RecordEvent op_name_record_event(
+        op_name, platform::RecordRole::kUniqueOp);
     RunImpl(scope, place);
   }
@@ -950,7 +956,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   std::vector<std::string> transfered_inplace_vars;
   Scope* transfer_scope = nullptr;
   {
-    platform::RecordEvent record_event("prepare_data_inner_op");
+    platform::RecordEvent record_event("prepare_data",
+                                       platform::RecordRole::kInnerOp);
     transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars,
                                  runtime_ctx);
   }
@@ -963,7 +970,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
   if (!all_kernels_must_compute_runtime_shape_) {
-    platform::RecordEvent record_event("infer_shape_inner_op");
+    platform::RecordEvent record_event("infer_shape",
+                                       platform::RecordRole::kInnerOp);
     RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
     this->InferShape(&infer_shape_ctx);
   }
@@ -975,7 +983,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
   {
-    platform::RecordEvent record_event("compute_inner_op");
+    platform::RecordEvent record_event("compute",
+                                       platform::RecordRole::kInnerOp);
     (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
                                      kernel_configs));
   }
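These two nested RecordEvent scopes, plus the kInnerOp events inside RunImpl, give each op run up to three layers of timing. A rough sketch of the hierarchy under the 'AllOpDetail' option (names are illustrative; the unique per-op name is built by OpName and later shortened by DealWithShowName, both shown further down):

    conv2d                 # op-type event (kOrdinary)
        conv2d0            # per-op-name event (kUniqueOp)
            prepare_data   # inner-op events (kInnerOp)
            infer_shape
            compute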
paddle/fluid/platform/profiler.cc
@@ -42,6 +42,7 @@ static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
 std::mutex profiler_mu;
+static TracerOption g_tracer_option = TracerOption::kDefault;
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
 // The thread local event list only can be accessed by the specific thread
@@ -137,11 +138,15 @@ void PopEvent(const std::string &name) {
   GetEventList().Record(EventType::kPopRange, name, g_thread_id);
 }

-RecordEvent::RecordEvent(const std::string &name)
-    : is_enabled_(false), start_ns_(PosixInNsec()) {
+RecordEvent::RecordEvent(const std::string &name, const RecordRole role)
+    : is_enabled_(false), start_ns_(PosixInNsec()), role_(role) {
   if (g_state == ProfilerState::kDisabled || name.empty()) return;
+  if ((g_tracer_option == TracerOption::kOpDetail &&
+       role_ != RecordRole::kInnerOp) ||
+      (g_tracer_option == TracerOption::kDefault &&
+       role != RecordRole::kOrdinary))
+    return;
   // lock is not needed, the code below is thread-safe
   is_enabled_ = true;
   Event *e = PushEvent(name);
   // Maybe need the same push/pop behavior.
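The early return just added is the entire level switch. A pure-Python model of the predicate (not a Paddle API, just the constructor's decision restated):

    def is_recorded(tracer_option, role):
        # mirrors the early return in RecordEvent's constructor
        if tracer_option == 'kOpDetail' and role != 'kInnerOp':
            return False
        if tracer_option == 'kDefault' and role != 'kOrdinary':
            return False
        return True  # under 'kAllOpDetail' every role is recorded

So kDefault keeps only the per-op-type (kOrdinary) events, kOpDetail keeps only the inner prepare_data/infer_shape/compute (kInnerOp) events, and kAllOpDetail keeps everything.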
@@ -363,30 +368,21 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
               << "Max." << std::setw(data_width) << "Ave."
               << std::setw(data_width) << "Ratio." << std::endl;
   }
-  if (print_depth >= 100) return;
-  int print_depth_next = print_depth;
+  if (events_table.size() <= 0) return;
   for (size_t i = 0; i < events_table.size(); ++i) {
     for (size_t j = 0; j < events_table[i].size(); ++j) {
       auto event_item = events_table[i][j];
       std::vector<std::vector<EventItem>> child_table;
       std::vector<EventItem> table;
-      bool do_next = false;
-      std::string op_end_str = "inner_op";
       for (auto it = child_map.begin(); it != child_map.end(); it++) {
         if (it->first == event_item.name) {
           table.push_back(it->second);
-          if (!do_next)
-            do_next = !(it->second.name.rfind(op_end_str) ==
-                        (it->second.name.length() - op_end_str.length()));
         }
       }
       child_table.push_back(table);
-      if (do_next)
-        print_depth_next = print_depth + 1;
-      else
-        print_depth_next = print_depth + 100;
       auto name_len = event_item.name.length();
       std::string print_name = event_item.name.substr(remove_len, name_len);
       std::string delimiter;
@@ -394,10 +390,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
         delimiter = "  " + delimiter;
       }
       print_name = delimiter + print_name;
-      size_t pos = 0;
-      while ((pos = print_name.find(op_end_str)) != std::string::npos) {
-        print_name.erase(pos, op_end_str.length());
-      }
       std::cout << std::setw(name_width) << print_name << std::setw(data_width)
                 << event_item.calls << std::setw(data_width)
                 << event_item.total_time;
@@ -416,14 +409,14 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
                 << std::setw(data_width) << event_item.ave_time
                 << std::setw(data_width) << event_item.ratio << std::endl;
       PrintProfiler(child_table, child_map, sorted_domain, name_width,
-                    data_width, merge_thread, print_depth_next, 0);
+                    data_width, merge_thread, print_depth + 1, 0);
     }
   }
 }

 std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
     EventSortingKey sorted_by, std::string *domain) {
-  auto sorted_domain = *domain;
+  std::string sorted_domain;
   std::function<bool(const EventItem &, const EventItem &)> sorted_func;
   switch (sorted_by) {
     case EventSortingKey::kCalls:
@@ -471,6 +464,7 @@ std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
     default:
       sorted_domain = "event first end time";
   }
+  *domain = sorted_domain;
   return sorted_func;
 }
@@ -709,6 +703,45 @@ void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
   PrintMemProfiler(annotation_report, 55, 18);
 }

+void DealWithShowName() {
+  std::unordered_map<std::string, int> prefix_name;
+  std::vector<std::string> op_out_name;
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    for (auto &block : (*it)->event_blocks) {
+      for (auto &r : block) {
+        auto event_name = r.name();
+        size_t start = event_name.find('%', 0);
+        size_t end = event_name.find('%', start + 1);
+        std::string prefix_str = event_name.substr(0, start);
+        while (start != std::string::npos && end != std::string::npos) {
+          auto search_str = event_name.substr(start, end - start + 1);
+          auto it = find(op_out_name.begin(), op_out_name.end(), search_str);
+          std::string replace_str;
+          bool prefix_find = true;
+          if (prefix_name.find(prefix_str) == prefix_name.end()) {
+            prefix_find = false;
+            prefix_name[prefix_str] = 0;
+          }
+          if (it == op_out_name.end()) {
+            if (prefix_find)
+              prefix_name[prefix_str] = prefix_name[prefix_str] + 1;
+            op_out_name.push_back(search_str);
+          }
+          replace_str = std::to_string(prefix_name[prefix_str]);
+          event_name.replace(start, end - start + 1, replace_str);
+          start = start + 1;
+          start = event_name.find('%', start);
+          end = event_name.find('%', start + 1);
+          prefix_str = event_name.substr(0, start);
+        }
+        r.set_name(event_name);
+      }
+    }
+  }
+}
+
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string &profile_path) {
   SynchronizeAllDevice();
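DealWithShowName rewrites each '%...%' placeholder that OpName embeds in an event name into a small per-op-type counter, so repeated instances of the same op type get short, distinct display names. A simplified standalone model of the renaming (it handles the single placeholder OpName actually emits; the C++ loop also copes with several per name):

    def shorten(event_names):
        prefix_counter = {}      # op type prefix -> last assigned index
        seen_placeholders = []   # '%...%' strings already assigned an index
        renamed = []
        for name in event_names:
            start = name.find('%')
            end = name.find('%', start + 1)
            if start == -1 or end == -1:  # no placeholder: keep as is
                renamed.append(name)
                continue
            prefix = name[:start]
            placeholder = name[start:end + 1]
            known_prefix = prefix in prefix_counter
            if not known_prefix:
                prefix_counter[prefix] = 0
            if placeholder not in seen_placeholders:
                if known_prefix:
                    prefix_counter[prefix] += 1
                seen_placeholders.append(placeholder)
            renamed.append(prefix + str(prefix_counter[prefix]) + name[end + 1:])
        return renamed

    # e.g. shorten(['fc%fc_0.tmp_1%', 'fc%fc_0.tmp_1%', 'fc%fc_1.tmp_1%'])
    #      -> ['fc0', 'fc0', 'fc1']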
@@ -718,37 +751,17 @@ void DisableProfiler(EventSortingKey sorted_key,
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_");
+  DealWithShowName();
   DeviceTracer *tracer = GetDeviceTracer();
   if (tracer->IsEnabled()) {
     tracer->Disable();
-    tracer->GenProfile(profile_path);
     tracer->GenEventKernelCudaElapsedTime();
+    tracer->GenProfile(profile_path);
   }
   std::vector<std::vector<Event>> all_events = GetAllEvents();
-  std::vector<std::string> op_name;
-  for (size_t i = 0; i < (all_events).size(); i++) {
-    for (size_t j = 0; j < (all_events)[i].size(); j++) {
-      std::string event_name = (all_events)[i][j].name();
-      size_t start = event_name.find('%', 0);
-      size_t end = event_name.rfind('%', event_name.length() - 1);
-      if (start == std::string::npos || end == std::string::npos) continue;
-      std::string search_str = event_name.substr(start, end - start + 1);
-      auto it = find(op_name.begin(), op_name.end(), search_str);
-      std::string replace_str;
-      if (it != op_name.end()) {
-        replace_str = std::to_string(std::distance(op_name.begin(), it));
-      } else {
-        replace_str = std::to_string(op_name.size());
-        op_name.push_back(search_str);
-      }
-      event_name.replace(start, end - start + 1, replace_str);
-      (all_events)[i][j].set_name(event_name);
-    }
-  }
   ParseEvents(all_events, true, sorted_key);
   ParseEvents(all_events, false, sorted_key);
   if (VLOG_IS_ON(5)) {
@@ -773,5 +786,30 @@ void SetProfileListener() {
 }

 int64_t ListenerId() { return profiler_lister_id; }

+std::string OpName(const framework::VariableNameMap &name_map,
+                   const std::string &type_name) {
+  if (platform::GetTracerOption() != platform::TracerOption::kAllOpDetail)
+    return "";
+  std::string ret = type_name + "%";
+  for (auto it = name_map.begin(); it != name_map.end(); it++) {
+    auto name_outputs = it->second;
+    if (!name_outputs.empty() &&
+        type_name.length() < name_outputs[0].length()) {
+      ret = ret + name_outputs[0];
+      break;
+    }
+  }
+  ret = ret + "%";
+  return ret;
+}
+
+void SetTracerOption(TracerOption option) {
+  std::lock_guard<std::mutex> l(profiler_mu);
+  g_tracer_option = option;
+}
+
+platform::TracerOption GetTracerOption() { return g_tracer_option; }
+
 }  // namespace platform
 }  // namespace paddle
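OpName returns a non-empty name only under kAllOpDetail, tagging the op type with the first output variable whose name is longer than the type name. A quick model of the string it builds (the dict and names below are hypothetical):

    def op_name(outputs, type_name):
        # model of platform::OpName; assumes the kAllOpDetail guard passed
        ret = type_name + '%'
        for _, var_names in outputs.items():
            if var_names and len(type_name) < len(var_names[0]):
                ret += var_names[0]
                break
        return ret + '%'

    # op_name({'Out': ['fc_0.tmp_1']}, 'fc') -> 'fc%fc_0.tmp_1%'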
paddle/fluid/platform/profiler.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/place.h"
@@ -39,6 +40,19 @@ enum class ProfilerState {
   kAll,  // Profile both CPU and GPU. (Currently experimental).
 };

+enum class RecordRole {
+  kOrdinary,  // only record the op's time, keyed by op type
+  kInnerOp,   // record the op's detail time, keyed by op type
+  kUniqueOp,  // record the op's detail time, keyed by unique op name
+};
+
+// the flag that controls which level of profiling result is printed
+enum class TracerOption {
+  kDefault,      // print one profiling result per op type
+  kOpDetail,     // print detailed profiling results per op type
+  kAllOpDetail,  // print detailed profiling results per op name
+};
+
 void Mark(const std::string& name);

 void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
@@ -79,7 +93,8 @@ Event* PushEvent(const std::string& name);
 void PopEvent(const std::string& name);

 struct RecordEvent {
-  explicit RecordEvent(const std::string& name);
+  RecordEvent(const std::string& name,
+              const RecordRole r_type = RecordRole::kOrdinary);

   ~RecordEvent();
@@ -90,6 +105,7 @@ struct RecordEvent {
   // Need to distinguish name by op type, block_id, program_id and perhaps
   // different kernel invocations within an op.
   std::string full_name_;
+  RecordRole role_{RecordRole::kOrdinary};
 };

 class RecordRPCEvent {
@@ -181,6 +197,10 @@ bool ShouldSendProfileState();
 void SetProfileListener();
 int64_t ListenerId();
+std::string OpName(const framework::VariableNameMap& name_map,
+                   const std::string& type_name);
+void SetTracerOption(TracerOption option);
+platform::TracerOption GetTracerOption();

 #ifdef PADDLE_WITH_CUDA
 void DummyKernelAndEvent();
 #endif
paddle/fluid/pybind/pybind.cc
@@ -1555,6 +1555,12 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #endif

+  py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
+      .value("kDefault", platform::TracerOption::kDefault)
+      .value("kOpDetail", platform::TracerOption::kOpDetail)
+      .value("kAllOpDetail", platform::TracerOption::kAllOpDetail)
+      .export_values();
+
   py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
       .value("kDisabled", platform::ProfilerState::kDisabled)
       .value("kCPU", platform::ProfilerState::kCPU)
@@ -1571,6 +1577,7 @@ All parameter, weight, gradient are variables in Paddle.
       .value("kAve", platform::EventSortingKey::kAve)
       .export_values();

+  m.def("set_tracer_option", platform::SetTracerOption);
   m.def("enable_profiler", platform::EnableProfiler);
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("is_profiler_enabled", platform::IsProfileEnabled);
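The fluid.profiler wrapper below is the intended entry point, but the bindings can also be driven directly. A minimal sketch using only the symbols registered above (assuming the pre-existing EventSortingKey.kTotal value):

    import paddle.fluid.core as core

    # what fluid.profiler.start_profiler('CPU', 'OpDetail') does internally
    core.set_tracer_option(core.TracerOption.kOpDetail)
    core.enable_profiler(core.ProfilerState.kCPU)
    # ... run the program to be profiled ...
    core.disable_profiler(core.EventSortingKey.kTotal, '/tmp/profile')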
python/paddle/fluid/profiler.py
@@ -126,7 +126,7 @@ def reset_profiler():
     core.reset_profiler()

-def start_profiler(state):
+def start_profiler(state, tracer_option='Default'):
     """
     Enable the profiler. Users can use `fluid.profiler.start_profiler` and
     `fluid.profiler.stop_profiler` to profile, which is equal to the usage
@@ -137,9 +137,15 @@ def start_profiler(state):
             or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
             both CPU and GPU; 'All' means profiling both CPU and GPU, and
             generates timeline as well.
+        tracer_option (str, optional) : one of ['Default', 'OpDetail', 'AllOpDetail'];
+            controls the profiling level and prints results at that level.
+            'Default' prints one profiling result per op type; 'OpDetail' prints
+            detailed results per op type, covering stages such as data transform
+            and compute; 'AllOpDetail' prints the same detail, but per op name.

     Raises:
         ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
+        ValueError: If `tracer_option` is not in ['Default', 'OpDetail', 'AllOpDetail'].

     Examples:
@@ -154,6 +160,13 @@ def start_profiler(state):
                     profiler.reset_profiler()
                 # except each iteration
             profiler.stop_profiler('total', '/tmp/profile')
+
+            profiler.start_profiler('GPU', "OpDetail")
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                # except each iteration
+            profiler.stop_profiler('total', '/tmp/profile')
     """
     if core.is_profiler_enabled():
         return
@@ -165,6 +178,18 @@ def start_profiler(state):
         prof_state = core.ProfilerState.kCPU
     else:
         prof_state = core.ProfilerState.kAll
+    if tracer_option not in ['Default', 'OpDetail', 'AllOpDetail']:
+        raise ValueError(
+            "tracer option must be 'Default', 'OpDetail', 'AllOpDetail'.")
+    if tracer_option == "Default":
+        prof_tracer_option = core.TracerOption.kDefault
+    elif tracer_option == "OpDetail":
+        prof_tracer_option = core.TracerOption.kOpDetail
+    else:
+        prof_tracer_option = core.TracerOption.kAllOpDetail
+    core.set_tracer_option(prof_tracer_option)
     core.enable_profiler(prof_state)
@@ -184,8 +209,8 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
             The `max` means sorting by the maximum execution time.
             The `min` means sorting by the minimum execution time.
             The `ave` means sorting by the average execution time.
+        profile_path (str, optional) : If state == 'All', it will generate timeline,
             and write it into `profile_path`. The default profile_path is `/tmp/profile`.
-        profile_path (str, optional) : If state == 'All', it will generate timeline,

     Raises:
         ValueError: If `sorted_key` is not in
@@ -225,7 +250,10 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):

 @signature_safe_contextmanager
-def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
+def profiler(state,
+             sorted_key=None,
+             profile_path='/tmp/profile',
+             tracer_option='Default'):
     """
     The profiler interface. Different from `fluid.profiler.cuda_profiler`,
     this profiler can be used to profile both CPU and GPU program.
@@ -246,6 +274,11 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
             The `ave` means sorting by the average execution time.
         profile_path (str, optional) : If state == 'All', it will generate timeline,
             and write it into `profile_path`. The default profile_path is `/tmp/profile`.
+        tracer_option (str, optional) : one of ['Default', 'OpDetail', 'AllOpDetail'];
+            controls the profiling level and prints results at that level.
+            'Default' prints one profiling result per op type; 'OpDetail' prints
+            detailed results per op type, covering stages such as data transform
+            and compute; 'AllOpDetail' prints the same detail, but per op name.

     Raises:
         ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
@@ -268,7 +301,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())

-            with profiler.profiler('CPU', 'total', '/tmp/profile') as prof:
+            with profiler.profiler('CPU', 'total', '/tmp/profile', 'Default') as prof:
                 for i in range(epoc):
                     input = np.random.random(dshape).astype('float32')
                     exe.run(fluid.default_main_program(), feed={'data': input})
@@ -311,6 +344,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
             thread0::conv2d                  8       7.93456    0.291385    5.63342    0.99182    0.795243
             thread0::elementwise_add         8       1.96555    0.191884    0.518004   0.245693   0.196998
     """
-    start_profiler(state)
+    start_profiler(state, tracer_option)
     yield
     stop_profiler(sorted_key, profile_path)
python/paddle/fluid/tests/unittests/test_profiler.py
@@ -22,6 +22,7 @@ import paddle.fluid as fluid
 import paddle.fluid.profiler as profiler
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
+from paddle.fluid import compiler, Program, program_guard
 import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
@@ -30,7 +31,11 @@ class TestProfiler(unittest.TestCase):
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)

-    def net_profiler(self, state, use_parallel_executor=False):
+    def net_profiler(self,
+                     state,
+                     option,
+                     iter_range=None,
+                     use_parallel_executor=False):
         profile_path = os.path.join(tempfile.gettempdir(), "profile")
         open(profile_path, "w").write("")
         startup_program = fluid.Program()
@@ -75,7 +80,7 @@ class TestProfiler(unittest.TestCase):
                                  main_program=main_program)

         pass_acc_calculator = fluid.average.WeightedAverage()
-        with profiler.profiler(state, 'total', profile_path) as prof:
+        with profiler.profiler(state, 'total', profile_path, option) as prof:
             for iter in range(10):
                 if iter == 2:
                     profiler.reset_profiler()
@@ -94,36 +99,37 @@ class TestProfiler(unittest.TestCase):
                 pass_acc_calculator.add(value=acc, weight=b_size)
                 pass_acc = pass_acc_calculator.eval()

         data = open(profile_path, 'rb').read()
-        self.assertGreater(len(data), 0)
-        profile_pb = profiler_pb2.Profile()
-        profile_pb.ParseFromString(data)
-        self.assertGreater(len(profile_pb.events), 0)
-        for event in profile_pb.events:
-            if event.type == profiler_pb2.Event.GPUKernel:
-                if not event.detail_info and not event.name.startswith("MEM"):
-                    raise Exception(
-                        "Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
-                        % event.name)
-            elif event.type == profiler_pb2.Event.CPU and (
-                    event.name.startswith("Driver API") or
-                    event.name.startswith("Runtime API")):
-                print("Warning: unregister", event.name)
+        if (len(data) > 0):
+            profile_pb = profiler_pb2.Profile()
+            profile_pb.ParseFromString(data)
+            self.assertGreater(len(profile_pb.events), 0)
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.GPUKernel:
+                    if not event.detail_info and not event.name.startswith(
+                            "MEM"):
+                        raise Exception(
+                            "Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
+                            % event.name)
+                elif event.type == profiler_pb2.Event.CPU and (
+                        event.name.startswith("Driver API") or
+                        event.name.startswith("Runtime API")):
+                    print("Warning: unregister", event.name)

     def test_cpu_profiler(self):
-        self.net_profiler('CPU')
-        self.net_profiler('CPU', use_parallel_executor=True)
+        self.net_profiler('CPU', "Default")
+        self.net_profiler('CPU', "Default", use_parallel_executor=True)

     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
     def test_cuda_profiler(self):
-        self.net_profiler('GPU')
-        self.net_profiler('GPU', use_parallel_executor=True)
+        self.net_profiler('GPU', "OpDetail")
+        self.net_profiler('GPU', "OpDetail", use_parallel_executor=True)

     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
     def test_all_profiler(self):
-        self.net_profiler('All')
-        self.net_profiler('All', use_parallel_executor=True)
+        self.net_profiler('All', "AllOpDetail")
+        self.net_profiler('All', "AllOpDetail", use_parallel_executor=True)

 if __name__ == '__main__':