Fix operator type record in profiler [cherry-pick PR44582] (#44654)

* fix record event for operator type in new dygraph (#44582)
* fix new dygraph record event for op
* update unit test
* fix file mode

Fix operator type record in profiler [cherry-pick PR44582] (#44654)
* fix record event for operator type in new dygraph (#44582)
* fix new dygraph record event for op
* update unit test
* fix file mode
6de20581 · chenjian · GitHub · b71833ea · 6de20581 · 6de20581
6 changed files
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -71,7 +71,7 @@ PARSE_PYTHON_C_ARGS_TEMPLATE = \
 RECORD_EVENT_TEMPLATE = \
-"    paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);"
+"paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::UserDefined, 1);"
 RETURN_INPLACE_PYOBJECT_TEMPLATE = \
@@ -253,6 +253,7 @@ NAMESPACE_WRAPPER_TEMPLATE = \
 ## Generator Classes ##
 #######################
 class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
    def __init__(self, forward_api_contents, namespace):
        # Members from Parent:
        #self.namespace
@@ -265,7 +266,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
        #self.forward_outputs_position_map
        #self.optional_inputs
        #self.no_need_buffers
-        #self.intermediate_outputs   
+        #self.intermediate_outputs
        #self.inplace_map
        FunctionGeneratorBase.__init__(self, forward_api_contents, namespace)
@@ -327,8 +328,8 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
        set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str)
        # Generate Dygraph Function Call Logic
-        num_args = len(forward_inputs_position_map.keys()) + len(
+        num_args = len(
-            orig_forward_attrs_list)
+            forward_inputs_position_map.keys()) + len(orig_forward_attrs_list)
        dygraph_function_call_list = ["" for i in range(num_args)]
        for name, (_, pos) in forward_inputs_position_map.items():
            dygraph_function_call_list[pos] = f"{name}"
@@ -336,7 +337,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
            dygraph_function_call_list[pos] = f"{name}"
        dygraph_function_call_str = ",".join(dygraph_function_call_list)
-        # Generate Python-C Function Definitions 
+        # Generate Python-C Function Definitions
        if is_forward_only:
            fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
                "paddle::experimental::", namespace, forward_api_name)
@@ -441,8 +442,9 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
 class PythonCYamlGenerator(YamlGeneratorBase):
    def __init__(self, path):
-        # Parent members: 
+        # Parent members:
        # self.namespace
        # self.api_yaml_path
        # self.forward_api_list
@@ -457,8 +459,8 @@ class PythonCYamlGenerator(YamlGeneratorBase):
        forward_api_list = self.forward_api_list
        for forward_api_content in forward_api_list:
-            f_generator = PythonCSingleFunctionGenerator(forward_api_content,
+            f_generator = PythonCSingleFunctionGenerator(
-                                                         namespace)
+                forward_api_content, namespace)
            status = f_generator.run()
            if status == True:

--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -30,10 +30,10 @@
 namespace egr {
 /*
-* GeneralGrad is Helpper class to implement custom grad operation between
+ * GeneralGrad is Helpper class to implement custom grad operation between
-* outputs and inputs.
+ * outputs and inputs.
-*
+ *
-* **/
+ * **/
 class GeneralGrad {
 public:
  static GeneralGrad& Instance() { return *general_grad_; }
@@ -64,7 +64,8 @@ class GeneralGrad {
                                paddle::platform::errors::Fatal(
                                    "There is no grad op for %s:[%d] or it's"
                                    "stop_gradient=True.",
-                                    msg, i));
+                                    msg,
+                                    i));
        if (is_no_grad_vars) {
          (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta;
        } else {  // normal input
@@ -248,7 +249,8 @@ class GeneralGrad {
  std::vector<paddle::experimental::Tensor> GetResults(
      const std::vector<paddle::experimental::Tensor>& inputs,
-      bool allow_unused, bool create_graph) {
+      bool allow_unused,
+      bool create_graph) {
    VLOG(6) << "Running in GetResults";
    if (inputs.empty()) return {};
@@ -276,7 +278,8 @@ class GeneralGrad {
        tensor_auto_grad_meta->SetStopGradient(!create_graph);
        results.emplace_back(iter->second);
      } else {
-        PADDLE_ENFORCE_EQ(allow_unused, true,
+        PADDLE_ENFORCE_EQ(allow_unused,
+                          true,
                          paddle::platform::errors::InvalidArgument(
                              "The %d-th input does not appear in the backward "
                              "graph. Please check the input tensor or set "
@@ -493,7 +496,8 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
 void EnforceGradNodeHasInput(GradNodeBase* node) {
  VLOG(6) << "Running in EnforceGradNodeHasInput";
  PADDLE_ENFORCE_NE(
-      node->IsTensorWrappersCleared(), true,
+      node->IsTensorWrappersCleared(),
+      true,
      paddle::platform::errors::Fatal(
          "The TensorWrappers of %s do not exist. This may be because:\n"
          "You calculate backward twice for the same subgraph without "
@@ -509,10 +513,13 @@ void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
  for (auto in : inputs) {
    AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
    PADDLE_ENFORCE_EQ(
-        visisted_ins.count(auto_grad_meta), 0,
+        visisted_ins.count(auto_grad_meta),
+        0,
        paddle::platform::errors::AlreadyExists(
-            "%s contain duplicate tensor %s, please check %s carefully.", msg,
+            "%s contain duplicate tensor %s, please check %s carefully.",
-            in.name(), msg));
+            msg,
+            in.name(),
+            msg));
    visisted_ins.insert(auto_grad_meta);
  }
 }
@@ -522,7 +529,8 @@ GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();
 std::vector<paddle::experimental::Tensor> RunBackward(
    const std::vector<paddle::experimental::Tensor>& tensors,  // output
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
-    bool retain_graph, bool create_graph = false,
+    bool retain_graph,
+    bool create_graph = false,
    const std::vector<paddle::experimental::Tensor>& inputs = {},
    bool allow_unused = false,
    const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
@@ -631,8 +639,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
  if (is_general_grad) {
    // Prepare several vital preprocess for GeneralGrad
-    GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue,
+    GeneralGrad::Instance().PreparedForGeneralGrad(
-                                                   node_input_buffers_dict);
+        inputs, no_grad_vars, &queue, node_input_buffers_dict);
  }
  VLOG(6) << " startup_ops' size is :" << queue.size();
@@ -651,7 +659,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
    paddle::platform::RecordEvent node_record_event(
        std::string((*node).name()) + " grad_node",
-        paddle::platform::TracerEventType::Operator, 1);
+        paddle::platform::TracerEventType::Operator,
+        1);
    if (queue.size() > 1 && node_in_degree_map[node] != 0) {
      queue.pop();
@@ -716,7 +725,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
                       "Number of edges should be either empty ( for leaf node "
                       ") or the same as number of output grad tensors, but we "
                       "got edges size is: %d, grad_output size is: %d",
-                       edges.size(), grad_output_tensors.size()));
+                       edges.size(),
+                       grad_output_tensors.size()));
    for (size_t i = 0; i < edges.size(); i++) {
      for (size_t j = 0; j < edges[i].size(); j++) {
@@ -739,7 +749,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
        }
        PADDLE_ENFORCE_LT(
-            j, grad_output_tensors[i].size(),
+            j,
+            grad_output_tensors[i].size(),
            paddle::platform::errors::Fatal(
                "Rank of grad_output_tensors should be less than "
                "grad_output_tensors[i].size(), which is: %d. This error may "
@@ -771,9 +782,10 @@ std::vector<paddle::experimental::Tensor> RunBackward(
        VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
                << ", rank: " << edge_rank.second;
-        node_input_buffers_dict[next_node]->add(
+        node_input_buffers_dict[next_node]->add(edge_rank.first,
-            edge_rank.first, edge_rank.second, grad_output_tensor,
+                                                edge_rank.second,
-            create_graph);
+                                                grad_output_tensor,
+                                                create_graph);
        // Update queue
        node_in_degree_map[next_node]--;
@@ -810,7 +822,7 @@ void Backward(
    bool retain_graph) {
  VLOG(6) << "Run in Backward";
  paddle::platform::RecordEvent backward_record_event(
-      "backward", paddle::platform::TracerEventType::Operator, 1);
+      "backward", paddle::platform::TracerEventType::UserDefined, 1);
  RunBackward(tensors, grad_tensors, retain_graph);
  phi::autotune::AutoTuneStatus::Instance().Update();
 }
@@ -819,14 +831,22 @@ std::vector<paddle::experimental::Tensor> Grad(
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
    const std::vector<paddle::experimental::Tensor>& inputs,
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
-    bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused,
+    bool retain_graph,
+    bool create_graph,
+    bool only_inputs,
+    bool allow_unused,
    const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
  VLOG(6) << "Run in Grad";
  DuplicateCheck(inputs, true /* is_input */);
  DuplicateCheck(tensors, false /* is_input */);
-  return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs,
+  return RunBackward(tensors,
-                     allow_unused, no_grad_vars);
+                     grad_tensors,
+                     retain_graph,
+                     create_graph,
+                     inputs,
+                     allow_unused,
+                     no_grad_vars);
 }
 }  // namespace egr
--- a/paddle/fluid/platform/profiler/chrometracing_logger.cc
+++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc
@@ -588,7 +588,7 @@ void ChromeTracingLogger::StartLog() {
        std::string(
            R"JSON(
    {
-       "id": %d, "name": "%s", "totalGlobalMem": %u,
+       "id": %d, "name": "%s", "totalGlobalMem": %llu,
      "computeMajor": %d, "computeMinor": %d,
      "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
      "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
@@ -618,7 +618,7 @@ void ChromeTracingLogger::StartLog() {
        std::string(
            R"JSON(
    {
-       "id": %d, "name": "%s", "totalGlobalMem": %u,
+       "id": %d, "name": "%s", "totalGlobalMem": %llu,
      "computeMajor": %d, "computeMinor": %d,
      "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
      "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,

--- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -19,6 +19,7 @@ import paddle.profiler as profiler
 class HostPythonNode:
    def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
        self.name = name
        self.type = type
@@ -32,6 +33,7 @@ class HostPythonNode:
 class DevicePythonNode:
    def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
                 stream_id):
        self.name = name
@@ -44,6 +46,7 @@ class DevicePythonNode:
 class TestProfilerStatistic(unittest.TestCase):
    def test_statistic_case1(self):
        root_node = HostPythonNode('Root Node',
                                   profiler.TracerEventType.UserDefined, 0,
@@ -54,14 +57,16 @@ class TestProfilerStatistic(unittest.TestCase):
        dataloader_node = HostPythonNode('Dataloader',
                                         profiler.TracerEventType.Dataloader, 5,
                                         15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
+        mobilenet_node = HostPythonNode('MobileNet',
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+                                        profiler.TracerEventType.Forward, 20,
-        yolonet_node = HostPythonNode(
+                                        50, 1000, 1001)
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+        yolonet_node = HostPythonNode('Yolov3Net',
+                                      profiler.TracerEventType.Forward, 50, 110,
+                                      1000, 1001)
-        userdefined_node = HostPythonNode('Communication Time',
+        userdefined_node = HostPythonNode(
-                                          profiler.TracerEventType.UserDefined,
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
-                                          100, 110, 1000, 1001)
+            100, 110, 1000, 1001)
        communication_node = HostPythonNode(
            'Communication', profiler.TracerEventType.Communication, 105, 110,
@@ -72,8 +77,9 @@ class TestProfilerStatistic(unittest.TestCase):
        optimization_node = HostPythonNode(
            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
            1000, 1001)
-        conv2d_node = HostPythonNode(
+        conv2d_node = HostPythonNode('conv2d',
-            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+                                     profiler.TracerEventType.Operator, 25, 40,
+                                     1000, 1001)
        sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                              profiler.TracerEventType.Operator,
                                              60, 100, 1000, 1001)
@@ -92,10 +98,12 @@ class TestProfilerStatistic(unittest.TestCase):
        conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                           profiler.TracerEventType.CudaRuntime,
                                           35, 40, 1000, 1001)
-        conv2d_kernel = DevicePythonNode(
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
+                                         profiler.TracerEventType.Kernel, 35,
-        conv2d_memcpy = DevicePythonNode(
+                                         50, 0, 0, 0)
-            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
+                                         profiler.TracerEventType.Memcpy, 50,
+                                         60, 0, 0, 0)
        sync_batch_norm_infer_shape = HostPythonNode(
            'sync_batch_norm::infer_shape',
            profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
@@ -146,8 +154,8 @@ class TestProfilerStatistic(unittest.TestCase):
            'Process Cpu Utilization': '1.02',
            'System Cpu Utilization': '0.68'
        }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
+        statistic_data = profiler.profiler_statistic.StatisticData(
-                                                                   extra_info)
+            thread_tree, extra_info)
        time_range_summary = statistic_data.time_range_summary
        event_summary = statistic_data.event_summary
@@ -180,7 +188,7 @@ class TestProfilerStatistic(unittest.TestCase):
                0, profiler.TracerEventType.Memcpy), 60)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.UserDefined), 25)
+                profiler.TracerEventType.UserDefined), 15)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Communication), 5)
@@ -200,8 +208,9 @@ class TestProfilerStatistic(unittest.TestCase):
            0)
        self.assertEqual(
            event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
-        self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
+        self.assertEqual(
-                         .general_gpu_time, 60)
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
        print(
            profiler.profiler_statistic._build_table(
                statistic_data,
@@ -222,14 +231,16 @@ class TestProfilerStatistic(unittest.TestCase):
                                         profiler.TracerEventType.Dataloader, 5,
                                         15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
+        mobilenet_node = HostPythonNode('MobileNet',
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+                                        profiler.TracerEventType.Forward, 20,
-        yolonet_node = HostPythonNode(
+                                        50, 1000, 1001)
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+        yolonet_node = HostPythonNode('Yolov3Net',
+                                      profiler.TracerEventType.Forward, 50, 110,
+                                      1000, 1001)
-        userdefined_node = HostPythonNode('Communication Time',
+        userdefined_node = HostPythonNode(
-                                          profiler.TracerEventType.UserDefined,
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
-                                          100, 110, 1000, 1001)
+            100, 110, 1000, 1001)
        allreduce_launchkernel0 = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
            1000, 1001)
@@ -263,8 +274,9 @@ class TestProfilerStatistic(unittest.TestCase):
        optimization_node = HostPythonNode(
            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
            1000, 1001)
-        conv2d_node = HostPythonNode(
+        conv2d_node = HostPythonNode('conv2d',
-            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+                                     profiler.TracerEventType.Operator, 25, 40,
+                                     1000, 1001)
        sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                              profiler.TracerEventType.Operator,
                                              60, 100, 1000, 1001)
@@ -283,10 +295,12 @@ class TestProfilerStatistic(unittest.TestCase):
        conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                           profiler.TracerEventType.CudaRuntime,
                                           35, 40, 1000, 1001)
-        conv2d_kernel = DevicePythonNode(
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
+                                         profiler.TracerEventType.Kernel, 35,
-        conv2d_memcpy = DevicePythonNode(
+                                         50, 0, 0, 0)
-            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
+                                         profiler.TracerEventType.Memcpy, 50,
+                                         60, 0, 0, 0)
        sync_batch_norm_infer_shape = HostPythonNode(
            'sync_batch_norm::infer_shape',
            profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
@@ -363,8 +377,8 @@ class TestProfilerStatistic(unittest.TestCase):
            'Process Cpu Utilization': '1.02',
            'System Cpu Utilization': '0.68'
        }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
+        statistic_data = profiler.profiler_statistic.StatisticData(
-                                                                   extra_info)
+            thread_tree, extra_info)
        time_range_summary = statistic_data.time_range_summary
        event_summary = statistic_data.event_summary
        distributed_summary = statistic_data.distributed_summary
@@ -398,7 +412,7 @@ class TestProfilerStatistic(unittest.TestCase):
                0, profiler.TracerEventType.Memcpy), 60)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.UserDefined), 25)
+                profiler.TracerEventType.UserDefined), 15)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Communication), 5)
@@ -433,8 +447,9 @@ class TestProfilerStatistic(unittest.TestCase):
            0)
        self.assertEqual(
            event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
-        self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
+        self.assertEqual(
-                         .general_gpu_time, 60)
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
        print(
            profiler.profiler_statistic._build_table(
                statistic_data,
@@ -454,8 +469,9 @@ class TestProfilerStatistic(unittest.TestCase):
        dataloader_node = HostPythonNode('Dataloader',
                                         profiler.TracerEventType.Dataloader, 5,
                                         15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
+        mobilenet_node = HostPythonNode('MobileNet',
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
        backward_node = HostPythonNode('Gradient Backward',
                                       profiler.TracerEventType.Backward, 120,
@@ -463,12 +479,13 @@ class TestProfilerStatistic(unittest.TestCase):
        optimization_node = HostPythonNode(
            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
            1000, 1001)
-        userdefined_node = HostPythonNode('Communication Time',
+        userdefined_node = HostPythonNode(
-                                          profiler.TracerEventType.UserDefined,
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
-                                          60, 70, 1000, 1001)
+            60, 70, 1000, 1001)
-        conv2d_node = HostPythonNode(
+        conv2d_node = HostPythonNode('conv2d',
-            'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001)
+                                     profiler.TracerEventType.Operator, 25, 25,
+                                     1000, 1001)
        conv2d_infer_shape = HostPythonNode(
            'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
@@ -480,8 +497,9 @@ class TestProfilerStatistic(unittest.TestCase):
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25,
            1000, 1001)
-        conv2d_kernel = DevicePythonNode(
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
+                                         profiler.TracerEventType.Kernel, 35,
+                                         35, 0, 0, 0)
        another_kernel = DevicePythonNode(
            'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
            profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
@@ -500,15 +518,16 @@ class TestProfilerStatistic(unittest.TestCase):
            'Process Cpu Utilization': '1.02',
            'System Cpu Utilization': '0.68'
        }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
+        statistic_data = profiler.profiler_statistic.StatisticData(
-                                                                   extra_info)
+            thread_tree, extra_info)
        time_range_summary = statistic_data.time_range_summary
        event_summary = statistic_data.event_summary
        self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
-        self.assertEqual(event_summary.userdefined_items['Communication Time']
+        self.assertEqual(
-                         .general_gpu_time, 0)
+            event_summary.userdefined_items['Communication Time'].
+            general_gpu_time, 0)
        for sort_key in [
                profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
                profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
@@ -516,12 +535,11 @@ class TestProfilerStatistic(unittest.TestCase):
                profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
        ]:
            print(
-                profiler.profiler_statistic._build_table(
+                profiler.profiler_statistic._build_table(statistic_data,
-                    statistic_data,
+                                                         sorted_by=sort_key,
-                    sorted_by=sort_key,
+                                                         op_detail=True,
-                    op_detail=True,
+                                                         thread_sep=False,
-                    thread_sep=False,
+                                                         time_unit='ms'))
-                    time_unit='ms'))
 if __name__ == '__main__':

--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py