From 6de2058165a81f0c39b441206a1294d058706a26 Mon Sep 17 00:00:00 2001
From: chenjian <chenjian26@baidu.com>
Date: Tue, 2 Aug 2022 09:58:53 +0800
Subject: [PATCH] Fix operator type record in profiler [cherry-pick PR44582]
 (#44654)

* fix record event for operator type in new dygraph (#44582)

* fix the record event emitted for ops in new dygraph

* update unit test

* fix file mode
---
 .../auto_code_generator/eager_generator.cc    | 348 +++++++++------
 .../final_state_generator/python_c_gen.py     |  24 +-
 paddle/fluid/eager/backward.cc                |  68 +--
 .../platform/profiler/chrometracing_logger.cc |   4 +-
 .../unittests/test_profiler_statistic.py      | 128 +++---
 python/paddle/profiler/profiler_statistic.py  | 396 ++++++++----------
 6 files changed, 536 insertions(+), 432 deletions(-)

diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index 39559a2d901..30ace9c1d3a 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -476,7 +476,8 @@ static void SlotNameMatching(
               PADDLE_THROW(platform::errors::Fatal(
                   "Detected mismatched slot names."
                   "grad_slot_name %s matches both %s and %s fwd_slot_name",
-                  grad_slot_name, grad_fwd_slotname_map[grad_slot_name],
+                  grad_slot_name,
+                  grad_fwd_slotname_map[grad_slot_name],
                   fwd_slot_name));
             }
             grad_fwd_slotname_map[grad_slot_name] = fwd_slot_name;
@@ -489,7 +490,8 @@ static void SlotNameMatching(
               PADDLE_THROW(platform::errors::Fatal(
                   "Detected mismatched slot names."
                   "grad_slot_name %s matches both %s and %s fwd_slot_name",
-                  grad_slot_name, grad_grad_slotname_map[grad_slot_name],
+                  grad_slot_name,
+                  grad_grad_slotname_map[grad_slot_name],
                   fwd_slot_name));
             }
             grad_grad_slotname_map[grad_slot_name] = fwd_slot_name;
@@ -509,7 +511,8 @@ static void SlotNameMatching(
               PADDLE_THROW(platform::errors::Fatal(
                   "Detected mismatched slot names"
                   "grad_slot_name %s matches both %s and %s fwd_slot_name",
-                  grad_slot_name, grad_fwd_slotname_map[grad_slot_name],
+                  grad_slot_name,
+                  grad_fwd_slotname_map[grad_slot_name],
                   fwd_slot_name));
             }
             grad_fwd_slotname_map[grad_slot_name] = fwd_slot_name;
@@ -522,7 +525,8 @@ static void SlotNameMatching(
               PADDLE_THROW(platform::errors::Fatal(
                   "Detected mismatched slot names."
                   "grad_slot_name %s matches both %s and %s fwd_slot_name",
-                  grad_slot_name, grad_grad_slotname_map[grad_slot_name],
+                  grad_slot_name,
+                  grad_grad_slotname_map[grad_slot_name],
                   fwd_slot_name));
             }
             grad_grad_slotname_map[grad_slot_name] = fwd_slot_name;
@@ -900,8 +904,8 @@ static bool CollectGradInformationFromOpInfo(
   }
 
   std::shared_ptr<paddle::imperative::GradOpNode> grad_node =
-      op_info.dygraph_grad_op_maker_(op_type, ins, outs, attrs, default_attrs,
-                                     {});
+      op_info.dygraph_grad_op_maker_(
+          op_type, ins, outs, attrs, default_attrs, {});
 
   if (!grad_node) {
     VLOG(6) << "Got nullptr GradOpNode for " << op_type
@@ -977,12 +981,16 @@ static bool CollectGradInformationFromOpInfo(
   /* ------ Slot Name Matching ---- */
   for (auto& iter : *op_base_infos) {
     // grad_ins -> fwd_ins, fwd_outs
-    SlotNameMatching(iter.GetGradIns(), fwd_ins, fwd_outs,
+    SlotNameMatching(iter.GetGradIns(),
+                     fwd_ins,
+                     fwd_outs,
                      iter.GetMutableGradInsFwdSlotnameMap(),
                      iter.GetMutableGradInsGradSlotnameMap());
 
     // grad_outs -> fwd_ins, fwd_outs
-    SlotNameMatching(iter.GetGradOuts(), fwd_ins, fwd_outs,
+    SlotNameMatching(iter.GetGradOuts(),
+                     fwd_ins,
+                     fwd_outs,
                      iter.GetMutableGradOutsSlotnameMap(),
                      iter.GetMutableGradOutsSlotnameMap());
   }
@@ -1042,16 +1050,18 @@ static std::string GenerateGradNodeCreationContent(
             "p_autograd_" + inplace_input_name;
         const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
             "    %s = egr::EagerUtils::autograd_meta(&%s);\n";
-        get_output_autograd_meta_str += paddle::string::Sprintf(
-            GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name,
-            inplace_input_name);
+        get_output_autograd_meta_str +=
+            paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE,
+                                    inplace_input_autograd_name,
+                                    inplace_input_name);
       } else {
         const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
             "    egr::AutogradMeta* %s = "
             "egr::EagerUtils::autograd_meta(&%s);\n";
         get_output_autograd_meta_str +=
             paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE,
-                                    output_autograd_name, output_name);
+                                    output_autograd_name,
+                                    output_name);
       }
     }
   }
@@ -1097,8 +1107,8 @@ static std::string GenerateGradNodeCreationContent(
         "require_any_grad);\n";
     for (auto& inplace_pair : inplace_map) {
       std::string inplace_name = inplace_pair.second;
-      check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE,
-                                                   inplace_name, inplace_name);
+      check_inplace_str += paddle::string::Sprintf(
+          CHECKING_INPLACE_TEMPLATE, inplace_name, inplace_name);
     }
     VLOG(6) << "Check Inplace Input";
   }
@@ -1124,9 +1134,11 @@ static std::string GenerateGradNodeCreationContent(
       "      auto grad_node = std::shared_ptr<GradNode%s>(new GradNode%s(%d, "
       "%d));\n";
   grad_node_creation_str += "    // Create GradOpNode\n";
-  grad_node_creation_str +=
-      paddle::string::Sprintf(GRAD_OP_NODE_TEMPLATE, op_type, op_type,
-                              bwd_in_slot_num, bwd_out_slot_num);
+  grad_node_creation_str += paddle::string::Sprintf(GRAD_OP_NODE_TEMPLATE,
+                                                    op_type,
+                                                    op_type,
+                                                    bwd_in_slot_num,
+                                                    bwd_out_slot_num);
   grad_node_creation_str += "\n";
 
   VLOG(6) << "Generated GradOpNode construction";
@@ -1158,13 +1170,17 @@ static std::string GenerateGradNodeCreationContent(
       // Replace output directly with input in inplace op.
       if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) {
         auto inplace_input_name = inplace_map[tensor_wrapper_name];
-        grad_node_creation_str += paddle::string::Sprintf(
-            SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
-            inplace_input_name, full_reserved);
+        grad_node_creation_str +=
+            paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE,
+                                    tensor_wrapper_name,
+                                    inplace_input_name,
+                                    full_reserved);
       } else {
-        grad_node_creation_str += paddle::string::Sprintf(
-            SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
-            tensor_wrapper_name, full_reserved);
+        grad_node_creation_str +=
+            paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE,
+                                    tensor_wrapper_name,
+                                    tensor_wrapper_name,
+                                    full_reserved);
       }
     }
   }
@@ -1189,9 +1205,10 @@ static std::string GenerateGradNodeCreationContent(
 
       const char* ADD_EDGES_TEMPLATE =
           "      if(%s) grad_node->AddEdges(%s, %d);\n";
-      grad_node_creation_str +=
-          paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name,
-                                  input_autograd_name, input_position);
+      grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE,
+                                                        input_autograd_name,
+                                                        input_autograd_name,
+                                                        input_position);
     } else {
       compute_require_grad_args += ", &" + input_autograd_name;
       size_t input_position = fwd_inputs_name_pos_map.at(input_name);
@@ -1319,7 +1336,7 @@ static std::string GenerateGradNodeCreationContent(
       "%s"
       "  {\n"
       "    paddle::platform::RecordEvent node_creation_record_event(\"%s\", "
-      "paddle::platform::TracerEventType::Operator, 1);\n"
+      "paddle::platform::TracerEventType::OperatorInner, 1);\n"
       "%s"
       "    if(require_any_grad) {\n"
       "      VLOG(6) << \" Construct Grad for %s \"; \n"
@@ -1327,11 +1344,17 @@ static std::string GenerateGradNodeCreationContent(
       "  %s\n"
       "    }\n"
       "  }";
-  std::string grad_node_creation_body_str = paddle::string::Sprintf(
-      GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
-      compute_require_grad_args, check_inplace_str, trace_op_body_str,
-      event_name, get_output_autograd_meta_str, op_type,
-      pass_stop_gradient_args, grad_node_creation_str);
+  std::string grad_node_creation_body_str =
+      paddle::string::Sprintf(GRAD_NODE_CREATION_TEMPLATE,
+                              prepare_autograd_meta_str,
+                              compute_require_grad_args,
+                              check_inplace_str,
+                              trace_op_body_str,
+                              event_name,
+                              get_output_autograd_meta_str,
+                              op_type,
+                              pass_stop_gradient_args,
+                              grad_node_creation_str);
 
   return grad_node_creation_body_str;
 }
@@ -1454,8 +1477,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
 
     const char* FWD_INS_CONTENT_TEMPLATE =
         "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },";
-    ins_contents_str += paddle::string::Sprintf(FWD_INS_CONTENT_TEMPLATE,
-                                                input_name, input_name);
+    ins_contents_str += paddle::string::Sprintf(
+        FWD_INS_CONTENT_TEMPLATE, input_name, input_name);
     if (input.duplicable()) {
       const char* AMP_TENSORS_VECTOR_TEMPLATE = "%s,";
       amp_tensors_vector_str +=
@@ -1518,9 +1541,14 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
         const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE =
             "    auto NEW_%s = ((%s.size() > 0) ? egr::AmpAutoCasts(\"%s\", "
             "%s, amp_dst_dtype, \"%s\") : %s);\n";
-        dispensable_amp_auto_cast_str += paddle::string::Sprintf(
-            DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name,
-            input_name, input_name, op_type, input_name);
+        dispensable_amp_auto_cast_str +=
+            paddle::string::Sprintf(DISPENSABLE_AMP_AUTO_CAST_TEMPLATE,
+                                    input_name,
+                                    input_name,
+                                    input_name,
+                                    input_name,
+                                    op_type,
+                                    input_name);
       } else {
         const char* FWD_INS_CONTENT_TEMPLATE =
             "  if(%s.initialized()) "
@@ -1535,9 +1563,14 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
         const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE =
             "    auto NEW_%s = ((%s.initialized()) ? egr::AmpAutoCast(\"%s\", "
             "%s, amp_dst_dtype, \"%s\") : %s);\n";
-        dispensable_amp_auto_cast_str += paddle::string::Sprintf(
-            DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name,
-            input_name, input_name, op_type, input_name);
+        dispensable_amp_auto_cast_str +=
+            paddle::string::Sprintf(DISPENSABLE_AMP_AUTO_CAST_TEMPLATE,
+                                    input_name,
+                                    input_name,
+                                    input_name,
+                                    input_name,
+                                    op_type,
+                                    input_name);
       }
     }
   }
@@ -1594,9 +1627,11 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
     } else if (!inplace_map.empty() && inplace_map.count(output_name)) {
       // In inplace op, replace the output with the input directly.
       PADDLE_ENFORCE_NE(
-          inplace_map[output_name], "",
+          inplace_map[output_name],
+          "",
           paddle::platform::errors::InvalidArgument(
-              "Inplace op %s has no input corresponding to output %s.", op_type,
+              "Inplace op %s has no input corresponding to output %s.",
+              op_type,
               output_name));
       const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },";
       auto inplace_input_name = inplace_map[output_name];
@@ -1618,8 +1653,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
         amp_function_call_args_str += (", " + outnum);
         const char* FWD_OUTS_CONTENT_TEMPLATE =
             "{ \"%s\", egr::EagerUtils::CreateVars(%s) },";
-        outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE,
-                                                     output_name, outnum);
+        outs_contents_str += paddle::string::Sprintf(
+            FWD_OUTS_CONTENT_TEMPLATE, output_name, outnum);
         core_ops_args_info[op_type].push_back(outnum);
         core_ops_args_type_info[op_type].push_back("int");
       } else {
@@ -1738,9 +1773,12 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
     std::string view_strategy_str = "";
     std::string viwe_input_name = view_op_map[op_type].first;
     std::string viwe_output_name = view_op_map[op_type].second;
-    view_strategy_str += paddle::string::Sprintf(
-        HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name,
-        viwe_input_name, viwe_output_name);
+    view_strategy_str +=
+        paddle::string::Sprintf(HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT,
+                                viwe_input_name,
+                                viwe_output_name,
+                                viwe_input_name,
+                                viwe_output_name);
 
     generated_function_body += view_strategy_str;
     generated_function_body += "\n";
@@ -1794,26 +1832,33 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
               "  if (outs.count(\"%s\"))  "
               "egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n"
               "  egr::EagerUtils::Output2Result(%s, &%s);\n";
-          out_tensor_str = paddle::string::Sprintf(
-              FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name,
-              output_name, output_var_args_name, output_var_args_name,
-              output_varname);
+          out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE,
+                                                   output_varname,
+                                                   output_name,
+                                                   output_name,
+                                                   output_var_args_name,
+                                                   output_var_args_name,
+                                                   output_varname);
         } else {
           const char* FWD_OUT_TENSORS_TEMPLATE =
               "  std::vector<paddle::experimental::Tensor> %s;\n"
               "  egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n"
               "  egr::EagerUtils::Output2Result(%s, &%s);\n";
-          out_tensor_str = paddle::string::Sprintf(
-              FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name,
-              output_var_args_name, output_var_args_name, output_varname);
+          out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE,
+                                                   output_varname,
+                                                   output_name,
+                                                   output_var_args_name,
+                                                   output_var_args_name,
+                                                   output_varname);
         }
       } else {
         const char* FWD_OUT_TENSORS_TEMPLATE =
             "  std::vector<paddle::experimental::Tensor> %s;\n"
             "  egr::EagerUtils::GetOutputs(outs[\"%s\"], &%s);\n";
-        out_tensor_str =
-            paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, output_varname,
-                                    output_name, output_varname);
+        out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE,
+                                                 output_varname,
+                                                 output_name,
+                                                 output_varname);
       }
       return_types[return_position] =
           "std::vector<paddle::experimental::Tensor>";
@@ -1824,16 +1869,21 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
               "  if (outs.count(\"%s\"))  "
               "egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n"
               "  paddle::experimental::Tensor& %s = *%s;\n";
-          out_tensor_str = paddle::string::Sprintf(
-              FWD_OUT_TENSOR_TEMPLATE, output_name, output_name,
-              output_var_args_name, output_varname, output_var_args_name);
+          out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
+                                                   output_name,
+                                                   output_name,
+                                                   output_var_args_name,
+                                                   output_varname,
+                                                   output_var_args_name);
         } else {
           const char* FWD_OUT_TENSOR_TEMPLATE =
               "  egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n"
               "  paddle::experimental::Tensor& %s = *%s;\n";
-          out_tensor_str = paddle::string::Sprintf(
-              FWD_OUT_TENSOR_TEMPLATE, output_name, output_var_args_name,
-              output_varname, output_var_args_name);
+          out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
+                                                   output_name,
+                                                   output_var_args_name,
+                                                   output_varname,
+                                                   output_var_args_name);
         }
       } else {
         if (!inplace_map.empty() && inplace_map.count(output_name)) {
@@ -1845,16 +1895,19 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
               "  %s.bump_inplace_version();\n"
               "  VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace "
               "Strategy.\";\n";
-          out_tensor_str = paddle::string::Sprintf(
-              FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name,
-              inplace_input_name, inplace_input_name);
+          out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
+                                                   output_name,
+                                                   inplace_input_name,
+                                                   inplace_input_name,
+                                                   inplace_input_name);
         } else {
           const char* FWD_OUT_TENSOR_TEMPLATE =
               "  paddle::experimental::Tensor %s;\n"
               "  egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n";
-          out_tensor_str =
-              paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname,
-                                      output_name, output_varname);
+          out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
+                                                   output_varname,
+                                                   output_name,
+                                                   output_varname);
         }
       }
       return_types[return_position] = "paddle::experimental::Tensor";
@@ -1964,21 +2017,28 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
       "%s\n"
       "%s\n"
       "}\n\n";
-  std::string fwd_function_str = paddle::string::Sprintf(
-      FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name,
-      dygraph_function_args_str, fwd_record_event_str, generated_function_body);
+  std::string fwd_function_str =
+      paddle::string::Sprintf(FWD_FUNCTION_TEMPLATE,
+                              function_proto_return_type_str,
+                              function_name,
+                              dygraph_function_args_str,
+                              fwd_record_event_str,
+                              generated_function_body);
 
   // [Generation] Generate forward functions header
   const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n";
-  std::string dygraph_function_declaration_str = paddle::string::Sprintf(
-      FWD_HEADER_TEMPLATE, function_proto_return_type_str, function_name,
-      dygraph_function_args_str);
+  std::string dygraph_function_declaration_str =
+      paddle::string::Sprintf(FWD_HEADER_TEMPLATE,
+                              function_proto_return_type_str,
+                              function_name,
+                              dygraph_function_args_str);
 
   return {fwd_function_str, dygraph_function_declaration_str};
 }
 
 static std::string GenerateSingleOpBase(
-    const std::string& fwd_op_type, const std::string& op_base_type,
+    const std::string& fwd_op_type,
+    const std::string& op_base_type,
     const std::unordered_map<std::string, size_t>& fwd_inputs_name_pos_map,
     const std::unordered_map<std::string, size_t>& fwd_outputs_name_pos_map,
     const std::vector<proto::OpProto::Var>& in_vars,
@@ -1994,7 +2054,8 @@ static std::string GenerateSingleOpBase(
         std::vector<std::shared_ptr<paddle::imperative::VariableWrapper>>>&
         grad_outs,
     const paddle::framework::AttributeMap& grad_attrs,
-    bool is_op_base_per_duplicable_input, size_t* outs_size) {
+    bool is_op_base_per_duplicable_input,
+    size_t* outs_size) {
   std::string generated_grad_function_body = "";
 
   const std::string& ins_name = "ins" + std::to_string(*outs_size);
@@ -2029,9 +2090,9 @@ static std::string GenerateSingleOpBase(
           "RecoverTensorWrapper("
           "&"
           "this->%s)) },";
-      ins_contents_str +=
-          paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE,
-                                  grad_input_name, struct_fwd_input_name);
+      ins_contents_str += paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE,
+                                                  grad_input_name,
+                                                  struct_fwd_input_name);
 
     } else if (grad_ins_grad_slotname_map.count(grad_input_name)) {
       // Fwd Tensor's Grad
@@ -2075,18 +2136,25 @@ static std::string GenerateSingleOpBase(
               "  if(this->%s.size() > 0) %s[\"%s\"] = "
               "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::"
               "RecoverTensorWrapper(&this->%s));\n";
-          generated_grad_function_body += paddle::string::Sprintf(
-              DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name,
-              ins_name, grad_input_name, struct_fwd_input_name);
+          generated_grad_function_body +=
+              paddle::string::Sprintf(DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE,
+                                      struct_fwd_input_name,
+                                      ins_name,
+                                      grad_input_name,
+                                      struct_fwd_input_name);
         } else {
           const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE =
               "  auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);\n"
               "  if(%s.defined()) %s[\"%s\"] = "
               "     egr::EagerUtils::TrySyncToVars(%s);\n";
-          generated_grad_function_body += paddle::string::Sprintf(
-              DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name,
-              struct_fwd_input_name, grad_input_name, ins_name, grad_input_name,
-              grad_input_name);
+          generated_grad_function_body +=
+              paddle::string::Sprintf(DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE,
+                                      grad_input_name,
+                                      struct_fwd_input_name,
+                                      grad_input_name,
+                                      ins_name,
+                                      grad_input_name,
+                                      grad_input_name);
         }
       }
     }
@@ -2203,15 +2271,20 @@ static std::string GenerateSingleOpBase(
                 "  if(%s.size() > 0) %s[\"%s\"] = egr::EagerUtils::CreateVars( "
                 "this->OutputMeta()[%d].size() );\n";
             generated_grad_function_body += paddle::string::Sprintf(
-                DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name,
-                grad_output_name, fwd_input_position);
+                DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE,
+                fwd_name,
+                outs_name,
+                grad_output_name,
+                fwd_input_position);
           } else {
             const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE =
                 "  if(%s.defined()) %s[\"%s\"] = "
                 "{std::make_shared<egr::EagerVariable>(egr::Controller::"
                 "Instance().GenerateUniqueName())};\n";
             generated_grad_function_body += paddle::string::Sprintf(
-                DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name,
+                DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE,
+                fwd_name,
+                outs_name,
                 grad_output_name);
           }
         }
@@ -2236,8 +2309,8 @@ static std::string GenerateSingleOpBase(
         "  auto temp_type = %s[\"in_dtype\"];\n"
         "  %s[\"in_dtype\"] = %s[\"out_dtype\"];\n"
         "  %s[\"out_dtype\"] = temp_type;\n";
-    grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name,
-                                              attrs_name, attrs_name);
+    grad_attrs_str += paddle::string::Sprintf(
+        CAST_GRAD, attrs_name, attrs_name, attrs_name, attrs_name);
   }
 
   // Handle dynamic grad attributes
@@ -2278,8 +2351,8 @@ static std::string GenerateSingleOpBase(
             "  "
             "outputs[0].emplace_back(egr::EagerUtils::GetOutputs(%s[\"%s\"])[0]"
             ");\n";
-        outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, outs_name,
-                                               grad_out_name);
+        outputs_str += paddle::string::Sprintf(
+            BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name);
       }
       num_appended_outputs++;
     } else {
@@ -2411,11 +2484,20 @@ static std::string GenerateGradNodeCCContents(
     const auto& grad_attrs = op_base_info.GetGradAttrs();
 
     const std::string& op_base_type = op_base_info.GetOpBaseType();
-    generated_grad_function_body += GenerateSingleOpBase(
-        fwd_op_type, op_base_type, fwd_inputs_name_pos_map,
-        fwd_outputs_name_pos_map, in_vars, grad_ins_fwd_slotname_map,
-        grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs,
-        grad_attrs, is_op_base_per_duplicable_input, &outs_size);
+    generated_grad_function_body +=
+        GenerateSingleOpBase(fwd_op_type,
+                             op_base_type,
+                             fwd_inputs_name_pos_map,
+                             fwd_outputs_name_pos_map,
+                             in_vars,
+                             grad_ins_fwd_slotname_map,
+                             grad_ins_grad_slotname_map,
+                             grad_outs_slotname_map,
+                             grad_ins,
+                             grad_outs,
+                             grad_attrs,
+                             is_op_base_per_duplicable_input,
+                             &outs_size);
   }
 
   if (is_op_base_per_duplicable_input) {
@@ -2436,7 +2518,9 @@ static std::string GenerateGradNodeCCContents(
       "HandleComplexGradToRealGrad(&outputs);\n"
       "  return outputs;\n";
   generated_grad_function_body =
-      paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(),
+      paddle::string::Sprintf(BWD_RETURN_TEMPLATE,
+                              fwd_op_type,
+                              in_vars.size(),
                               generated_grad_function_body);
 
   // [Generation] Get Full Grad Function
@@ -2455,8 +2539,10 @@ static std::string GenerateGradNodeCCContents(
         "this->InputMeta());\n";
   }
   std::string grad_function_str =
-      paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type,
-                              fill_zero_str, generated_grad_function_body);
+      paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE,
+                              fwd_op_type,
+                              fill_zero_str,
+                              generated_grad_function_body);
 
   VLOG(6) << "Generated returns";
 
@@ -2579,9 +2665,12 @@ static std::string GenerateGradNodeHeaderContents(
             "          %s.emplace_back( egr::TensorWrapper(eager_tensor, %s "
             "/*full_reserved*/, %s) );\n"
             "      }\n";
-        tensor_wrapper_body_str = paddle::string::Sprintf(
-            SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name,
-            struct_tensor_wrapper_name, full_reserved_str, no_need_buffer_str);
+        tensor_wrapper_body_str =
+            paddle::string::Sprintf(SET_TENSOR_WRAPPER_BODY_TEMPLATE,
+                                    tensor_wrapper_name,
+                                    struct_tensor_wrapper_name,
+                                    full_reserved_str,
+                                    no_need_buffer_str);
 
         const char* CLEAR_TENSOR_WRAPPER_TEMPLATE =
             "for (auto tw: %s)   {\n"
@@ -2603,9 +2692,12 @@ static std::string GenerateGradNodeHeaderContents(
 
         const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE =
             "%s = egr::TensorWrapper(%s, %s /*full_reserved*/, %s);\n";
-        tensor_wrapper_body_str = paddle::string::Sprintf(
-            SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name,
-            tensor_wrapper_name, full_reserved_str, no_need_buffer_str);
+        tensor_wrapper_body_str =
+            paddle::string::Sprintf(SET_TENSOR_WRAPPER_BODY_TEMPLATE,
+                                    struct_tensor_wrapper_name,
+                                    tensor_wrapper_name,
+                                    full_reserved_str,
+                                    no_need_buffer_str);
 
         const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = "   %s.clear();\n";
         clear_tensor_wrappers_str += paddle::string::Sprintf(
@@ -2614,19 +2706,33 @@ static std::string GenerateGradNodeHeaderContents(
       std::string full_reserved_signature_str = "bool full_reserved";
       const char* SET_TENSOR_WRAPPER_TEMPLATE =
           "   void SetTensorWrapper%s(%s, %s) {\n     %s\n   }\n";
-      set_tensor_wrappers_str += paddle::string::Sprintf(
-          SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
-          tensor_wrapper_arg_str, full_reserved_signature_str,
-          tensor_wrapper_body_str);
+      set_tensor_wrappers_str +=
+          paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE,
+                                  tensor_wrapper_name,
+                                  tensor_wrapper_arg_str,
+                                  full_reserved_signature_str,
+                                  tensor_wrapper_body_str);
     }
   }
   VLOG(6) << "Generated TensorWrapper";
 
-  std::string grad_node_str = paddle::string::Sprintf(
-      GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
-      op_type, clear_tensor_wrappers_str, op_type, op_type, op_type,
-      set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str,
-      attr_members_str);
+  std::string grad_node_str =
+      paddle::string::Sprintf(GRAD_NODE_TEMPLATE,
+                              op_type,
+                              op_type,
+                              op_type,
+                              op_type,
+                              op_type,
+                              op_type,
+                              op_type,
+                              clear_tensor_wrappers_str,
+                              op_type,
+                              op_type,
+                              op_type,
+                              set_tensor_wrappers_str,
+                              set_attr_map_str,
+                              tensor_wrapper_members_str,
+                              attr_members_str);
 
   return grad_node_str;
 }
@@ -2760,9 +2866,11 @@ static std::string GenerateCoreOpsReturnsInfo() {
   std::string core_ops_returns_info_init_str =
       ConvertCoreOpsInfosToString(core_ops_returns_info);
 
-  std::string core_ops_info_str = paddle::string::Sprintf(
-      Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str,
-      core_ops_args_type_info_init_str, core_ops_returns_info_init_str);
+  std::string core_ops_info_str =
+      paddle::string::Sprintf(Core_Ops_Returns_MAP_TEMPLATE,
+                              core_ops_args_info_init_str,
+                              core_ops_args_type_info_init_str,
+                              core_ops_returns_info_init_str);
 
   return core_ops_info_str;
 }
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
index 7ca5fc833ea..4a21d95eefe 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -71,7 +71,7 @@ PARSE_PYTHON_C_ARGS_TEMPLATE = \
 
 
 RECORD_EVENT_TEMPLATE = \
-"    paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);"
+"paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::UserDefined, 1);"
 
 
 RETURN_INPLACE_PYOBJECT_TEMPLATE = \
@@ -253,6 +253,7 @@ NAMESPACE_WRAPPER_TEMPLATE = \
 ## Generator Classes ##
 #######################
 class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
+
     def __init__(self, forward_api_contents, namespace):
         # Members from Parent:
         #self.namespace
@@ -265,7 +266,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
         #self.forward_outputs_position_map
         #self.optional_inputs
         #self.no_need_buffers
-        #self.intermediate_outputs   
+        #self.intermediate_outputs
         #self.inplace_map
         FunctionGeneratorBase.__init__(self, forward_api_contents, namespace)
 
@@ -327,8 +328,8 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
         set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str)
 
         # Generate Dygraph Function Call Logic
-        num_args = len(forward_inputs_position_map.keys()) + len(
-            orig_forward_attrs_list)
+        num_args = len(
+            forward_inputs_position_map.keys()) + len(orig_forward_attrs_list)
         dygraph_function_call_list = ["" for i in range(num_args)]
         for name, (_, pos) in forward_inputs_position_map.items():
             dygraph_function_call_list[pos] = f"{name}"
@@ -336,7 +337,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
             dygraph_function_call_list[pos] = f"{name}"
         dygraph_function_call_str = ",".join(dygraph_function_call_list)
 
-        # Generate Python-C Function Definitions 
+        # Generate Python-C Function Definitions
         if is_forward_only:
             fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
                 "paddle::experimental::", namespace, forward_api_name)
@@ -441,8 +442,9 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
 
 
 class PythonCYamlGenerator(YamlGeneratorBase):
+
     def __init__(self, path):
-        # Parent members: 
+        # Parent members:
         # self.namespace
         # self.api_yaml_path
         # self.forward_api_list
@@ -457,8 +459,8 @@ class PythonCYamlGenerator(YamlGeneratorBase):
         forward_api_list = self.forward_api_list
 
         for forward_api_content in forward_api_list:
-            f_generator = PythonCSingleFunctionGenerator(forward_api_content,
-                                                         namespace)
+            f_generator = PythonCSingleFunctionGenerator(
+                forward_api_content, namespace)
             status = f_generator.run()
 
             if status == True:
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index b1f31e20be4..aa8b6344d1e 100644
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -30,10 +30,10 @@
 namespace egr {
 
 /*
-* GeneralGrad is Helpper class to implement custom grad operation between
-* outputs and inputs.
-*
-* **/
+ * GeneralGrad is a helper class to implement custom grad operation between
+ * outputs and inputs.
+ *
+ * **/
 class GeneralGrad {
  public:
   static GeneralGrad& Instance() { return *general_grad_; }
@@ -64,7 +64,8 @@ class GeneralGrad {
                                 paddle::platform::errors::Fatal(
                                     "There is no grad op for %s:[%d] or it's"
                                     "stop_gradient=True.",
-                                    msg, i));
+                                    msg,
+                                    i));
         if (is_no_grad_vars) {
           (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta;
         } else {  // normal input
@@ -248,7 +249,8 @@ class GeneralGrad {
 
   std::vector<paddle::experimental::Tensor> GetResults(
       const std::vector<paddle::experimental::Tensor>& inputs,
-      bool allow_unused, bool create_graph) {
+      bool allow_unused,
+      bool create_graph) {
     VLOG(6) << "Running in GetResults";
     if (inputs.empty()) return {};
 
@@ -276,7 +278,8 @@ class GeneralGrad {
         tensor_auto_grad_meta->SetStopGradient(!create_graph);
         results.emplace_back(iter->second);
       } else {
-        PADDLE_ENFORCE_EQ(allow_unused, true,
+        PADDLE_ENFORCE_EQ(allow_unused,
+                          true,
                           paddle::platform::errors::InvalidArgument(
                               "The %d-th input does not appear in the backward "
                               "graph. Please check the input tensor or set "
@@ -493,7 +496,8 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
 void EnforceGradNodeHasInput(GradNodeBase* node) {
   VLOG(6) << "Running in EnforceGradNodeHasInput";
   PADDLE_ENFORCE_NE(
-      node->IsTensorWrappersCleared(), true,
+      node->IsTensorWrappersCleared(),
+      true,
       paddle::platform::errors::Fatal(
           "The TensorWrappers of %s do not exist. This may be because:\n"
           "You calculate backward twice for the same subgraph without "
@@ -509,10 +513,13 @@ void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
   for (auto in : inputs) {
     AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
     PADDLE_ENFORCE_EQ(
-        visisted_ins.count(auto_grad_meta), 0,
+        visisted_ins.count(auto_grad_meta),
+        0,
         paddle::platform::errors::AlreadyExists(
-            "%s contain duplicate tensor %s, please check %s carefully.", msg,
-            in.name(), msg));
+            "%s contain duplicate tensor %s, please check %s carefully.",
+            msg,
+            in.name(),
+            msg));
     visisted_ins.insert(auto_grad_meta);
   }
 }
@@ -522,7 +529,8 @@ GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();
 std::vector<paddle::experimental::Tensor> RunBackward(
     const std::vector<paddle::experimental::Tensor>& tensors,  // output
     const std::vector<paddle::experimental::Tensor>& grad_tensors,
-    bool retain_graph, bool create_graph = false,
+    bool retain_graph,
+    bool create_graph = false,
     const std::vector<paddle::experimental::Tensor>& inputs = {},
     bool allow_unused = false,
     const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
@@ -631,8 +639,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
 
   if (is_general_grad) {
     // Prepare several vital preprocess for GeneralGrad
-    GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue,
-                                                   node_input_buffers_dict);
+    GeneralGrad::Instance().PreparedForGeneralGrad(
+        inputs, no_grad_vars, &queue, node_input_buffers_dict);
   }
 
   VLOG(6) << " startup_ops' size is :" << queue.size();
@@ -651,7 +659,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
 
     paddle::platform::RecordEvent node_record_event(
         std::string((*node).name()) + " grad_node",
-        paddle::platform::TracerEventType::Operator, 1);
+        paddle::platform::TracerEventType::Operator,
+        1);
 
     if (queue.size() > 1 && node_in_degree_map[node] != 0) {
       queue.pop();
@@ -716,7 +725,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
                        "Number of edges should be either empty ( for leaf node "
                        ") or the same as number of output grad tensors, but we "
                        "got edges size is: %d, grad_output size is: %d",
-                       edges.size(), grad_output_tensors.size()));
+                       edges.size(),
+                       grad_output_tensors.size()));
 
     for (size_t i = 0; i < edges.size(); i++) {
       for (size_t j = 0; j < edges[i].size(); j++) {
@@ -739,7 +749,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
         }
 
         PADDLE_ENFORCE_LT(
-            j, grad_output_tensors[i].size(),
+            j,
+            grad_output_tensors[i].size(),
             paddle::platform::errors::Fatal(
                 "Rank of grad_output_tensors should be less than "
                 "grad_output_tensors[i].size(), which is: %d. This error may "
@@ -771,9 +782,10 @@ std::vector<paddle::experimental::Tensor> RunBackward(
         VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
                 << ", rank: " << edge_rank.second;
 
-        node_input_buffers_dict[next_node]->add(
-            edge_rank.first, edge_rank.second, grad_output_tensor,
-            create_graph);
+        node_input_buffers_dict[next_node]->add(edge_rank.first,
+                                                edge_rank.second,
+                                                grad_output_tensor,
+                                                create_graph);
 
         // Update queue
         node_in_degree_map[next_node]--;
@@ -810,7 +822,7 @@ void Backward(
     bool retain_graph) {
   VLOG(6) << "Run in Backward";
   paddle::platform::RecordEvent backward_record_event(
-      "backward", paddle::platform::TracerEventType::Operator, 1);
+      "backward", paddle::platform::TracerEventType::UserDefined, 1);
   RunBackward(tensors, grad_tensors, retain_graph);
   phi::autotune::AutoTuneStatus::Instance().Update();
 }
@@ -819,14 +831,22 @@ std::vector<paddle::experimental::Tensor> Grad(
     const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
     const std::vector<paddle::experimental::Tensor>& inputs,
     const std::vector<paddle::experimental::Tensor>& grad_tensors,
-    bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused,
+    bool retain_graph,
+    bool create_graph,
+    bool only_inputs,
+    bool allow_unused,
     const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
   VLOG(6) << "Run in Grad";
 
   DuplicateCheck(inputs, true /* is_input */);
   DuplicateCheck(tensors, false /* is_input */);
 
-  return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs,
-                     allow_unused, no_grad_vars);
+  return RunBackward(tensors,
+                     grad_tensors,
+                     retain_graph,
+                     create_graph,
+                     inputs,
+                     allow_unused,
+                     no_grad_vars);
 }
 }  // namespace egr
diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc
index e8fe5412721..ad522651723 100644
--- a/paddle/fluid/platform/profiler/chrometracing_logger.cc
+++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc
@@ -588,7 +588,7 @@ void ChromeTracingLogger::StartLog() {
         std::string(
             R"JSON(
     {
-       "id": %d, "name": "%s", "totalGlobalMem": %u,
+       "id": %d, "name": "%s", "totalGlobalMem": %llu,
       "computeMajor": %d, "computeMinor": %d,
       "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
       "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
@@ -618,7 +618,7 @@ void ChromeTracingLogger::StartLog() {
         std::string(
             R"JSON(
     {
-       "id": %d, "name": "%s", "totalGlobalMem": %u,
+       "id": %d, "name": "%s", "totalGlobalMem": %llu,
       "computeMajor": %d, "computeMinor": %d,
       "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
       "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
index 7079d9678b2..88e42d2c5a5 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -19,6 +19,7 @@ import paddle.profiler as profiler
 
 
 class HostPythonNode:
+
     def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
         self.name = name
         self.type = type
@@ -32,6 +33,7 @@ class HostPythonNode:
 
 
 class DevicePythonNode:
+
     def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
                  stream_id):
         self.name = name
@@ -44,6 +46,7 @@ class DevicePythonNode:
 
 
 class TestProfilerStatistic(unittest.TestCase):
+
     def test_statistic_case1(self):
         root_node = HostPythonNode('Root Node',
                                    profiler.TracerEventType.UserDefined, 0,
@@ -54,14 +57,16 @@ class TestProfilerStatistic(unittest.TestCase):
         dataloader_node = HostPythonNode('Dataloader',
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
-        yolonet_node = HostPythonNode(
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
+        yolonet_node = HostPythonNode('Yolov3Net',
+                                      profiler.TracerEventType.Forward, 50, 110,
+                                      1000, 1001)
 
-        userdefined_node = HostPythonNode('Communication Time',
-                                          profiler.TracerEventType.UserDefined,
-                                          100, 110, 1000, 1001)
+        userdefined_node = HostPythonNode(
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
+            100, 110, 1000, 1001)
 
         communication_node = HostPythonNode(
             'Communication', profiler.TracerEventType.Communication, 105, 110,
@@ -72,8 +77,9 @@ class TestProfilerStatistic(unittest.TestCase):
         optimization_node = HostPythonNode(
             'Optimization', profiler.TracerEventType.Optimization, 220, 300,
             1000, 1001)
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 40,
+                                     1000, 1001)
         sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                               profiler.TracerEventType.Operator,
                                               60, 100, 1000, 1001)
@@ -92,10 +98,12 @@ class TestProfilerStatistic(unittest.TestCase):
         conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                            profiler.TracerEventType.CudaRuntime,
                                            35, 40, 1000, 1001)
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
-        conv2d_memcpy = DevicePythonNode(
-            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
+                                         profiler.TracerEventType.Memcpy, 50,
+                                         60, 0, 0, 0)
         sync_batch_norm_infer_shape = HostPythonNode(
             'sync_batch_norm::infer_shape',
             profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
@@ -146,8 +154,8 @@ class TestProfilerStatistic(unittest.TestCase):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
 
@@ -180,7 +188,7 @@ class TestProfilerStatistic(unittest.TestCase):
                 0, profiler.TracerEventType.Memcpy), 60)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.UserDefined), 25)
+                profiler.TracerEventType.UserDefined), 15)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
                 profiler.TracerEventType.Communication), 5)
@@ -200,8 +208,9 @@ class TestProfilerStatistic(unittest.TestCase):
             0)
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
-        self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
-                         .general_gpu_time, 60)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
@@ -222,14 +231,16 @@ class TestProfilerStatistic(unittest.TestCase):
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
 
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
-        yolonet_node = HostPythonNode(
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
+        yolonet_node = HostPythonNode('Yolov3Net',
+                                      profiler.TracerEventType.Forward, 50, 110,
+                                      1000, 1001)
 
-        userdefined_node = HostPythonNode('Communication Time',
-                                          profiler.TracerEventType.UserDefined,
-                                          100, 110, 1000, 1001)
+        userdefined_node = HostPythonNode(
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
+            100, 110, 1000, 1001)
         allreduce_launchkernel0 = HostPythonNode(
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
             1000, 1001)
@@ -263,8 +274,9 @@ class TestProfilerStatistic(unittest.TestCase):
         optimization_node = HostPythonNode(
             'Optimization', profiler.TracerEventType.Optimization, 220, 300,
             1000, 1001)
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 40,
+                                     1000, 1001)
         sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                               profiler.TracerEventType.Operator,
                                               60, 100, 1000, 1001)
@@ -283,10 +295,12 @@ class TestProfilerStatistic(unittest.TestCase):
         conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                            profiler.TracerEventType.CudaRuntime,
                                            35, 40, 1000, 1001)
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
-        conv2d_memcpy = DevicePythonNode(
-            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
+                                         profiler.TracerEventType.Memcpy, 50,
+                                         60, 0, 0, 0)
         sync_batch_norm_infer_shape = HostPythonNode(
             'sync_batch_norm::infer_shape',
             profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
@@ -363,8 +377,8 @@ class TestProfilerStatistic(unittest.TestCase):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
         distributed_summary = statistic_data.distributed_summary
@@ -398,7 +412,7 @@ class TestProfilerStatistic(unittest.TestCase):
                 0, profiler.TracerEventType.Memcpy), 60)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.UserDefined), 25)
+                profiler.TracerEventType.UserDefined), 15)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
                 profiler.TracerEventType.Communication), 5)
@@ -433,8 +447,9 @@ class TestProfilerStatistic(unittest.TestCase):
             0)
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
-        self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
-                         .general_gpu_time, 60)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
@@ -454,8 +469,9 @@ class TestProfilerStatistic(unittest.TestCase):
         dataloader_node = HostPythonNode('Dataloader',
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
 
         backward_node = HostPythonNode('Gradient Backward',
                                        profiler.TracerEventType.Backward, 120,
@@ -463,12 +479,13 @@ class TestProfilerStatistic(unittest.TestCase):
         optimization_node = HostPythonNode(
             'Optimization', profiler.TracerEventType.Optimization, 220, 300,
             1000, 1001)
-        userdefined_node = HostPythonNode('Communication Time',
-                                          profiler.TracerEventType.UserDefined,
-                                          60, 70, 1000, 1001)
+        userdefined_node = HostPythonNode(
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
+            60, 70, 1000, 1001)
 
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 25,
+                                     1000, 1001)
 
         conv2d_infer_shape = HostPythonNode(
             'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
@@ -480,8 +497,9 @@ class TestProfilerStatistic(unittest.TestCase):
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25,
             1000, 1001)
 
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         35, 0, 0, 0)
         another_kernel = DevicePythonNode(
             'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
             profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
@@ -500,15 +518,16 @@ class TestProfilerStatistic(unittest.TestCase):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
 
         self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
         self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
-        self.assertEqual(event_summary.userdefined_items['Communication Time']
-                         .general_gpu_time, 0)
+        self.assertEqual(
+            event_summary.userdefined_items['Communication Time'].
+            general_gpu_time, 0)
         for sort_key in [
                 profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
                 profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
@@ -516,12 +535,11 @@ class TestProfilerStatistic(unittest.TestCase):
                 profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
         ]:
             print(
-                profiler.profiler_statistic._build_table(
-                    statistic_data,
-                    sorted_by=sort_key,
-                    op_detail=True,
-                    thread_sep=False,
-                    time_unit='ms'))
+                profiler.profiler_statistic._build_table(statistic_data,
+                                                         sorted_by=sort_key,
+                                                         op_detail=True,
+                                                         thread_sep=False,
+                                                         time_unit='ms'))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index 50aa3a1f11f..6f5894b590c 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -197,8 +197,8 @@ class TimeRangeSummary:
     def __init__(self):
         self.CPUTimeRange = collections.defaultdict(list)
         self.GPUTimeRange = collections.defaultdict(
-            lambda: collections.defaultdict(list)
-        )  # GPU events should be divided into different devices
+            lambda: collections.defaultdict(
+                list))  # GPU events should be divided into different devices
         self.CPUTimeRangeSum = collections.defaultdict(int)
         self.GPUTimeRangeSum = collections.defaultdict(
             lambda: collections.defaultdict(int))
@@ -212,8 +212,8 @@ class TimeRangeSummary:
         for threadid, hostnodes in thread2hostnodes.items():
             CPUTimeRange = collections.defaultdict(list)
             GPUTimeRange = collections.defaultdict(
-                lambda: collections.defaultdict(lambda: collections.defaultdict(list))
-            )  # device_id/type/stream_id
+                lambda: collections.defaultdict(lambda: collections.defaultdict(
+                    list)))  # device_id/type/stream_id
             for hostnode in hostnodes[1:]:  #skip root node
                 CPUTimeRange[hostnode.type].append(
                     (hostnode.start_ns, hostnode.end_ns))
@@ -235,8 +235,8 @@ class TimeRangeSummary:
             for device_id, device_time_ranges in GPUTimeRange.items():
                 for event_type, event_time_ranges in device_time_ranges.items():
                     for stream_id, time_ranges in event_time_ranges.items():
-                        time_ranges = merge_self_ranges(
-                            time_ranges, is_sorted=False)
+                        time_ranges = merge_self_ranges(time_ranges,
+                                                        is_sorted=False)
                         self.GPUTimeRange[device_id][event_type] = merge_ranges(
                             self.GPUTimeRange[device_id][event_type],
                             time_ranges,
@@ -310,25 +310,27 @@ class DistributedSummary:
                         for devicenode in runtimenode.device_node:
                             if devicenode.type == TracerEventType.Kernel:
                                 if 'nccl' in devicenode.name.lower():
-                                    self.gpu_communication_range.append((
-                                        devicenode.start_ns, devicenode.end_ns))
+                                    self.gpu_communication_range.append(
+                                        (devicenode.start_ns,
+                                         devicenode.end_ns))
                                 else:
-                                    self.computation_range.append((
-                                        devicenode.start_ns, devicenode.end_ns))
+                                    self.computation_range.append(
+                                        (devicenode.start_ns,
+                                         devicenode.end_ns))
         self.cpu_calls = len(set(self.cpu_communication_range))
         self.gpu_calls = len(set(self.gpu_communication_range))
         self.cpu_communication_range = merge_self_ranges(
             self.cpu_communication_range, is_sorted=False)
         self.gpu_communication_range = merge_self_ranges(
             self.gpu_communication_range, is_sorted=False)
-        self.communication_range = merge_ranges(
-            self.cpu_communication_range,
-            self.gpu_communication_range,
-            is_sorted=True)
-        self.computation_range = merge_self_ranges(
-            self.computation_range, is_sorted=False)
-        self.overlap_range = intersection_ranges(
-            self.communication_range, self.computation_range, is_sorted=True)
+        self.communication_range = merge_ranges(self.cpu_communication_range,
+                                                self.gpu_communication_range,
+                                                is_sorted=True)
+        self.computation_range = merge_self_ranges(self.computation_range,
+                                                   is_sorted=False)
+        self.overlap_range = intersection_ranges(self.communication_range,
+                                                 self.computation_range,
+                                                 is_sorted=True)
 
 
 class EventSummary:
@@ -337,6 +339,7 @@ class EventSummary:
     """
 
     class DeviceItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -360,6 +363,7 @@ class EventSummary:
             self.add_gpu_time(node.end_ns - node.start_ns)
 
     class OperatorItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -430,6 +434,7 @@ class EventSummary:
                     self.devices[name].add_item(devicenode)
 
     class GeneralItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -513,7 +518,8 @@ class EventSummary:
                         or 'memset' in host_statistic_node.name.lower():
                         self.add_memory_manipulation_item(host_statistic_node)
                     else:
-                        self.add_userdefined_item(host_statistic_node)
+                        if host_statistic_node.type == TracerEventType.PythonUserDefined:
+                            self.add_userdefined_item(host_statistic_node)
             self.add_kernel_item(host_statistic_nodes[0])
 
         for threadid, root_statistic_node in node_statistic_trees.items():
@@ -688,13 +694,14 @@ def _build_table(statistic_data,
     append(row_format.format(*headers))
     append(header_sep)
     row_values = [
-        'CPU(Process)', format_ratio(
-            float(statistic_data.extra_info['Process Cpu Utilization']))
+        'CPU(Process)',
+        format_ratio(float(
+            statistic_data.extra_info['Process Cpu Utilization']))
     ]
     append(row_format.format(*row_values))
     row_values = [
-        'CPU(System)', format_ratio(
-            float(statistic_data.extra_info['System Cpu Utilization']))
+        'CPU(System)',
+        format_ratio(float(statistic_data.extra_info['System Cpu Utilization']))
     ]
     append(row_format.format(*row_values))
     for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
@@ -783,20 +790,22 @@ def _build_table(statistic_data,
             TracerEventType.
             Communication] = statistic_data.distributed_summary.gpu_calls
 
-    sorted_items = sorted(
-        cpu_type_time.items(), key=lambda x: x[1], reverse=True)
+    sorted_items = sorted(cpu_type_time.items(),
+                          key=lambda x: x[1],
+                          reverse=True)
     event_type, time = sorted_items[0]
     row_values = [
         '{}'.format(str(event_type).split('.')[1]), cpu_call_times[event_type],
-        format_time(
-            time, unit=time_unit), format_ratio(float(time) / total_time)
+        format_time(time, unit=time_unit),
+        format_ratio(float(time) / total_time)
     ]
     append(row_format.format(*row_values))
     for event_type, time in sorted_items[1:]:
         row_values = [
             '  {}'.format(str(event_type).split('.')[1]),
-            cpu_call_times[event_type], format_time(
-                time, unit=time_unit), format_ratio(float(time) / total_time)
+            cpu_call_times[event_type],
+            format_time(time, unit=time_unit),
+            format_ratio(float(time) / total_time)
         ]
         append(row_format.format(*row_values))
     append(header_sep)
@@ -806,8 +815,9 @@ def _build_table(statistic_data,
     for event_type, time in gpu_type_time.items():
         row_values = [
             '  {}'.format(str(event_type).split('.')[1]),
-            gpu_call_times[event_type], format_time(
-                time, unit=time_unit), format_ratio(float(time) / total_time)
+            gpu_call_times[event_type],
+            format_time(time, unit=time_unit),
+            format_ratio(float(time) / total_time)
         ]
         append(row_format.format(*row_values))
 
@@ -851,24 +861,16 @@ def _build_table(statistic_data,
                 row_values = [
                     '{}'.format(name), item.call,
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.cpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_cpu_time, unit=time_unit),
-                        format_time(
-                            item.max_cpu_time, unit=time_unit),
-                        format_time(
-                            item.min_cpu_time, unit=time_unit),
+                        format_time(item.cpu_time, unit=time_unit),
+                        format_time(item.avg_cpu_time, unit=time_unit),
+                        format_time(item.max_cpu_time, unit=time_unit),
+                        format_time(item.min_cpu_time, unit=time_unit),
                         format_ratio(float(item.cpu_time) / total_time)),
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.gpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_gpu_time, unit=time_unit),
-                        format_time(
-                            item.max_gpu_time, unit=time_unit),
-                        format_time(
-                            item.min_gpu_time, unit=time_unit),
+                        format_time(item.gpu_time, unit=time_unit),
+                        format_time(item.avg_gpu_time, unit=time_unit),
+                        format_time(item.max_gpu_time, unit=time_unit),
+                        format_time(item.min_gpu_time, unit=time_unit),
                         format_ratio(gpu_ratio))
                 ]
                 all_row_values.append(row_values)
@@ -884,12 +886,10 @@ def _build_table(statistic_data,
             gpu_ratio = float(other_gpu_time) / gpu_total_time
         row_values = [
             '  Others', '-', '{} / - / - / - / {}'.format(
-                format_time(
-                    other_time, unit=time_unit),
+                format_time(other_time, unit=time_unit),
                 format_ratio(float(other_time) / total_time)),
             '{} / - / - / - / {}'.format(
-                format_time(
-                    other_gpu_time, unit=time_unit),
+                format_time(other_gpu_time, unit=time_unit),
                 format_ratio(gpu_ratio))
         ]
         all_row_values.append(row_values)
@@ -971,28 +971,28 @@ def _build_table(statistic_data,
         overlap_time = sum_ranges(
             statistic_data.distributed_summary.overlap_range)
         row_values = [
-            'ProfileStep', format_time(
-                total_time, unit=time_unit),
+            'ProfileStep',
+            format_time(total_time, unit=time_unit),
             format_ratio(float(total_time) / total_time)
         ]
         append(row_format.format(*row_values))
         row_values = [
-            '  Communication', format_time(
-                communication_time, unit=time_unit),
+            '  Communication',
+            format_time(communication_time, unit=time_unit),
             format_ratio(float(communication_time) / total_time)
         ]
         append(row_format.format(*row_values))
 
         row_values = [
-            '  Computation', format_time(
-                computation_time, unit=time_unit),
+            '  Computation',
+            format_time(computation_time, unit=time_unit),
             format_ratio(float(computation_time) / total_time)
         ]
         append(row_format.format(*row_values))
 
         row_values = [
-            '  Overlap', format_time(
-                overlap_time, unit=time_unit),
+            '  Overlap',
+            format_time(overlap_time, unit=time_unit),
             format_ratio(float(overlap_time) / total_time)
         ]
         append(row_format.format(*row_values))
@@ -1026,39 +1026,35 @@ def _build_table(statistic_data,
         for thread_id, items in thread_items.items():
             all_row_values.append("Thread: {}".format(thread_id))
             if sorted_by == SortedKeys.CPUTotal:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUAvg:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].avg_cpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].avg_cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUMax:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].max_cpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].max_cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUMin:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].min_cpu_time)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].min_cpu_time)
             elif sorted_by == SortedKeys.GPUTotal:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUAvg:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].avg_general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].avg_general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUMax:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].max_general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].max_general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUMin:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].min_general_gpu_time)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].min_general_gpu_time)
             total_op_cpu_time = 0
             total_op_gpu_time = 0
 
@@ -1077,24 +1073,16 @@ def _build_table(statistic_data,
                     gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time
                 row_values = [
                     name, item.call, '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.cpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_cpu_time, unit=time_unit),
-                        format_time(
-                            item.max_cpu_time, unit=time_unit),
-                        format_time(
-                            item.min_cpu_time, unit=time_unit),
+                        format_time(item.cpu_time, unit=time_unit),
+                        format_time(item.avg_cpu_time, unit=time_unit),
+                        format_time(item.max_cpu_time, unit=time_unit),
+                        format_time(item.min_cpu_time, unit=time_unit),
                         format_ratio(cpu_ratio)),
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.max_general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.min_general_gpu_time, unit=time_unit),
+                        format_time(item.general_gpu_time, unit=time_unit),
+                        format_time(item.avg_general_gpu_time, unit=time_unit),
+                        format_time(item.max_general_gpu_time, unit=time_unit),
+                        format_time(item.min_general_gpu_time, unit=time_unit),
                         format_ratio(gpu_ratio))
                 ]
                 all_row_values.append(row_values)
@@ -1117,28 +1105,24 @@ def _build_table(statistic_data,
                         row_values = [
                             '  {}'.format(innerop_name), innerop_node.call,
                             '{} / {} / {} / {} / {}'.format(
-                                format_time(
-                                    innerop_node.cpu_time, unit=time_unit),
-                                format_time(
-                                    innerop_node.avg_cpu_time, unit=time_unit),
-                                format_time(
-                                    innerop_node.max_cpu_time, unit=time_unit),
-                                format_time(
-                                    innerop_node.min_cpu_time, unit=time_unit),
+                                format_time(innerop_node.cpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.avg_cpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.max_cpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.min_cpu_time,
+                                            unit=time_unit),
                                 format_ratio(cpu_ratio)),
                             '{} / {} / {} / {} / {}'.format(
-                                format_time(
-                                    innerop_node.general_gpu_time,
-                                    unit=time_unit),
-                                format_time(
-                                    innerop_node.avg_general_gpu_time,
-                                    unit=time_unit),
-                                format_time(
-                                    innerop_node.max_general_gpu_time,
-                                    unit=time_unit),
-                                format_time(
-                                    innerop_node.min_general_gpu_time,
-                                    unit=time_unit),
+                                format_time(innerop_node.general_gpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.avg_general_gpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.max_general_gpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.min_general_gpu_time,
+                                            unit=time_unit),
                                 format_ratio(gpu_ratio))
                         ]
                         all_row_values.append(row_values)
@@ -1148,8 +1132,8 @@ def _build_table(statistic_data,
                                 gpu_ratio = 0
                             else:
                                 gpu_ratio = float(
-                                    device_node.
-                                    gpu_time) / innerop_node.general_gpu_time
+                                    device_node.gpu_time
+                                ) / innerop_node.general_gpu_time
                             if len(device_node_name) + 4 > name_column_width:
                                 device_node_name = device_node_name[:
                                                                     name_column_width
@@ -1159,17 +1143,14 @@ def _build_table(statistic_data,
                                 '    {}'.format(device_node_name),
                                 device_node.call, '- / - / - / - / -',
                                 '{} / {} / {} / {} / {}'.format(
-                                    format_time(
-                                        device_node.gpu_time, unit=time_unit),
-                                    format_time(
-                                        device_node.avg_gpu_time,
-                                        unit=time_unit),
-                                    format_time(
-                                        device_node.max_gpu_time,
-                                        unit=time_unit),
-                                    format_time(
-                                        device_node.min_gpu_time,
-                                        unit=time_unit),
+                                    format_time(device_node.gpu_time,
+                                                unit=time_unit),
+                                    format_time(device_node.avg_gpu_time,
+                                                unit=time_unit),
+                                    format_time(device_node.max_gpu_time,
+                                                unit=time_unit),
+                                    format_time(device_node.min_gpu_time,
+                                                unit=time_unit),
                                     format_ratio(gpu_ratio))
                             ]
                             all_row_values.append(row_values)
@@ -1188,14 +1169,14 @@ def _build_table(statistic_data,
                             '  {}'.format(device_node_name), device_node.call,
                             '- / - / - / - / -',
                             '{} / {} / {} / {} / {}'.format(
-                                format_time(
-                                    device_node.gpu_time, unit=time_unit),
-                                format_time(
-                                    device_node.avg_gpu_time, unit=time_unit),
-                                format_time(
-                                    device_node.max_gpu_time, unit=time_unit),
-                                format_time(
-                                    device_node.min_gpu_time, unit=time_unit),
+                                format_time(device_node.gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.avg_gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.max_gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.min_gpu_time,
+                                            unit=time_unit),
                                 format_ratio(gpu_ratio))
                         ]
                         all_row_values.append(row_values)
@@ -1249,21 +1230,20 @@ def _build_table(statistic_data,
         all_row_values = []
         kernel_items = statistic_data.event_summary.kernel_items
         if sorted_by == SortedKeys.GPUAvg:
-            sorted_items = sorted(
-                kernel_items.items(),
-                key=lambda x: x[1].avg_gpu_time,
-                reverse=True)
+            sorted_items = sorted(kernel_items.items(),
+                                  key=lambda x: x[1].avg_gpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.GPUMax:
-            sorted_items = sorted(
-                kernel_items.items(),
-                key=lambda x: x[1].max_gpu_time,
-                reverse=True)
+            sorted_items = sorted(kernel_items.items(),
+                                  key=lambda x: x[1].max_gpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.GPUMin:
-            sorted_items = sorted(
-                kernel_items.items(), key=lambda x: x[1].min_gpu_time)
+            sorted_items = sorted(kernel_items.items(),
+                                  key=lambda x: x[1].min_gpu_time)
         else:
-            sorted_items = sorted(
-                kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+            sorted_items = sorted(kernel_items.items(),
+                                  key=lambda x: x[1].gpu_time,
+                                  reverse=True)
 
         total_kernel_gpu_time = 0
         for name, item in sorted_items:
@@ -1277,14 +1257,10 @@ def _build_table(statistic_data,
                 name,
                 item.call,
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.gpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_gpu_time, unit=time_unit),
-                    format_time(
-                        item.max_gpu_time, unit=time_unit),
-                    format_time(
-                        item.min_gpu_time, unit=time_unit),
+                    format_time(item.gpu_time, unit=time_unit),
+                    format_time(item.avg_gpu_time, unit=time_unit),
+                    format_time(item.max_gpu_time, unit=time_unit),
+                    format_time(item.min_gpu_time, unit=time_unit),
                     format_ratio(gpu_ratio)),
             ]
             all_row_values.append(row_values)
@@ -1349,24 +1325,16 @@ def _build_table(statistic_data,
                 name,
                 item.call,
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.cpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_cpu_time, unit=time_unit),
-                    format_time(
-                        item.max_cpu_time, unit=time_unit),
-                    format_time(
-                        item.min_cpu_time, unit=time_unit),
+                    format_time(item.cpu_time, unit=time_unit),
+                    format_time(item.avg_cpu_time, unit=time_unit),
+                    format_time(item.max_cpu_time, unit=time_unit),
+                    format_time(item.min_cpu_time, unit=time_unit),
                     format_ratio(float(item.cpu_time) / total_time)),
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.max_general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.min_general_gpu_time, unit=time_unit),
+                    format_time(item.general_gpu_time, unit=time_unit),
+                    format_time(item.avg_general_gpu_time, unit=time_unit),
+                    format_time(item.max_general_gpu_time, unit=time_unit),
+                    format_time(item.min_general_gpu_time, unit=time_unit),
                     format_ratio(gpu_ratio)),
             ]
             all_row_values.append(row_values)
@@ -1429,39 +1397,35 @@ def _build_table(statistic_data,
         for thread_id, items in userdefined_thread_items.items():
             all_row_values.append("Thread: {}".format(thread_id))
             if sorted_by == SortedKeys.CPUTotal:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUAvg:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].avg_cpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].avg_cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUMax:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].max_cpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].max_cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUMin:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].min_cpu_time)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].min_cpu_time)
             elif sorted_by == SortedKeys.GPUTotal:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUAvg:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].avg_general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].avg_general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUMax:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].max_general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].max_general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUMin:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].min_general_gpu_time)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].min_general_gpu_time)
 
             for name, item in sorted_items:
                 if gpu_total_time == 0:
@@ -1472,24 +1436,16 @@ def _build_table(statistic_data,
                     name,
                     item.call,
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.cpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_cpu_time, unit=time_unit),
-                        format_time(
-                            item.max_cpu_time, unit=time_unit),
-                        format_time(
-                            item.min_cpu_time, unit=time_unit),
+                        format_time(item.cpu_time, unit=time_unit),
+                        format_time(item.avg_cpu_time, unit=time_unit),
+                        format_time(item.max_cpu_time, unit=time_unit),
+                        format_time(item.min_cpu_time, unit=time_unit),
                         format_ratio(float(item.cpu_time) / total_time)),
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.max_general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.min_general_gpu_time, unit=time_unit),
+                        format_time(item.general_gpu_time, unit=time_unit),
+                        format_time(item.avg_general_gpu_time, unit=time_unit),
+                        format_time(item.max_general_gpu_time, unit=time_unit),
+                        format_time(item.min_general_gpu_time, unit=time_unit),
                         format_ratio(gpu_ratio)),
                 ]
                 all_row_values.append(row_values)
-- 
GitLab