From 6de2058165a81f0c39b441206a1294d058706a26 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 2 Aug 2022 09:58:53 +0800 Subject: [PATCH] Fix operator type record in profiler [cherry-pick PR44582] (#44654) * fix record event for operator type in new dygraph (#44582) * fix new dygraph record event for op * update unit test * fix file mode --- .../auto_code_generator/eager_generator.cc | 348 +++++++++------ .../final_state_generator/python_c_gen.py | 24 +- paddle/fluid/eager/backward.cc | 68 +-- .../platform/profiler/chrometracing_logger.cc | 4 +- .../unittests/test_profiler_statistic.py | 128 +++--- python/paddle/profiler/profiler_statistic.py | 396 ++++++++---------- 6 files changed, 536 insertions(+), 432 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 39559a2d901..30ace9c1d3a 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -476,7 +476,8 @@ static void SlotNameMatching( PADDLE_THROW(platform::errors::Fatal( "Detected mismatched slot names." "grad_slot_name %s matches both %s and %s fwd_slot_name", - grad_slot_name, grad_fwd_slotname_map[grad_slot_name], + grad_slot_name, + grad_fwd_slotname_map[grad_slot_name], fwd_slot_name)); } grad_fwd_slotname_map[grad_slot_name] = fwd_slot_name; @@ -489,7 +490,8 @@ static void SlotNameMatching( PADDLE_THROW(platform::errors::Fatal( "Detected mismatched slot names." "grad_slot_name %s matches both %s and %s fwd_slot_name", - grad_slot_name, grad_grad_slotname_map[grad_slot_name], + grad_slot_name, + grad_grad_slotname_map[grad_slot_name], fwd_slot_name)); } grad_grad_slotname_map[grad_slot_name] = fwd_slot_name; @@ -509,7 +511,8 @@ static void SlotNameMatching( PADDLE_THROW(platform::errors::Fatal( "Detected mismatched slot names" "grad_slot_name %s matches both %s and %s fwd_slot_name", - grad_slot_name, grad_fwd_slotname_map[grad_slot_name], + grad_slot_name, + grad_fwd_slotname_map[grad_slot_name], fwd_slot_name)); } grad_fwd_slotname_map[grad_slot_name] = fwd_slot_name; @@ -522,7 +525,8 @@ static void SlotNameMatching( PADDLE_THROW(platform::errors::Fatal( "Detected mismatched slot names." "grad_slot_name %s matches both %s and %s fwd_slot_name", - grad_slot_name, grad_grad_slotname_map[grad_slot_name], + grad_slot_name, + grad_grad_slotname_map[grad_slot_name], fwd_slot_name)); } grad_grad_slotname_map[grad_slot_name] = fwd_slot_name; @@ -900,8 +904,8 @@ static bool CollectGradInformationFromOpInfo( } std::shared_ptr grad_node = - op_info.dygraph_grad_op_maker_(op_type, ins, outs, attrs, default_attrs, - {}); + op_info.dygraph_grad_op_maker_( + op_type, ins, outs, attrs, default_attrs, {}); if (!grad_node) { VLOG(6) << "Got nullptr GradOpNode for " << op_type @@ -977,12 +981,16 @@ static bool CollectGradInformationFromOpInfo( /* ------ Slot Name Matching ---- */ for (auto& iter : *op_base_infos) { // grad_ins -> fwd_ins, fwd_outs - SlotNameMatching(iter.GetGradIns(), fwd_ins, fwd_outs, + SlotNameMatching(iter.GetGradIns(), + fwd_ins, + fwd_outs, iter.GetMutableGradInsFwdSlotnameMap(), iter.GetMutableGradInsGradSlotnameMap()); // grad_outs -> fwd_ins, fwd_outs - SlotNameMatching(iter.GetGradOuts(), fwd_ins, fwd_outs, + SlotNameMatching(iter.GetGradOuts(), + fwd_ins, + fwd_outs, iter.GetMutableGradOutsSlotnameMap(), iter.GetMutableGradOutsSlotnameMap()); } @@ -1042,16 +1050,18 @@ static std::string GenerateGradNodeCreationContent( "p_autograd_" + inplace_input_name; const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " %s = egr::EagerUtils::autograd_meta(&%s);\n"; - get_output_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name, - inplace_input_name); + get_output_autograd_meta_str += + paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE, + inplace_input_autograd_name, + inplace_input_name); } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; get_output_autograd_meta_str += paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE, - output_autograd_name, output_name); + output_autograd_name, + output_name); } } } @@ -1097,8 +1107,8 @@ static std::string GenerateGradNodeCreationContent( "require_any_grad);\n"; for (auto& inplace_pair : inplace_map) { std::string inplace_name = inplace_pair.second; - check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, - inplace_name, inplace_name); + check_inplace_str += paddle::string::Sprintf( + CHECKING_INPLACE_TEMPLATE, inplace_name, inplace_name); } VLOG(6) << "Check Inplace Input"; } @@ -1124,9 +1134,11 @@ static std::string GenerateGradNodeCreationContent( " auto grad_node = std::shared_ptr(new GradNode%s(%d, " "%d));\n"; grad_node_creation_str += " // Create GradOpNode\n"; - grad_node_creation_str += - paddle::string::Sprintf(GRAD_OP_NODE_TEMPLATE, op_type, op_type, - bwd_in_slot_num, bwd_out_slot_num); + grad_node_creation_str += paddle::string::Sprintf(GRAD_OP_NODE_TEMPLATE, + op_type, + op_type, + bwd_in_slot_num, + bwd_out_slot_num); grad_node_creation_str += "\n"; VLOG(6) << "Generated GradOpNode construction"; @@ -1158,13 +1170,17 @@ static std::string GenerateGradNodeCreationContent( // Replace output directly with input in inplace op. if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { auto inplace_input_name = inplace_map[tensor_wrapper_name]; - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - inplace_input_name, full_reserved); + grad_node_creation_str += + paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE, + tensor_wrapper_name, + inplace_input_name, + full_reserved); } else { - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_name, full_reserved); + grad_node_creation_str += + paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE, + tensor_wrapper_name, + tensor_wrapper_name, + full_reserved); } } } @@ -1189,9 +1205,10 @@ static std::string GenerateGradNodeCreationContent( const char* ADD_EDGES_TEMPLATE = " if(%s) grad_node->AddEdges(%s, %d);\n"; - grad_node_creation_str += - paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, - input_autograd_name, input_position); + grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE, + input_autograd_name, + input_autograd_name, + input_position); } else { compute_require_grad_args += ", &" + input_autograd_name; size_t input_position = fwd_inputs_name_pos_map.at(input_name); @@ -1319,7 +1336,7 @@ static std::string GenerateGradNodeCreationContent( "%s" " {\n" " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " - "paddle::platform::TracerEventType::Operator, 1);\n" + "paddle::platform::TracerEventType::OperatorInner, 1);\n" "%s" " if(require_any_grad) {\n" " VLOG(6) << \" Construct Grad for %s \"; \n" @@ -1327,11 +1344,17 @@ static std::string GenerateGradNodeCreationContent( " %s\n" " }\n" " }"; - std::string grad_node_creation_body_str = paddle::string::Sprintf( - GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, check_inplace_str, trace_op_body_str, - event_name, get_output_autograd_meta_str, op_type, - pass_stop_gradient_args, grad_node_creation_str); + std::string grad_node_creation_body_str = + paddle::string::Sprintf(GRAD_NODE_CREATION_TEMPLATE, + prepare_autograd_meta_str, + compute_require_grad_args, + check_inplace_str, + trace_op_body_str, + event_name, + get_output_autograd_meta_str, + op_type, + pass_stop_gradient_args, + grad_node_creation_str); return grad_node_creation_body_str; } @@ -1454,8 +1477,8 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_INS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; - ins_contents_str += paddle::string::Sprintf(FWD_INS_CONTENT_TEMPLATE, - input_name, input_name); + ins_contents_str += paddle::string::Sprintf( + FWD_INS_CONTENT_TEMPLATE, input_name, input_name); if (input.duplicable()) { const char* AMP_TENSORS_VECTOR_TEMPLATE = "%s,"; amp_tensors_vector_str += @@ -1518,9 +1541,14 @@ static std::pair GenerateForwardFunctionContents( const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE = " auto NEW_%s = ((%s.size() > 0) ? egr::AmpAutoCasts(\"%s\", " "%s, amp_dst_dtype, \"%s\") : %s);\n"; - dispensable_amp_auto_cast_str += paddle::string::Sprintf( - DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name, - input_name, input_name, op_type, input_name); + dispensable_amp_auto_cast_str += + paddle::string::Sprintf(DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, + input_name, + input_name, + input_name, + input_name, + op_type, + input_name); } else { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.initialized()) " @@ -1535,9 +1563,14 @@ static std::pair GenerateForwardFunctionContents( const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE = " auto NEW_%s = ((%s.initialized()) ? egr::AmpAutoCast(\"%s\", " "%s, amp_dst_dtype, \"%s\") : %s);\n"; - dispensable_amp_auto_cast_str += paddle::string::Sprintf( - DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name, - input_name, input_name, op_type, input_name); + dispensable_amp_auto_cast_str += + paddle::string::Sprintf(DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, + input_name, + input_name, + input_name, + input_name, + op_type, + input_name); } } } @@ -1594,9 +1627,11 @@ static std::pair GenerateForwardFunctionContents( } else if (!inplace_map.empty() && inplace_map.count(output_name)) { // In inplace op, replace the output with the input directly. PADDLE_ENFORCE_NE( - inplace_map[output_name], "", + inplace_map[output_name], + "", paddle::platform::errors::InvalidArgument( - "Inplace op %s has no input corresponding to output %s.", op_type, + "Inplace op %s has no input corresponding to output %s.", + op_type, output_name)); const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; auto inplace_input_name = inplace_map[output_name]; @@ -1618,8 +1653,8 @@ static std::pair GenerateForwardFunctionContents( amp_function_call_args_str += (", " + outnum); const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::CreateVars(%s) },"; - outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, - output_name, outnum); + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, outnum); core_ops_args_info[op_type].push_back(outnum); core_ops_args_type_info[op_type].push_back("int"); } else { @@ -1738,9 +1773,12 @@ static std::pair GenerateForwardFunctionContents( std::string view_strategy_str = ""; std::string viwe_input_name = view_op_map[op_type].first; std::string viwe_output_name = view_op_map[op_type].second; - view_strategy_str += paddle::string::Sprintf( - HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, - viwe_input_name, viwe_output_name); + view_strategy_str += + paddle::string::Sprintf(HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, + viwe_input_name, + viwe_output_name, + viwe_input_name, + viwe_output_name); generated_function_body += view_strategy_str; generated_function_body += "\n"; @@ -1794,26 +1832,33 @@ static std::pair GenerateForwardFunctionContents( " if (outs.count(\"%s\")) " "egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n" " egr::EagerUtils::Output2Result(%s, &%s);\n"; - out_tensor_str = paddle::string::Sprintf( - FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name, - output_name, output_var_args_name, output_var_args_name, - output_varname); + out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, + output_varname, + output_name, + output_name, + output_var_args_name, + output_var_args_name, + output_varname); } else { const char* FWD_OUT_TENSORS_TEMPLATE = " std::vector %s;\n" " egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n" " egr::EagerUtils::Output2Result(%s, &%s);\n"; - out_tensor_str = paddle::string::Sprintf( - FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name, - output_var_args_name, output_var_args_name, output_varname); + out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, + output_varname, + output_name, + output_var_args_name, + output_var_args_name, + output_varname); } } else { const char* FWD_OUT_TENSORS_TEMPLATE = " std::vector %s;\n" " egr::EagerUtils::GetOutputs(outs[\"%s\"], &%s);\n"; - out_tensor_str = - paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, output_varname, - output_name, output_varname); + out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, + output_varname, + output_name, + output_varname); } return_types[return_position] = "std::vector"; @@ -1824,16 +1869,21 @@ static std::pair GenerateForwardFunctionContents( " if (outs.count(\"%s\")) " "egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n" " paddle::experimental::Tensor& %s = *%s;\n"; - out_tensor_str = paddle::string::Sprintf( - FWD_OUT_TENSOR_TEMPLATE, output_name, output_name, - output_var_args_name, output_varname, output_var_args_name); + out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, + output_name, + output_name, + output_var_args_name, + output_varname, + output_var_args_name); } else { const char* FWD_OUT_TENSOR_TEMPLATE = " egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n" " paddle::experimental::Tensor& %s = *%s;\n"; - out_tensor_str = paddle::string::Sprintf( - FWD_OUT_TENSOR_TEMPLATE, output_name, output_var_args_name, - output_varname, output_var_args_name); + out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, + output_name, + output_var_args_name, + output_varname, + output_var_args_name); } } else { if (!inplace_map.empty() && inplace_map.count(output_name)) { @@ -1845,16 +1895,19 @@ static std::pair GenerateForwardFunctionContents( " %s.bump_inplace_version();\n" " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " "Strategy.\";\n"; - out_tensor_str = paddle::string::Sprintf( - FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, - inplace_input_name, inplace_input_name); + out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, + output_name, + inplace_input_name, + inplace_input_name, + inplace_input_name); } else { const char* FWD_OUT_TENSOR_TEMPLATE = " paddle::experimental::Tensor %s;\n" " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; - out_tensor_str = - paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, - output_name, output_varname); + out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, + output_varname, + output_name, + output_varname); } } return_types[return_position] = "paddle::experimental::Tensor"; @@ -1964,21 +2017,28 @@ static std::pair GenerateForwardFunctionContents( "%s\n" "%s\n" "}\n\n"; - std::string fwd_function_str = paddle::string::Sprintf( - FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, - dygraph_function_args_str, fwd_record_event_str, generated_function_body); + std::string fwd_function_str = + paddle::string::Sprintf(FWD_FUNCTION_TEMPLATE, + function_proto_return_type_str, + function_name, + dygraph_function_args_str, + fwd_record_event_str, + generated_function_body); // [Generation] Generate forward functions header const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n"; - std::string dygraph_function_declaration_str = paddle::string::Sprintf( - FWD_HEADER_TEMPLATE, function_proto_return_type_str, function_name, - dygraph_function_args_str); + std::string dygraph_function_declaration_str = + paddle::string::Sprintf(FWD_HEADER_TEMPLATE, + function_proto_return_type_str, + function_name, + dygraph_function_args_str); return {fwd_function_str, dygraph_function_declaration_str}; } static std::string GenerateSingleOpBase( - const std::string& fwd_op_type, const std::string& op_base_type, + const std::string& fwd_op_type, + const std::string& op_base_type, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, const std::vector& in_vars, @@ -1994,7 +2054,8 @@ static std::string GenerateSingleOpBase( std::vector>>& grad_outs, const paddle::framework::AttributeMap& grad_attrs, - bool is_op_base_per_duplicable_input, size_t* outs_size) { + bool is_op_base_per_duplicable_input, + size_t* outs_size) { std::string generated_grad_function_body = ""; const std::string& ins_name = "ins" + std::to_string(*outs_size); @@ -2029,9 +2090,9 @@ static std::string GenerateSingleOpBase( "RecoverTensorWrapper(" "&" "this->%s)) },"; - ins_contents_str += - paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, - grad_input_name, struct_fwd_input_name); + ins_contents_str += paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, + grad_input_name, + struct_fwd_input_name); } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { // Fwd Tensor's Grad @@ -2075,18 +2136,25 @@ static std::string GenerateSingleOpBase( " if(this->%s.size() > 0) %s[\"%s\"] = " "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::" "RecoverTensorWrapper(&this->%s));\n"; - generated_grad_function_body += paddle::string::Sprintf( - DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name, - ins_name, grad_input_name, struct_fwd_input_name); + generated_grad_function_body += + paddle::string::Sprintf(DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, + struct_fwd_input_name, + ins_name, + grad_input_name, + struct_fwd_input_name); } else { const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);\n" " if(%s.defined()) %s[\"%s\"] = " " egr::EagerUtils::TrySyncToVars(%s);\n"; - generated_grad_function_body += paddle::string::Sprintf( - DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, - struct_fwd_input_name, grad_input_name, ins_name, grad_input_name, - grad_input_name); + generated_grad_function_body += + paddle::string::Sprintf(DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, + grad_input_name, + struct_fwd_input_name, + grad_input_name, + ins_name, + grad_input_name, + grad_input_name); } } } @@ -2203,15 +2271,20 @@ static std::string GenerateSingleOpBase( " if(%s.size() > 0) %s[\"%s\"] = egr::EagerUtils::CreateVars( " "this->OutputMeta()[%d].size() );\n"; generated_grad_function_body += paddle::string::Sprintf( - DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name, - grad_output_name, fwd_input_position); + DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, + fwd_name, + outs_name, + grad_output_name, + fwd_input_position); } else { const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE = " if(%s.defined()) %s[\"%s\"] = " "{std::make_shared(egr::Controller::" "Instance().GenerateUniqueName())};\n"; generated_grad_function_body += paddle::string::Sprintf( - DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name, + DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, + fwd_name, + outs_name, grad_output_name); } } @@ -2236,8 +2309,8 @@ static std::string GenerateSingleOpBase( " auto temp_type = %s[\"in_dtype\"];\n" " %s[\"in_dtype\"] = %s[\"out_dtype\"];\n" " %s[\"out_dtype\"] = temp_type;\n"; - grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name, - attrs_name, attrs_name); + grad_attrs_str += paddle::string::Sprintf( + CAST_GRAD, attrs_name, attrs_name, attrs_name, attrs_name); } // Handle dynamic grad attributes @@ -2278,8 +2351,8 @@ static std::string GenerateSingleOpBase( " " "outputs[0].emplace_back(egr::EagerUtils::GetOutputs(%s[\"%s\"])[0]" ");\n"; - outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, outs_name, - grad_out_name); + outputs_str += paddle::string::Sprintf( + BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name); } num_appended_outputs++; } else { @@ -2411,11 +2484,20 @@ static std::string GenerateGradNodeCCContents( const auto& grad_attrs = op_base_info.GetGradAttrs(); const std::string& op_base_type = op_base_info.GetOpBaseType(); - generated_grad_function_body += GenerateSingleOpBase( - fwd_op_type, op_base_type, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, in_vars, grad_ins_fwd_slotname_map, - grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, - grad_attrs, is_op_base_per_duplicable_input, &outs_size); + generated_grad_function_body += + GenerateSingleOpBase(fwd_op_type, + op_base_type, + fwd_inputs_name_pos_map, + fwd_outputs_name_pos_map, + in_vars, + grad_ins_fwd_slotname_map, + grad_ins_grad_slotname_map, + grad_outs_slotname_map, + grad_ins, + grad_outs, + grad_attrs, + is_op_base_per_duplicable_input, + &outs_size); } if (is_op_base_per_duplicable_input) { @@ -2436,7 +2518,9 @@ static std::string GenerateGradNodeCCContents( "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; generated_grad_function_body = - paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), + paddle::string::Sprintf(BWD_RETURN_TEMPLATE, + fwd_op_type, + in_vars.size(), generated_grad_function_body); // [Generation] Get Full Grad Function @@ -2455,8 +2539,10 @@ static std::string GenerateGradNodeCCContents( "this->InputMeta());\n"; } std::string grad_function_str = - paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type, - fill_zero_str, generated_grad_function_body); + paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, + fwd_op_type, + fill_zero_str, + generated_grad_function_body); VLOG(6) << "Generated returns"; @@ -2579,9 +2665,12 @@ static std::string GenerateGradNodeHeaderContents( " %s.emplace_back( egr::TensorWrapper(eager_tensor, %s " "/*full_reserved*/, %s) );\n" " }\n"; - tensor_wrapper_body_str = paddle::string::Sprintf( - SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, - struct_tensor_wrapper_name, full_reserved_str, no_need_buffer_str); + tensor_wrapper_body_str = + paddle::string::Sprintf(SET_TENSOR_WRAPPER_BODY_TEMPLATE, + tensor_wrapper_name, + struct_tensor_wrapper_name, + full_reserved_str, + no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = "for (auto tw: %s) {\n" @@ -2603,9 +2692,12 @@ static std::string GenerateGradNodeHeaderContents( const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = "%s = egr::TensorWrapper(%s, %s /*full_reserved*/, %s);\n"; - tensor_wrapper_body_str = paddle::string::Sprintf( - SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, - tensor_wrapper_name, full_reserved_str, no_need_buffer_str); + tensor_wrapper_body_str = + paddle::string::Sprintf(SET_TENSOR_WRAPPER_BODY_TEMPLATE, + struct_tensor_wrapper_name, + tensor_wrapper_name, + full_reserved_str, + no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; clear_tensor_wrappers_str += paddle::string::Sprintf( @@ -2614,19 +2706,33 @@ static std::string GenerateGradNodeHeaderContents( std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = " void SetTensorWrapper%s(%s, %s) {\n %s\n }\n"; - set_tensor_wrappers_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_arg_str, full_reserved_signature_str, - tensor_wrapper_body_str); + set_tensor_wrappers_str += + paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE, + tensor_wrapper_name, + tensor_wrapper_arg_str, + full_reserved_signature_str, + tensor_wrapper_body_str); } } VLOG(6) << "Generated TensorWrapper"; - std::string grad_node_str = paddle::string::Sprintf( - GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, - op_type, clear_tensor_wrappers_str, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, - attr_members_str); + std::string grad_node_str = + paddle::string::Sprintf(GRAD_NODE_TEMPLATE, + op_type, + op_type, + op_type, + op_type, + op_type, + op_type, + op_type, + clear_tensor_wrappers_str, + op_type, + op_type, + op_type, + set_tensor_wrappers_str, + set_attr_map_str, + tensor_wrapper_members_str, + attr_members_str); return grad_node_str; } @@ -2760,9 +2866,11 @@ static std::string GenerateCoreOpsReturnsInfo() { std::string core_ops_returns_info_init_str = ConvertCoreOpsInfosToString(core_ops_returns_info); - std::string core_ops_info_str = paddle::string::Sprintf( - Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str, - core_ops_args_type_info_init_str, core_ops_returns_info_init_str); + std::string core_ops_info_str = + paddle::string::Sprintf(Core_Ops_Returns_MAP_TEMPLATE, + core_ops_args_info_init_str, + core_ops_args_type_info_init_str, + core_ops_returns_info_init_str); return core_ops_info_str; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 7ca5fc833ea..4a21d95eefe 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -71,7 +71,7 @@ PARSE_PYTHON_C_ARGS_TEMPLATE = \ RECORD_EVENT_TEMPLATE = \ -" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" +"paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::UserDefined, 1);" RETURN_INPLACE_PYOBJECT_TEMPLATE = \ @@ -253,6 +253,7 @@ NAMESPACE_WRAPPER_TEMPLATE = \ ## Generator Classes ## ####################### class PythonCSingleFunctionGenerator(FunctionGeneratorBase): + def __init__(self, forward_api_contents, namespace): # Members from Parent: #self.namespace @@ -265,7 +266,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase): #self.forward_outputs_position_map #self.optional_inputs #self.no_need_buffers - #self.intermediate_outputs + #self.intermediate_outputs #self.inplace_map FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) @@ -327,8 +328,8 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase): set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str) # Generate Dygraph Function Call Logic - num_args = len(forward_inputs_position_map.keys()) + len( - orig_forward_attrs_list) + num_args = len( + forward_inputs_position_map.keys()) + len(orig_forward_attrs_list) dygraph_function_call_list = ["" for i in range(num_args)] for name, (_, pos) in forward_inputs_position_map.items(): dygraph_function_call_list[pos] = f"{name}" @@ -336,7 +337,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase): dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) - # Generate Python-C Function Definitions + # Generate Python-C Function Definitions if is_forward_only: fwd_function_name = FUNCTION_NAME_TEMPLATE.format( "paddle::experimental::", namespace, forward_api_name) @@ -441,8 +442,9 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase): class PythonCYamlGenerator(YamlGeneratorBase): + def __init__(self, path): - # Parent members: + # Parent members: # self.namespace # self.api_yaml_path # self.forward_api_list @@ -457,8 +459,8 @@ class PythonCYamlGenerator(YamlGeneratorBase): forward_api_list = self.forward_api_list for forward_api_content in forward_api_list: - f_generator = PythonCSingleFunctionGenerator(forward_api_content, - namespace) + f_generator = PythonCSingleFunctionGenerator( + forward_api_content, namespace) status = f_generator.run() if status == True: diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index b1f31e20be4..aa8b6344d1e 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -30,10 +30,10 @@ namespace egr { /* -* GeneralGrad is Helpper class to implement custom grad operation between -* outputs and inputs. -* -* **/ + * GeneralGrad is Helpper class to implement custom grad operation between + * outputs and inputs. + * + * **/ class GeneralGrad { public: static GeneralGrad& Instance() { return *general_grad_; } @@ -64,7 +64,8 @@ class GeneralGrad { paddle::platform::errors::Fatal( "There is no grad op for %s:[%d] or it's" "stop_gradient=True.", - msg, i)); + msg, + i)); if (is_no_grad_vars) { (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta; } else { // normal input @@ -248,7 +249,8 @@ class GeneralGrad { std::vector GetResults( const std::vector& inputs, - bool allow_unused, bool create_graph) { + bool allow_unused, + bool create_graph) { VLOG(6) << "Running in GetResults"; if (inputs.empty()) return {}; @@ -276,7 +278,8 @@ class GeneralGrad { tensor_auto_grad_meta->SetStopGradient(!create_graph); results.emplace_back(iter->second); } else { - PADDLE_ENFORCE_EQ(allow_unused, true, + PADDLE_ENFORCE_EQ(allow_unused, + true, paddle::platform::errors::InvalidArgument( "The %d-th input does not appear in the backward " "graph. Please check the input tensor or set " @@ -493,7 +496,8 @@ std::unordered_map getInDegreeMap( void EnforceGradNodeHasInput(GradNodeBase* node) { VLOG(6) << "Running in EnforceGradNodeHasInput"; PADDLE_ENFORCE_NE( - node->IsTensorWrappersCleared(), true, + node->IsTensorWrappersCleared(), + true, paddle::platform::errors::Fatal( "The TensorWrappers of %s do not exist. This may be because:\n" "You calculate backward twice for the same subgraph without " @@ -509,10 +513,13 @@ void DuplicateCheck(const std::vector& inputs, for (auto in : inputs) { AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in); PADDLE_ENFORCE_EQ( - visisted_ins.count(auto_grad_meta), 0, + visisted_ins.count(auto_grad_meta), + 0, paddle::platform::errors::AlreadyExists( - "%s contain duplicate tensor %s, please check %s carefully.", msg, - in.name(), msg)); + "%s contain duplicate tensor %s, please check %s carefully.", + msg, + in.name(), + msg)); visisted_ins.insert(auto_grad_meta); } } @@ -522,7 +529,8 @@ GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad(); std::vector RunBackward( const std::vector& tensors, // output const std::vector& grad_tensors, - bool retain_graph, bool create_graph = false, + bool retain_graph, + bool create_graph = false, const std::vector& inputs = {}, bool allow_unused = false, const std::vector& no_grad_vars = {}) { @@ -631,8 +639,8 @@ std::vector RunBackward( if (is_general_grad) { // Prepare several vital preprocess for GeneralGrad - GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue, - node_input_buffers_dict); + GeneralGrad::Instance().PreparedForGeneralGrad( + inputs, no_grad_vars, &queue, node_input_buffers_dict); } VLOG(6) << " startup_ops' size is :" << queue.size(); @@ -651,7 +659,8 @@ std::vector RunBackward( paddle::platform::RecordEvent node_record_event( std::string((*node).name()) + " grad_node", - paddle::platform::TracerEventType::Operator, 1); + paddle::platform::TracerEventType::Operator, + 1); if (queue.size() > 1 && node_in_degree_map[node] != 0) { queue.pop(); @@ -716,7 +725,8 @@ std::vector RunBackward( "Number of edges should be either empty ( for leaf node " ") or the same as number of output grad tensors, but we " "got edges size is: %d, grad_output size is: %d", - edges.size(), grad_output_tensors.size())); + edges.size(), + grad_output_tensors.size())); for (size_t i = 0; i < edges.size(); i++) { for (size_t j = 0; j < edges[i].size(); j++) { @@ -739,7 +749,8 @@ std::vector RunBackward( } PADDLE_ENFORCE_LT( - j, grad_output_tensors[i].size(), + j, + grad_output_tensors[i].size(), paddle::platform::errors::Fatal( "Rank of grad_output_tensors should be less than " "grad_output_tensors[i].size(), which is: %d. This error may " @@ -771,9 +782,10 @@ std::vector RunBackward( VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; - node_input_buffers_dict[next_node]->add( - edge_rank.first, edge_rank.second, grad_output_tensor, - create_graph); + node_input_buffers_dict[next_node]->add(edge_rank.first, + edge_rank.second, + grad_output_tensor, + create_graph); // Update queue node_in_degree_map[next_node]--; @@ -810,7 +822,7 @@ void Backward( bool retain_graph) { VLOG(6) << "Run in Backward"; paddle::platform::RecordEvent backward_record_event( - "backward", paddle::platform::TracerEventType::Operator, 1); + "backward", paddle::platform::TracerEventType::UserDefined, 1); RunBackward(tensors, grad_tensors, retain_graph); phi::autotune::AutoTuneStatus::Instance().Update(); } @@ -819,14 +831,22 @@ std::vector Grad( const std::vector& tensors, // outputs const std::vector& inputs, const std::vector& grad_tensors, - bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, + bool retain_graph, + bool create_graph, + bool only_inputs, + bool allow_unused, const std::vector& no_grad_vars) { VLOG(6) << "Run in Grad"; DuplicateCheck(inputs, true /* is_input */); DuplicateCheck(tensors, false /* is_input */); - return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, - allow_unused, no_grad_vars); + return RunBackward(tensors, + grad_tensors, + retain_graph, + create_graph, + inputs, + allow_unused, + no_grad_vars); } } // namespace egr diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index e8fe5412721..ad522651723 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -588,7 +588,7 @@ void ChromeTracingLogger::StartLog() { std::string( R"JSON( { - "id": %d, "name": "%s", "totalGlobalMem": %u, + "id": %d, "name": "%s", "totalGlobalMem": %llu, "computeMajor": %d, "computeMinor": %d, "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d, "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d, @@ -618,7 +618,7 @@ void ChromeTracingLogger::StartLog() { std::string( R"JSON( { - "id": %d, "name": "%s", "totalGlobalMem": %u, + "id": %d, "name": "%s", "totalGlobalMem": %llu, "computeMajor": %d, "computeMinor": %d, "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d, "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d, diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index 7079d9678b2..88e42d2c5a5 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -19,6 +19,7 @@ import paddle.profiler as profiler class HostPythonNode: + def __init__(self, name, type, start_ns, end_ns, process_id, thread_id): self.name = name self.type = type @@ -32,6 +33,7 @@ class HostPythonNode: class DevicePythonNode: + def __init__(self, name, type, start_ns, end_ns, device_id, context_id, stream_id): self.name = name @@ -44,6 +46,7 @@ class DevicePythonNode: class TestProfilerStatistic(unittest.TestCase): + def test_statistic_case1(self): root_node = HostPythonNode('Root Node', profiler.TracerEventType.UserDefined, 0, @@ -54,14 +57,16 @@ class TestProfilerStatistic(unittest.TestCase): dataloader_node = HostPythonNode('Dataloader', profiler.TracerEventType.Dataloader, 5, 15, 1000, 1001) - mobilenet_node = HostPythonNode( - 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) - yolonet_node = HostPythonNode( - 'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001) + mobilenet_node = HostPythonNode('MobileNet', + profiler.TracerEventType.Forward, 20, + 50, 1000, 1001) + yolonet_node = HostPythonNode('Yolov3Net', + profiler.TracerEventType.Forward, 50, 110, + 1000, 1001) - userdefined_node = HostPythonNode('Communication Time', - profiler.TracerEventType.UserDefined, - 100, 110, 1000, 1001) + userdefined_node = HostPythonNode( + 'Communication Time', profiler.TracerEventType.PythonUserDefined, + 100, 110, 1000, 1001) communication_node = HostPythonNode( 'Communication', profiler.TracerEventType.Communication, 105, 110, @@ -72,8 +77,9 @@ class TestProfilerStatistic(unittest.TestCase): optimization_node = HostPythonNode( 'Optimization', profiler.TracerEventType.Optimization, 220, 300, 1000, 1001) - conv2d_node = HostPythonNode( - 'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) + conv2d_node = HostPythonNode('conv2d', + profiler.TracerEventType.Operator, 25, 40, + 1000, 1001) sync_batch_norm_node = HostPythonNode('sync_batch_norm', profiler.TracerEventType.Operator, 60, 100, 1000, 1001) @@ -92,10 +98,12 @@ class TestProfilerStatistic(unittest.TestCase): conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', profiler.TracerEventType.CudaRuntime, 35, 40, 1000, 1001) - conv2d_kernel = DevicePythonNode( - 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) - conv2d_memcpy = DevicePythonNode( - 'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) + conv2d_kernel = DevicePythonNode('conv2d_kernel', + profiler.TracerEventType.Kernel, 35, + 50, 0, 0, 0) + conv2d_memcpy = DevicePythonNode('conv2d_memcpy', + profiler.TracerEventType.Memcpy, 50, + 60, 0, 0, 0) sync_batch_norm_infer_shape = HostPythonNode( 'sync_batch_norm::infer_shape', profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) @@ -146,8 +154,8 @@ class TestProfilerStatistic(unittest.TestCase): 'Process Cpu Utilization': '1.02', 'System Cpu Utilization': '0.68' } - statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, - extra_info) + statistic_data = profiler.profiler_statistic.StatisticData( + thread_tree, extra_info) time_range_summary = statistic_data.time_range_summary event_summary = statistic_data.event_summary @@ -180,7 +188,7 @@ class TestProfilerStatistic(unittest.TestCase): 0, profiler.TracerEventType.Memcpy), 60) self.assertEqual( time_range_summary.get_cpu_range_sum( - profiler.TracerEventType.UserDefined), 25) + profiler.TracerEventType.UserDefined), 15) self.assertEqual( time_range_summary.get_cpu_range_sum( profiler.TracerEventType.Communication), 5) @@ -200,8 +208,9 @@ class TestProfilerStatistic(unittest.TestCase): 0) self.assertEqual( event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) - self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] - .general_gpu_time, 60) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy']. + general_gpu_time, 60) print( profiler.profiler_statistic._build_table( statistic_data, @@ -222,14 +231,16 @@ class TestProfilerStatistic(unittest.TestCase): profiler.TracerEventType.Dataloader, 5, 15, 1000, 1001) - mobilenet_node = HostPythonNode( - 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) - yolonet_node = HostPythonNode( - 'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001) + mobilenet_node = HostPythonNode('MobileNet', + profiler.TracerEventType.Forward, 20, + 50, 1000, 1001) + yolonet_node = HostPythonNode('Yolov3Net', + profiler.TracerEventType.Forward, 50, 110, + 1000, 1001) - userdefined_node = HostPythonNode('Communication Time', - profiler.TracerEventType.UserDefined, - 100, 110, 1000, 1001) + userdefined_node = HostPythonNode( + 'Communication Time', profiler.TracerEventType.PythonUserDefined, + 100, 110, 1000, 1001) allreduce_launchkernel0 = HostPythonNode( 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104, 1000, 1001) @@ -263,8 +274,9 @@ class TestProfilerStatistic(unittest.TestCase): optimization_node = HostPythonNode( 'Optimization', profiler.TracerEventType.Optimization, 220, 300, 1000, 1001) - conv2d_node = HostPythonNode( - 'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) + conv2d_node = HostPythonNode('conv2d', + profiler.TracerEventType.Operator, 25, 40, + 1000, 1001) sync_batch_norm_node = HostPythonNode('sync_batch_norm', profiler.TracerEventType.Operator, 60, 100, 1000, 1001) @@ -283,10 +295,12 @@ class TestProfilerStatistic(unittest.TestCase): conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', profiler.TracerEventType.CudaRuntime, 35, 40, 1000, 1001) - conv2d_kernel = DevicePythonNode( - 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) - conv2d_memcpy = DevicePythonNode( - 'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) + conv2d_kernel = DevicePythonNode('conv2d_kernel', + profiler.TracerEventType.Kernel, 35, + 50, 0, 0, 0) + conv2d_memcpy = DevicePythonNode('conv2d_memcpy', + profiler.TracerEventType.Memcpy, 50, + 60, 0, 0, 0) sync_batch_norm_infer_shape = HostPythonNode( 'sync_batch_norm::infer_shape', profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) @@ -363,8 +377,8 @@ class TestProfilerStatistic(unittest.TestCase): 'Process Cpu Utilization': '1.02', 'System Cpu Utilization': '0.68' } - statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, - extra_info) + statistic_data = profiler.profiler_statistic.StatisticData( + thread_tree, extra_info) time_range_summary = statistic_data.time_range_summary event_summary = statistic_data.event_summary distributed_summary = statistic_data.distributed_summary @@ -398,7 +412,7 @@ class TestProfilerStatistic(unittest.TestCase): 0, profiler.TracerEventType.Memcpy), 60) self.assertEqual( time_range_summary.get_cpu_range_sum( - profiler.TracerEventType.UserDefined), 25) + profiler.TracerEventType.UserDefined), 15) self.assertEqual( time_range_summary.get_cpu_range_sum( profiler.TracerEventType.Communication), 5) @@ -433,8 +447,9 @@ class TestProfilerStatistic(unittest.TestCase): 0) self.assertEqual( event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) - self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] - .general_gpu_time, 60) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy']. + general_gpu_time, 60) print( profiler.profiler_statistic._build_table( statistic_data, @@ -454,8 +469,9 @@ class TestProfilerStatistic(unittest.TestCase): dataloader_node = HostPythonNode('Dataloader', profiler.TracerEventType.Dataloader, 5, 15, 1000, 1001) - mobilenet_node = HostPythonNode( - 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) + mobilenet_node = HostPythonNode('MobileNet', + profiler.TracerEventType.Forward, 20, + 50, 1000, 1001) backward_node = HostPythonNode('Gradient Backward', profiler.TracerEventType.Backward, 120, @@ -463,12 +479,13 @@ class TestProfilerStatistic(unittest.TestCase): optimization_node = HostPythonNode( 'Optimization', profiler.TracerEventType.Optimization, 220, 300, 1000, 1001) - userdefined_node = HostPythonNode('Communication Time', - profiler.TracerEventType.UserDefined, - 60, 70, 1000, 1001) + userdefined_node = HostPythonNode( + 'Communication Time', profiler.TracerEventType.PythonUserDefined, + 60, 70, 1000, 1001) - conv2d_node = HostPythonNode( - 'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001) + conv2d_node = HostPythonNode('conv2d', + profiler.TracerEventType.Operator, 25, 25, + 1000, 1001) conv2d_infer_shape = HostPythonNode( 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, @@ -480,8 +497,9 @@ class TestProfilerStatistic(unittest.TestCase): 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25, 1000, 1001) - conv2d_kernel = DevicePythonNode( - 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0) + conv2d_kernel = DevicePythonNode('conv2d_kernel', + profiler.TracerEventType.Kernel, 35, + 35, 0, 0, 0) another_kernel = DevicePythonNode( 'void phi::funcs::VectorizedBroadcastKernel, phi::funcs::AddFunctor>()', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0) @@ -500,15 +518,16 @@ class TestProfilerStatistic(unittest.TestCase): 'Process Cpu Utilization': '1.02', 'System Cpu Utilization': '0.68' } - statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, - extra_info) + statistic_data = profiler.profiler_statistic.StatisticData( + thread_tree, extra_info) time_range_summary = statistic_data.time_range_summary event_summary = statistic_data.event_summary self.assertEqual(event_summary.items['conv2d'].cpu_time, 0) self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0) - self.assertEqual(event_summary.userdefined_items['Communication Time'] - .general_gpu_time, 0) + self.assertEqual( + event_summary.userdefined_items['Communication Time']. + general_gpu_time, 0) for sort_key in [ profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax, profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg, @@ -516,12 +535,11 @@ class TestProfilerStatistic(unittest.TestCase): profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg ]: print( - profiler.profiler_statistic._build_table( - statistic_data, - sorted_by=sort_key, - op_detail=True, - thread_sep=False, - time_unit='ms')) + profiler.profiler_statistic._build_table(statistic_data, + sorted_by=sort_key, + op_detail=True, + thread_sep=False, + time_unit='ms')) if __name__ == '__main__': diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 50aa3a1f11f..6f5894b590c 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -197,8 +197,8 @@ class TimeRangeSummary: def __init__(self): self.CPUTimeRange = collections.defaultdict(list) self.GPUTimeRange = collections.defaultdict( - lambda: collections.defaultdict(list) - ) # GPU events should be divided into different devices + lambda: collections.defaultdict( + list)) # GPU events should be divided into different devices self.CPUTimeRangeSum = collections.defaultdict(int) self.GPUTimeRangeSum = collections.defaultdict( lambda: collections.defaultdict(int)) @@ -212,8 +212,8 @@ class TimeRangeSummary: for threadid, hostnodes in thread2hostnodes.items(): CPUTimeRange = collections.defaultdict(list) GPUTimeRange = collections.defaultdict( - lambda: collections.defaultdict(lambda: collections.defaultdict(list)) - ) # device_id/type/stream_id + lambda: collections.defaultdict(lambda: collections.defaultdict( + list))) # device_id/type/stream_id for hostnode in hostnodes[1:]: #skip root node CPUTimeRange[hostnode.type].append( (hostnode.start_ns, hostnode.end_ns)) @@ -235,8 +235,8 @@ class TimeRangeSummary: for device_id, device_time_ranges in GPUTimeRange.items(): for event_type, event_time_ranges in device_time_ranges.items(): for stream_id, time_ranges in event_time_ranges.items(): - time_ranges = merge_self_ranges( - time_ranges, is_sorted=False) + time_ranges = merge_self_ranges(time_ranges, + is_sorted=False) self.GPUTimeRange[device_id][event_type] = merge_ranges( self.GPUTimeRange[device_id][event_type], time_ranges, @@ -310,25 +310,27 @@ class DistributedSummary: for devicenode in runtimenode.device_node: if devicenode.type == TracerEventType.Kernel: if 'nccl' in devicenode.name.lower(): - self.gpu_communication_range.append(( - devicenode.start_ns, devicenode.end_ns)) + self.gpu_communication_range.append( + (devicenode.start_ns, + devicenode.end_ns)) else: - self.computation_range.append(( - devicenode.start_ns, devicenode.end_ns)) + self.computation_range.append( + (devicenode.start_ns, + devicenode.end_ns)) self.cpu_calls = len(set(self.cpu_communication_range)) self.gpu_calls = len(set(self.gpu_communication_range)) self.cpu_communication_range = merge_self_ranges( self.cpu_communication_range, is_sorted=False) self.gpu_communication_range = merge_self_ranges( self.gpu_communication_range, is_sorted=False) - self.communication_range = merge_ranges( - self.cpu_communication_range, - self.gpu_communication_range, - is_sorted=True) - self.computation_range = merge_self_ranges( - self.computation_range, is_sorted=False) - self.overlap_range = intersection_ranges( - self.communication_range, self.computation_range, is_sorted=True) + self.communication_range = merge_ranges(self.cpu_communication_range, + self.gpu_communication_range, + is_sorted=True) + self.computation_range = merge_self_ranges(self.computation_range, + is_sorted=False) + self.overlap_range = intersection_ranges(self.communication_range, + self.computation_range, + is_sorted=True) class EventSummary: @@ -337,6 +339,7 @@ class EventSummary: """ class DeviceItem: + def __init__(self, name): self.name = name self.call = 0 @@ -360,6 +363,7 @@ class EventSummary: self.add_gpu_time(node.end_ns - node.start_ns) class OperatorItem: + def __init__(self, name): self.name = name self.call = 0 @@ -430,6 +434,7 @@ class EventSummary: self.devices[name].add_item(devicenode) class GeneralItem: + def __init__(self, name): self.name = name self.call = 0 @@ -513,7 +518,8 @@ class EventSummary: or 'memset' in host_statistic_node.name.lower(): self.add_memory_manipulation_item(host_statistic_node) else: - self.add_userdefined_item(host_statistic_node) + if host_statistic_node.type == TracerEventType.PythonUserDefined: + self.add_userdefined_item(host_statistic_node) self.add_kernel_item(host_statistic_nodes[0]) for threadid, root_statistic_node in node_statistic_trees.items(): @@ -688,13 +694,14 @@ def _build_table(statistic_data, append(row_format.format(*headers)) append(header_sep) row_values = [ - 'CPU(Process)', format_ratio( - float(statistic_data.extra_info['Process Cpu Utilization'])) + 'CPU(Process)', + format_ratio(float( + statistic_data.extra_info['Process Cpu Utilization'])) ] append(row_format.format(*row_values)) row_values = [ - 'CPU(System)', format_ratio( - float(statistic_data.extra_info['System Cpu Utilization'])) + 'CPU(System)', + format_ratio(float(statistic_data.extra_info['System Cpu Utilization'])) ] append(row_format.format(*row_values)) for gpu_name in statistic_data.time_range_summary.get_gpu_devices(): @@ -783,20 +790,22 @@ def _build_table(statistic_data, TracerEventType. Communication] = statistic_data.distributed_summary.gpu_calls - sorted_items = sorted( - cpu_type_time.items(), key=lambda x: x[1], reverse=True) + sorted_items = sorted(cpu_type_time.items(), + key=lambda x: x[1], + reverse=True) event_type, time = sorted_items[0] row_values = [ '{}'.format(str(event_type).split('.')[1]), cpu_call_times[event_type], - format_time( - time, unit=time_unit), format_ratio(float(time) / total_time) + format_time(time, unit=time_unit), + format_ratio(float(time) / total_time) ] append(row_format.format(*row_values)) for event_type, time in sorted_items[1:]: row_values = [ ' {}'.format(str(event_type).split('.')[1]), - cpu_call_times[event_type], format_time( - time, unit=time_unit), format_ratio(float(time) / total_time) + cpu_call_times[event_type], + format_time(time, unit=time_unit), + format_ratio(float(time) / total_time) ] append(row_format.format(*row_values)) append(header_sep) @@ -806,8 +815,9 @@ def _build_table(statistic_data, for event_type, time in gpu_type_time.items(): row_values = [ ' {}'.format(str(event_type).split('.')[1]), - gpu_call_times[event_type], format_time( - time, unit=time_unit), format_ratio(float(time) / total_time) + gpu_call_times[event_type], + format_time(time, unit=time_unit), + format_ratio(float(time) / total_time) ] append(row_format.format(*row_values)) @@ -851,24 +861,16 @@ def _build_table(statistic_data, row_values = [ '{}'.format(name), item.call, '{} / {} / {} / {} / {}'.format( - format_time( - item.cpu_time, unit=time_unit), - format_time( - item.avg_cpu_time, unit=time_unit), - format_time( - item.max_cpu_time, unit=time_unit), - format_time( - item.min_cpu_time, unit=time_unit), + format_time(item.cpu_time, unit=time_unit), + format_time(item.avg_cpu_time, unit=time_unit), + format_time(item.max_cpu_time, unit=time_unit), + format_time(item.min_cpu_time, unit=time_unit), format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( - format_time( - item.gpu_time, unit=time_unit), - format_time( - item.avg_gpu_time, unit=time_unit), - format_time( - item.max_gpu_time, unit=time_unit), - format_time( - item.min_gpu_time, unit=time_unit), + format_time(item.gpu_time, unit=time_unit), + format_time(item.avg_gpu_time, unit=time_unit), + format_time(item.max_gpu_time, unit=time_unit), + format_time(item.min_gpu_time, unit=time_unit), format_ratio(gpu_ratio)) ] all_row_values.append(row_values) @@ -884,12 +886,10 @@ def _build_table(statistic_data, gpu_ratio = float(other_gpu_time) / gpu_total_time row_values = [ ' Others', '-', '{} / - / - / - / {}'.format( - format_time( - other_time, unit=time_unit), + format_time(other_time, unit=time_unit), format_ratio(float(other_time) / total_time)), '{} / - / - / - / {}'.format( - format_time( - other_gpu_time, unit=time_unit), + format_time(other_gpu_time, unit=time_unit), format_ratio(gpu_ratio)) ] all_row_values.append(row_values) @@ -971,28 +971,28 @@ def _build_table(statistic_data, overlap_time = sum_ranges( statistic_data.distributed_summary.overlap_range) row_values = [ - 'ProfileStep', format_time( - total_time, unit=time_unit), + 'ProfileStep', + format_time(total_time, unit=time_unit), format_ratio(float(total_time) / total_time) ] append(row_format.format(*row_values)) row_values = [ - ' Communication', format_time( - communication_time, unit=time_unit), + ' Communication', + format_time(communication_time, unit=time_unit), format_ratio(float(communication_time) / total_time) ] append(row_format.format(*row_values)) row_values = [ - ' Computation', format_time( - computation_time, unit=time_unit), + ' Computation', + format_time(computation_time, unit=time_unit), format_ratio(float(computation_time) / total_time) ] append(row_format.format(*row_values)) row_values = [ - ' Overlap', format_time( - overlap_time, unit=time_unit), + ' Overlap', + format_time(overlap_time, unit=time_unit), format_ratio(float(overlap_time) / total_time) ] append(row_format.format(*row_values)) @@ -1026,39 +1026,35 @@ def _build_table(statistic_data, for thread_id, items in thread_items.items(): all_row_values.append("Thread: {}".format(thread_id)) if sorted_by == SortedKeys.CPUTotal: - sorted_items = sorted( - items.items(), key=lambda x: x[1].cpu_time, reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].cpu_time, + reverse=True) elif sorted_by == SortedKeys.CPUAvg: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].avg_cpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].avg_cpu_time, + reverse=True) elif sorted_by == SortedKeys.CPUMax: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].max_cpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].max_cpu_time, + reverse=True) elif sorted_by == SortedKeys.CPUMin: - sorted_items = sorted( - items.items(), key=lambda x: x[1].min_cpu_time) + sorted_items = sorted(items.items(), + key=lambda x: x[1].min_cpu_time) elif sorted_by == SortedKeys.GPUTotal: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].general_gpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUAvg: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].avg_general_gpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].avg_general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUMax: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].max_general_gpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].max_general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUMin: - sorted_items = sorted( - items.items(), key=lambda x: x[1].min_general_gpu_time) + sorted_items = sorted(items.items(), + key=lambda x: x[1].min_general_gpu_time) total_op_cpu_time = 0 total_op_gpu_time = 0 @@ -1077,24 +1073,16 @@ def _build_table(statistic_data, gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time row_values = [ name, item.call, '{} / {} / {} / {} / {}'.format( - format_time( - item.cpu_time, unit=time_unit), - format_time( - item.avg_cpu_time, unit=time_unit), - format_time( - item.max_cpu_time, unit=time_unit), - format_time( - item.min_cpu_time, unit=time_unit), + format_time(item.cpu_time, unit=time_unit), + format_time(item.avg_cpu_time, unit=time_unit), + format_time(item.max_cpu_time, unit=time_unit), + format_time(item.min_cpu_time, unit=time_unit), format_ratio(cpu_ratio)), '{} / {} / {} / {} / {}'.format( - format_time( - item.general_gpu_time, unit=time_unit), - format_time( - item.avg_general_gpu_time, unit=time_unit), - format_time( - item.max_general_gpu_time, unit=time_unit), - format_time( - item.min_general_gpu_time, unit=time_unit), + format_time(item.general_gpu_time, unit=time_unit), + format_time(item.avg_general_gpu_time, unit=time_unit), + format_time(item.max_general_gpu_time, unit=time_unit), + format_time(item.min_general_gpu_time, unit=time_unit), format_ratio(gpu_ratio)) ] all_row_values.append(row_values) @@ -1117,28 +1105,24 @@ def _build_table(statistic_data, row_values = [ ' {}'.format(innerop_name), innerop_node.call, '{} / {} / {} / {} / {}'.format( - format_time( - innerop_node.cpu_time, unit=time_unit), - format_time( - innerop_node.avg_cpu_time, unit=time_unit), - format_time( - innerop_node.max_cpu_time, unit=time_unit), - format_time( - innerop_node.min_cpu_time, unit=time_unit), + format_time(innerop_node.cpu_time, + unit=time_unit), + format_time(innerop_node.avg_cpu_time, + unit=time_unit), + format_time(innerop_node.max_cpu_time, + unit=time_unit), + format_time(innerop_node.min_cpu_time, + unit=time_unit), format_ratio(cpu_ratio)), '{} / {} / {} / {} / {}'.format( - format_time( - innerop_node.general_gpu_time, - unit=time_unit), - format_time( - innerop_node.avg_general_gpu_time, - unit=time_unit), - format_time( - innerop_node.max_general_gpu_time, - unit=time_unit), - format_time( - innerop_node.min_general_gpu_time, - unit=time_unit), + format_time(innerop_node.general_gpu_time, + unit=time_unit), + format_time(innerop_node.avg_general_gpu_time, + unit=time_unit), + format_time(innerop_node.max_general_gpu_time, + unit=time_unit), + format_time(innerop_node.min_general_gpu_time, + unit=time_unit), format_ratio(gpu_ratio)) ] all_row_values.append(row_values) @@ -1148,8 +1132,8 @@ def _build_table(statistic_data, gpu_ratio = 0 else: gpu_ratio = float( - device_node. - gpu_time) / innerop_node.general_gpu_time + device_node.gpu_time + ) / innerop_node.general_gpu_time if len(device_node_name) + 4 > name_column_width: device_node_name = device_node_name[: name_column_width @@ -1159,17 +1143,14 @@ def _build_table(statistic_data, ' {}'.format(device_node_name), device_node.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( - format_time( - device_node.gpu_time, unit=time_unit), - format_time( - device_node.avg_gpu_time, - unit=time_unit), - format_time( - device_node.max_gpu_time, - unit=time_unit), - format_time( - device_node.min_gpu_time, - unit=time_unit), + format_time(device_node.gpu_time, + unit=time_unit), + format_time(device_node.avg_gpu_time, + unit=time_unit), + format_time(device_node.max_gpu_time, + unit=time_unit), + format_time(device_node.min_gpu_time, + unit=time_unit), format_ratio(gpu_ratio)) ] all_row_values.append(row_values) @@ -1188,14 +1169,14 @@ def _build_table(statistic_data, ' {}'.format(device_node_name), device_node.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( - format_time( - device_node.gpu_time, unit=time_unit), - format_time( - device_node.avg_gpu_time, unit=time_unit), - format_time( - device_node.max_gpu_time, unit=time_unit), - format_time( - device_node.min_gpu_time, unit=time_unit), + format_time(device_node.gpu_time, + unit=time_unit), + format_time(device_node.avg_gpu_time, + unit=time_unit), + format_time(device_node.max_gpu_time, + unit=time_unit), + format_time(device_node.min_gpu_time, + unit=time_unit), format_ratio(gpu_ratio)) ] all_row_values.append(row_values) @@ -1249,21 +1230,20 @@ def _build_table(statistic_data, all_row_values = [] kernel_items = statistic_data.event_summary.kernel_items if sorted_by == SortedKeys.GPUAvg: - sorted_items = sorted( - kernel_items.items(), - key=lambda x: x[1].avg_gpu_time, - reverse=True) + sorted_items = sorted(kernel_items.items(), + key=lambda x: x[1].avg_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUMax: - sorted_items = sorted( - kernel_items.items(), - key=lambda x: x[1].max_gpu_time, - reverse=True) + sorted_items = sorted(kernel_items.items(), + key=lambda x: x[1].max_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUMin: - sorted_items = sorted( - kernel_items.items(), key=lambda x: x[1].min_gpu_time) + sorted_items = sorted(kernel_items.items(), + key=lambda x: x[1].min_gpu_time) else: - sorted_items = sorted( - kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True) + sorted_items = sorted(kernel_items.items(), + key=lambda x: x[1].gpu_time, + reverse=True) total_kernel_gpu_time = 0 for name, item in sorted_items: @@ -1277,14 +1257,10 @@ def _build_table(statistic_data, name, item.call, '{} / {} / {} / {} / {}'.format( - format_time( - item.gpu_time, unit=time_unit), - format_time( - item.avg_gpu_time, unit=time_unit), - format_time( - item.max_gpu_time, unit=time_unit), - format_time( - item.min_gpu_time, unit=time_unit), + format_time(item.gpu_time, unit=time_unit), + format_time(item.avg_gpu_time, unit=time_unit), + format_time(item.max_gpu_time, unit=time_unit), + format_time(item.min_gpu_time, unit=time_unit), format_ratio(gpu_ratio)), ] all_row_values.append(row_values) @@ -1349,24 +1325,16 @@ def _build_table(statistic_data, name, item.call, '{} / {} / {} / {} / {}'.format( - format_time( - item.cpu_time, unit=time_unit), - format_time( - item.avg_cpu_time, unit=time_unit), - format_time( - item.max_cpu_time, unit=time_unit), - format_time( - item.min_cpu_time, unit=time_unit), + format_time(item.cpu_time, unit=time_unit), + format_time(item.avg_cpu_time, unit=time_unit), + format_time(item.max_cpu_time, unit=time_unit), + format_time(item.min_cpu_time, unit=time_unit), format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( - format_time( - item.general_gpu_time, unit=time_unit), - format_time( - item.avg_general_gpu_time, unit=time_unit), - format_time( - item.max_general_gpu_time, unit=time_unit), - format_time( - item.min_general_gpu_time, unit=time_unit), + format_time(item.general_gpu_time, unit=time_unit), + format_time(item.avg_general_gpu_time, unit=time_unit), + format_time(item.max_general_gpu_time, unit=time_unit), + format_time(item.min_general_gpu_time, unit=time_unit), format_ratio(gpu_ratio)), ] all_row_values.append(row_values) @@ -1429,39 +1397,35 @@ def _build_table(statistic_data, for thread_id, items in userdefined_thread_items.items(): all_row_values.append("Thread: {}".format(thread_id)) if sorted_by == SortedKeys.CPUTotal: - sorted_items = sorted( - items.items(), key=lambda x: x[1].cpu_time, reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].cpu_time, + reverse=True) elif sorted_by == SortedKeys.CPUAvg: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].avg_cpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].avg_cpu_time, + reverse=True) elif sorted_by == SortedKeys.CPUMax: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].max_cpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].max_cpu_time, + reverse=True) elif sorted_by == SortedKeys.CPUMin: - sorted_items = sorted( - items.items(), key=lambda x: x[1].min_cpu_time) + sorted_items = sorted(items.items(), + key=lambda x: x[1].min_cpu_time) elif sorted_by == SortedKeys.GPUTotal: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].general_gpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUAvg: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].avg_general_gpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].avg_general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUMax: - sorted_items = sorted( - items.items(), - key=lambda x: x[1].max_general_gpu_time, - reverse=True) + sorted_items = sorted(items.items(), + key=lambda x: x[1].max_general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUMin: - sorted_items = sorted( - items.items(), key=lambda x: x[1].min_general_gpu_time) + sorted_items = sorted(items.items(), + key=lambda x: x[1].min_general_gpu_time) for name, item in sorted_items: if gpu_total_time == 0: @@ -1472,24 +1436,16 @@ def _build_table(statistic_data, name, item.call, '{} / {} / {} / {} / {}'.format( - format_time( - item.cpu_time, unit=time_unit), - format_time( - item.avg_cpu_time, unit=time_unit), - format_time( - item.max_cpu_time, unit=time_unit), - format_time( - item.min_cpu_time, unit=time_unit), + format_time(item.cpu_time, unit=time_unit), + format_time(item.avg_cpu_time, unit=time_unit), + format_time(item.max_cpu_time, unit=time_unit), + format_time(item.min_cpu_time, unit=time_unit), format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( - format_time( - item.general_gpu_time, unit=time_unit), - format_time( - item.avg_general_gpu_time, unit=time_unit), - format_time( - item.max_general_gpu_time, unit=time_unit), - format_time( - item.min_general_gpu_time, unit=time_unit), + format_time(item.general_gpu_time, unit=time_unit), + format_time(item.avg_general_gpu_time, unit=time_unit), + format_time(item.max_general_gpu_time, unit=time_unit), + format_time(item.min_general_gpu_time, unit=time_unit), format_ratio(gpu_ratio)), ] all_row_values.append(row_values) -- GitLab