Unverified commit 6de20581, authored by chenjian, committed by GitHub

Fix operator type record in profiler [cherry-pick PR44582] (#44654)

* fix record event for operator type in new dygraph (#44582)

* fix new dygraph record event for op

* update unit test

* fix file mode
Parent: b71833ea
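
Note: the bulk of this diff is mechanical formatter churn (one Sprintf argument per line); the behavioral change is a handful of `TracerEventType` corrections so each profiler span is classified by what it actually is. All of them go through the same scoped `paddle::platform::RecordEvent` guard. A minimal sketch of the pattern, assuming only the constructor usage visible in the hunks below (the include path and event names are illustrative assumptions):

```cpp
#include "paddle/fluid/platform/profiler/event_tracing.h"  // assumed path

void SketchOperatorSpans() {
  // A whole op run records an Operator-typed span (unchanged by this PR).
  paddle::platform::RecordEvent op_event(
      "conv2d dygraph", paddle::platform::TracerEventType::Operator, 1);
  {
    // Work nested inside the op, such as grad-node creation, now records
    // OperatorInner so it is not double-counted as a separate operator.
    paddle::platform::RecordEvent node_creation_record_event(
        "conv2d node_creation",
        paddle::platform::TracerEventType::OperatorInner, 1);
  }  // each span closes when its guard goes out of scope
}
```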
@@ -476,7 +476,8 @@ static void SlotNameMatching(
         PADDLE_THROW(platform::errors::Fatal(
             "Detected mismatched slot names."
             "grad_slot_name %s matches both %s and %s fwd_slot_name",
-            grad_slot_name, grad_fwd_slotname_map[grad_slot_name],
+            grad_slot_name,
+            grad_fwd_slotname_map[grad_slot_name],
             fwd_slot_name));
       }
       grad_fwd_slotname_map[grad_slot_name] = fwd_slot_name;
@@ -489,7 +490,8 @@ static void SlotNameMatching(
         PADDLE_THROW(platform::errors::Fatal(
             "Detected mismatched slot names."
             "grad_slot_name %s matches both %s and %s fwd_slot_name",
-            grad_slot_name, grad_grad_slotname_map[grad_slot_name],
+            grad_slot_name,
+            grad_grad_slotname_map[grad_slot_name],
             fwd_slot_name));
       }
       grad_grad_slotname_map[grad_slot_name] = fwd_slot_name;
@@ -509,7 +511,8 @@ static void SlotNameMatching(
         PADDLE_THROW(platform::errors::Fatal(
             "Detected mismatched slot names"
             "grad_slot_name %s matches both %s and %s fwd_slot_name",
-            grad_slot_name, grad_fwd_slotname_map[grad_slot_name],
+            grad_slot_name,
+            grad_fwd_slotname_map[grad_slot_name],
             fwd_slot_name));
       }
       grad_fwd_slotname_map[grad_slot_name] = fwd_slot_name;
@@ -522,7 +525,8 @@ static void SlotNameMatching(
         PADDLE_THROW(platform::errors::Fatal(
             "Detected mismatched slot names."
             "grad_slot_name %s matches both %s and %s fwd_slot_name",
-            grad_slot_name, grad_grad_slotname_map[grad_slot_name],
+            grad_slot_name,
+            grad_grad_slotname_map[grad_slot_name],
             fwd_slot_name));
       }
       grad_grad_slotname_map[grad_slot_name] = fwd_slot_name;
@@ -900,8 +904,8 @@ static bool CollectGradInformationFromOpInfo(
   }

   std::shared_ptr<paddle::imperative::GradOpNode> grad_node =
-      op_info.dygraph_grad_op_maker_(op_type, ins, outs, attrs, default_attrs,
-                                     {});
+      op_info.dygraph_grad_op_maker_(
+          op_type, ins, outs, attrs, default_attrs, {});

   if (!grad_node) {
     VLOG(6) << "Got nullptr GradOpNode for " << op_type
@@ -977,12 +981,16 @@ static bool CollectGradInformationFromOpInfo(
   /* ------ Slot Name Matching ---- */
   for (auto& iter : *op_base_infos) {
     // grad_ins -> fwd_ins, fwd_outs
-    SlotNameMatching(iter.GetGradIns(), fwd_ins, fwd_outs,
+    SlotNameMatching(iter.GetGradIns(),
+                     fwd_ins,
+                     fwd_outs,
                      iter.GetMutableGradInsFwdSlotnameMap(),
                      iter.GetMutableGradInsGradSlotnameMap());

     // grad_outs -> fwd_ins, fwd_outs
-    SlotNameMatching(iter.GetGradOuts(), fwd_ins, fwd_outs,
+    SlotNameMatching(iter.GetGradOuts(),
+                     fwd_ins,
+                     fwd_outs,
                      iter.GetMutableGradOutsSlotnameMap(),
                      iter.GetMutableGradOutsSlotnameMap());
   }
@@ -1042,16 +1050,18 @@ static std::string GenerateGradNodeCreationContent(
           "p_autograd_" + inplace_input_name;
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
          " %s = egr::EagerUtils::autograd_meta(&%s);\n";
-      get_output_autograd_meta_str += paddle::string::Sprintf(
-          GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name,
-          inplace_input_name);
+      get_output_autograd_meta_str +=
+          paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE,
+                                  inplace_input_autograd_name,
+                                  inplace_input_name);
     } else {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
          " egr::AutogradMeta* %s = "
          "egr::EagerUtils::autograd_meta(&%s);\n";
       get_output_autograd_meta_str +=
           paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE,
-                                  output_autograd_name, output_name);
+                                  output_autograd_name,
+                                  output_name);
     }
   }
 }
@@ -1097,8 +1107,8 @@ static std::string GenerateGradNodeCreationContent(
         "require_any_grad);\n";
     for (auto& inplace_pair : inplace_map) {
       std::string inplace_name = inplace_pair.second;
-      check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE,
-                                                   inplace_name, inplace_name);
+      check_inplace_str += paddle::string::Sprintf(
+          CHECKING_INPLACE_TEMPLATE, inplace_name, inplace_name);
     }
     VLOG(6) << "Check Inplace Input";
   }
@@ -1124,9 +1134,11 @@ static std::string GenerateGradNodeCreationContent(
       " auto grad_node = std::shared_ptr<GradNode%s>(new GradNode%s(%d, "
       "%d));\n";
   grad_node_creation_str += " // Create GradOpNode\n";
-  grad_node_creation_str +=
-      paddle::string::Sprintf(GRAD_OP_NODE_TEMPLATE, op_type, op_type,
-                              bwd_in_slot_num, bwd_out_slot_num);
+  grad_node_creation_str += paddle::string::Sprintf(GRAD_OP_NODE_TEMPLATE,
+                                                    op_type,
+                                                    op_type,
+                                                    bwd_in_slot_num,
+                                                    bwd_out_slot_num);
   grad_node_creation_str += "\n";

   VLOG(6) << "Generated GradOpNode construction";
@@ -1158,13 +1170,17 @@ static std::string GenerateGradNodeCreationContent(
       // Replace output directly with input in inplace op.
       if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) {
         auto inplace_input_name = inplace_map[tensor_wrapper_name];
-        grad_node_creation_str += paddle::string::Sprintf(
-            SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
-            inplace_input_name, full_reserved);
+        grad_node_creation_str +=
+            paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE,
+                                    tensor_wrapper_name,
+                                    inplace_input_name,
+                                    full_reserved);
       } else {
-        grad_node_creation_str += paddle::string::Sprintf(
-            SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
-            tensor_wrapper_name, full_reserved);
+        grad_node_creation_str +=
+            paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE,
+                                    tensor_wrapper_name,
+                                    tensor_wrapper_name,
+                                    full_reserved);
       }
     }
   }
@@ -1189,9 +1205,10 @@ static std::string GenerateGradNodeCreationContent(
       const char* ADD_EDGES_TEMPLATE =
          " if(%s) grad_node->AddEdges(%s, %d);\n";
-      grad_node_creation_str +=
-          paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name,
-                                  input_autograd_name, input_position);
+      grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE,
+                                                        input_autograd_name,
+                                                        input_autograd_name,
+                                                        input_position);
     } else {
       compute_require_grad_args += ", &" + input_autograd_name;
       size_t input_position = fwd_inputs_name_pos_map.at(input_name);
@@ -1319,7 +1336,7 @@ static std::string GenerateGradNodeCreationContent(
       "%s"
       " {\n"
       " paddle::platform::RecordEvent node_creation_record_event(\"%s\", "
-      "paddle::platform::TracerEventType::Operator, 1);\n"
+      "paddle::platform::TracerEventType::OperatorInner, 1);\n"
       "%s"
       " if(require_any_grad) {\n"
       " VLOG(6) << \" Construct Grad for %s \"; \n"
@@ -1327,11 +1344,17 @@ static std::string GenerateGradNodeCreationContent(
       " %s\n"
       " }\n"
       " }";
-  std::string grad_node_creation_body_str = paddle::string::Sprintf(
-      GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
-      compute_require_grad_args, check_inplace_str, trace_op_body_str,
-      event_name, get_output_autograd_meta_str, op_type,
-      pass_stop_gradient_args, grad_node_creation_str);
+  std::string grad_node_creation_body_str =
+      paddle::string::Sprintf(GRAD_NODE_CREATION_TEMPLATE,
+                              prepare_autograd_meta_str,
+                              compute_require_grad_args,
+                              check_inplace_str,
+                              trace_op_body_str,
+                              event_name,
+                              get_output_autograd_meta_str,
+                              op_type,
+                              pass_stop_gradient_args,
+                              grad_node_creation_str);

   return grad_node_creation_body_str;
 }
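
With the template fix above, the node-creation block emitted into every generated forward function records as `OperatorInner` instead of `Operator`. Roughly what the generator now produces for a hypothetical "matmul" op (an illustrative expansion of the `%s` placeholders, not verbatim generator output):

```cpp
// Illustrative expansion of GRAD_NODE_CREATION_TEMPLATE for "matmul".
{
  paddle::platform::RecordEvent node_creation_record_event(
      "matmul node_creation",  // hypothetical event_name expansion
      paddle::platform::TracerEventType::OperatorInner, 1);
  if (require_any_grad) {
    VLOG(6) << " Construct Grad for matmul ";
    // ... GradNode construction, tensor wrappers, AddEdges ...
  }
}
```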
@@ -1454,8 +1477,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
       const char* FWD_INS_CONTENT_TEMPLATE =
           "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },";
-      ins_contents_str += paddle::string::Sprintf(FWD_INS_CONTENT_TEMPLATE,
-                                                  input_name, input_name);
+      ins_contents_str += paddle::string::Sprintf(
+          FWD_INS_CONTENT_TEMPLATE, input_name, input_name);
       if (input.duplicable()) {
         const char* AMP_TENSORS_VECTOR_TEMPLATE = "%s,";
         amp_tensors_vector_str +=
@@ -1518,9 +1541,14 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
         const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE =
             " auto NEW_%s = ((%s.size() > 0) ? egr::AmpAutoCasts(\"%s\", "
             "%s, amp_dst_dtype, \"%s\") : %s);\n";
-        dispensable_amp_auto_cast_str += paddle::string::Sprintf(
-            DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name,
-            input_name, input_name, op_type, input_name);
+        dispensable_amp_auto_cast_str +=
+            paddle::string::Sprintf(DISPENSABLE_AMP_AUTO_CAST_TEMPLATE,
+                                    input_name,
+                                    input_name,
+                                    input_name,
+                                    input_name,
+                                    op_type,
+                                    input_name);
       } else {
         const char* FWD_INS_CONTENT_TEMPLATE =
             " if(%s.initialized()) "
@@ -1535,9 +1563,14 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
           const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE =
               " auto NEW_%s = ((%s.initialized()) ? egr::AmpAutoCast(\"%s\", "
               "%s, amp_dst_dtype, \"%s\") : %s);\n";
-          dispensable_amp_auto_cast_str += paddle::string::Sprintf(
-              DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name,
-              input_name, input_name, op_type, input_name);
+          dispensable_amp_auto_cast_str +=
+              paddle::string::Sprintf(DISPENSABLE_AMP_AUTO_CAST_TEMPLATE,
+                                      input_name,
+                                      input_name,
+                                      input_name,
+                                      input_name,
+                                      op_type,
+                                      input_name);
         }
       }
     }
@@ -1594,9 +1627,11 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
     } else if (!inplace_map.empty() && inplace_map.count(output_name)) {
       // In inplace op, replace the output with the input directly.
       PADDLE_ENFORCE_NE(
-          inplace_map[output_name], "",
+          inplace_map[output_name],
+          "",
           paddle::platform::errors::InvalidArgument(
-              "Inplace op %s has no input corresponding to output %s.", op_type,
+              "Inplace op %s has no input corresponding to output %s.",
+              op_type,
               output_name));
       const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },";
       auto inplace_input_name = inplace_map[output_name];
@@ -1618,8 +1653,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
         amp_function_call_args_str += (", " + outnum);
         const char* FWD_OUTS_CONTENT_TEMPLATE =
             "{ \"%s\", egr::EagerUtils::CreateVars(%s) },";
-        outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE,
-                                                     output_name, outnum);
+        outs_contents_str += paddle::string::Sprintf(
+            FWD_OUTS_CONTENT_TEMPLATE, output_name, outnum);
         core_ops_args_info[op_type].push_back(outnum);
         core_ops_args_type_info[op_type].push_back("int");
       } else {
@@ -1738,9 +1773,12 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
     std::string view_strategy_str = "";
     std::string viwe_input_name = view_op_map[op_type].first;
     std::string viwe_output_name = view_op_map[op_type].second;
-    view_strategy_str += paddle::string::Sprintf(
-        HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name,
-        viwe_input_name, viwe_output_name);
+    view_strategy_str +=
+        paddle::string::Sprintf(HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT,
+                                viwe_input_name,
+                                viwe_output_name,
+                                viwe_input_name,
+                                viwe_output_name);

     generated_function_body += view_strategy_str;
     generated_function_body += "\n";
@@ -1794,26 +1832,33 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
               " if (outs.count(\"%s\")) "
               "egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n"
               " egr::EagerUtils::Output2Result(%s, &%s);\n";
-          out_tensor_str = paddle::string::Sprintf(
-              FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name,
-              output_name, output_var_args_name, output_var_args_name,
-              output_varname);
+          out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE,
+                                                   output_varname,
+                                                   output_name,
+                                                   output_name,
+                                                   output_var_args_name,
+                                                   output_var_args_name,
+                                                   output_varname);
         } else {
           const char* FWD_OUT_TENSORS_TEMPLATE =
               " std::vector<paddle::experimental::Tensor> %s;\n"
              " egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n"
              " egr::EagerUtils::Output2Result(%s, &%s);\n";
-          out_tensor_str = paddle::string::Sprintf(
-              FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name,
-              output_var_args_name, output_var_args_name, output_varname);
+          out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE,
+                                                   output_varname,
+                                                   output_name,
+                                                   output_var_args_name,
+                                                   output_var_args_name,
+                                                   output_varname);
         }
       } else {
         const char* FWD_OUT_TENSORS_TEMPLATE =
             " std::vector<paddle::experimental::Tensor> %s;\n"
            " egr::EagerUtils::GetOutputs(outs[\"%s\"], &%s);\n";
-        out_tensor_str =
-            paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, output_varname,
-                                    output_name, output_varname);
+        out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE,
+                                                 output_varname,
+                                                 output_name,
+                                                 output_varname);
       }
       return_types[return_position] =
           "std::vector<paddle::experimental::Tensor>";
@@ -1824,16 +1869,21 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
             " if (outs.count(\"%s\")) "
             "egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n"
            " paddle::experimental::Tensor& %s = *%s;\n";
-        out_tensor_str = paddle::string::Sprintf(
-            FWD_OUT_TENSOR_TEMPLATE, output_name, output_name,
-            output_var_args_name, output_varname, output_var_args_name);
+        out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
+                                                 output_name,
+                                                 output_name,
+                                                 output_var_args_name,
+                                                 output_varname,
+                                                 output_var_args_name);
       } else {
         const char* FWD_OUT_TENSOR_TEMPLATE =
            " egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n"
            " paddle::experimental::Tensor& %s = *%s;\n";
-        out_tensor_str = paddle::string::Sprintf(
-            FWD_OUT_TENSOR_TEMPLATE, output_name, output_var_args_name,
-            output_varname, output_var_args_name);
+        out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
+                                                 output_name,
+                                                 output_var_args_name,
+                                                 output_varname,
+                                                 output_var_args_name);
       }
     } else {
       if (!inplace_map.empty() && inplace_map.count(output_name)) {
@@ -1845,16 +1895,19 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
            " %s.bump_inplace_version();\n"
            " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace "
            "Strategy.\";\n";
-        out_tensor_str = paddle::string::Sprintf(
-            FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name,
-            inplace_input_name, inplace_input_name);
+        out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
+                                                 output_name,
+                                                 inplace_input_name,
+                                                 inplace_input_name,
+                                                 inplace_input_name);
       } else {
         const char* FWD_OUT_TENSOR_TEMPLATE =
            " paddle::experimental::Tensor %s;\n"
            " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n";
-        out_tensor_str =
-            paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname,
-                                    output_name, output_varname);
+        out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
+                                                 output_varname,
+                                                 output_name,
+                                                 output_varname);
       }
     }
     return_types[return_position] = "paddle::experimental::Tensor";
@@ -1964,21 +2017,28 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
       "%s\n"
       "%s\n"
       "}\n\n";
-  std::string fwd_function_str = paddle::string::Sprintf(
-      FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name,
-      dygraph_function_args_str, fwd_record_event_str, generated_function_body);
+  std::string fwd_function_str =
+      paddle::string::Sprintf(FWD_FUNCTION_TEMPLATE,
+                              function_proto_return_type_str,
+                              function_name,
+                              dygraph_function_args_str,
+                              fwd_record_event_str,
+                              generated_function_body);

   // [Generation] Generate forward functions header
   const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n";
-  std::string dygraph_function_declaration_str = paddle::string::Sprintf(
-      FWD_HEADER_TEMPLATE, function_proto_return_type_str, function_name,
-      dygraph_function_args_str);
+  std::string dygraph_function_declaration_str =
+      paddle::string::Sprintf(FWD_HEADER_TEMPLATE,
+                              function_proto_return_type_str,
+                              function_name,
+                              dygraph_function_args_str);

   return {fwd_function_str, dygraph_function_declaration_str};
 }

 static std::string GenerateSingleOpBase(
-    const std::string& fwd_op_type, const std::string& op_base_type,
+    const std::string& fwd_op_type,
+    const std::string& op_base_type,
     const std::unordered_map<std::string, size_t>& fwd_inputs_name_pos_map,
     const std::unordered_map<std::string, size_t>& fwd_outputs_name_pos_map,
     const std::vector<proto::OpProto::Var>& in_vars,
@@ -1994,7 +2054,8 @@ static std::string GenerateSingleOpBase(
         std::vector<std::shared_ptr<paddle::imperative::VariableWrapper>>>&
         grad_outs,
     const paddle::framework::AttributeMap& grad_attrs,
-    bool is_op_base_per_duplicable_input, size_t* outs_size) {
+    bool is_op_base_per_duplicable_input,
+    size_t* outs_size) {
   std::string generated_grad_function_body = "";

   const std::string& ins_name = "ins" + std::to_string(*outs_size);
@@ -2029,9 +2090,9 @@ static std::string GenerateSingleOpBase(
             "RecoverTensorWrapper("
             "&"
             "this->%s)) },";
-        ins_contents_str +=
-            paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE,
-                                    grad_input_name, struct_fwd_input_name);
+        ins_contents_str += paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE,
+                                                    grad_input_name,
+                                                    struct_fwd_input_name);
       } else if (grad_ins_grad_slotname_map.count(grad_input_name)) {
         // Fwd Tensor's Grad
@@ -2075,18 +2136,25 @@ static std::string GenerateSingleOpBase(
             " if(this->%s.size() > 0) %s[\"%s\"] = "
             "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::"
             "RecoverTensorWrapper(&this->%s));\n";
-        generated_grad_function_body += paddle::string::Sprintf(
-            DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name,
-            ins_name, grad_input_name, struct_fwd_input_name);
+        generated_grad_function_body +=
+            paddle::string::Sprintf(DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE,
+                                    struct_fwd_input_name,
+                                    ins_name,
+                                    grad_input_name,
+                                    struct_fwd_input_name);
       } else {
         const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE =
             " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);\n"
             " if(%s.defined()) %s[\"%s\"] = "
            " egr::EagerUtils::TrySyncToVars(%s);\n";
-        generated_grad_function_body += paddle::string::Sprintf(
-            DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name,
-            struct_fwd_input_name, grad_input_name, ins_name, grad_input_name,
-            grad_input_name);
+        generated_grad_function_body +=
+            paddle::string::Sprintf(DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE,
+                                    grad_input_name,
+                                    struct_fwd_input_name,
+                                    grad_input_name,
+                                    ins_name,
+                                    grad_input_name,
+                                    grad_input_name);
       }
     }
   }
@@ -2203,15 +2271,20 @@ static std::string GenerateSingleOpBase(
             " if(%s.size() > 0) %s[\"%s\"] = egr::EagerUtils::CreateVars( "
             "this->OutputMeta()[%d].size() );\n";
         generated_grad_function_body += paddle::string::Sprintf(
-            DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name,
-            grad_output_name, fwd_input_position);
+            DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE,
+            fwd_name,
+            outs_name,
+            grad_output_name,
+            fwd_input_position);
       } else {
         const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE =
             " if(%s.defined()) %s[\"%s\"] = "
             "{std::make_shared<egr::EagerVariable>(egr::Controller::"
             "Instance().GenerateUniqueName())};\n";
         generated_grad_function_body += paddle::string::Sprintf(
-            DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name,
+            DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE,
+            fwd_name,
+            outs_name,
             grad_output_name);
       }
     }
@@ -2236,8 +2309,8 @@ static std::string GenerateSingleOpBase(
         " auto temp_type = %s[\"in_dtype\"];\n"
         " %s[\"in_dtype\"] = %s[\"out_dtype\"];\n"
         " %s[\"out_dtype\"] = temp_type;\n";
-    grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name,
-                                              attrs_name, attrs_name);
+    grad_attrs_str += paddle::string::Sprintf(
+        CAST_GRAD, attrs_name, attrs_name, attrs_name, attrs_name);
   }

   // Handle dynamic grad attributes
@@ -2278,8 +2351,8 @@ static std::string GenerateSingleOpBase(
             " "
             "outputs[0].emplace_back(egr::EagerUtils::GetOutputs(%s[\"%s\"])[0]"
             ");\n";
-        outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, outs_name,
-                                               grad_out_name);
+        outputs_str += paddle::string::Sprintf(
+            BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name);
       }
       num_appended_outputs++;
     } else {
@@ -2411,11 +2484,20 @@ static std::string GenerateGradNodeCCContents(
     const auto& grad_attrs = op_base_info.GetGradAttrs();
     const std::string& op_base_type = op_base_info.GetOpBaseType();

-    generated_grad_function_body += GenerateSingleOpBase(
-        fwd_op_type, op_base_type, fwd_inputs_name_pos_map,
-        fwd_outputs_name_pos_map, in_vars, grad_ins_fwd_slotname_map,
-        grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs,
-        grad_attrs, is_op_base_per_duplicable_input, &outs_size);
+    generated_grad_function_body +=
+        GenerateSingleOpBase(fwd_op_type,
+                             op_base_type,
+                             fwd_inputs_name_pos_map,
+                             fwd_outputs_name_pos_map,
+                             in_vars,
+                             grad_ins_fwd_slotname_map,
+                             grad_ins_grad_slotname_map,
+                             grad_outs_slotname_map,
+                             grad_ins,
+                             grad_outs,
+                             grad_attrs,
+                             is_op_base_per_duplicable_input,
+                             &outs_size);
   }

   if (is_op_base_per_duplicable_input) {
@@ -2436,7 +2518,9 @@ static std::string GenerateGradNodeCCContents(
       "HandleComplexGradToRealGrad(&outputs);\n"
       " return outputs;\n";
   generated_grad_function_body =
-      paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(),
+      paddle::string::Sprintf(BWD_RETURN_TEMPLATE,
+                              fwd_op_type,
+                              in_vars.size(),
                               generated_grad_function_body);

   // [Generation] Get Full Grad Function
@@ -2455,8 +2539,10 @@ static std::string GenerateGradNodeCCContents(
         "this->InputMeta());\n";
   }
   std::string grad_function_str =
-      paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type,
-                              fill_zero_str, generated_grad_function_body);
+      paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE,
+                              fwd_op_type,
+                              fill_zero_str,
+                              generated_grad_function_body);

   VLOG(6) << "Generated returns";
@@ -2579,9 +2665,12 @@ static std::string GenerateGradNodeHeaderContents(
          " %s.emplace_back( egr::TensorWrapper(eager_tensor, %s "
          "/*full_reserved*/, %s) );\n"
          " }\n";
-      tensor_wrapper_body_str = paddle::string::Sprintf(
-          SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name,
-          struct_tensor_wrapper_name, full_reserved_str, no_need_buffer_str);
+      tensor_wrapper_body_str =
+          paddle::string::Sprintf(SET_TENSOR_WRAPPER_BODY_TEMPLATE,
+                                  tensor_wrapper_name,
+                                  struct_tensor_wrapper_name,
+                                  full_reserved_str,
+                                  no_need_buffer_str);

       const char* CLEAR_TENSOR_WRAPPER_TEMPLATE =
          "for (auto tw: %s) {\n"
@@ -2603,9 +2692,12 @@ static std::string GenerateGradNodeHeaderContents(
       const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE =
           "%s = egr::TensorWrapper(%s, %s /*full_reserved*/, %s);\n";
-      tensor_wrapper_body_str = paddle::string::Sprintf(
-          SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name,
-          tensor_wrapper_name, full_reserved_str, no_need_buffer_str);
+      tensor_wrapper_body_str =
+          paddle::string::Sprintf(SET_TENSOR_WRAPPER_BODY_TEMPLATE,
+                                  struct_tensor_wrapper_name,
+                                  tensor_wrapper_name,
+                                  full_reserved_str,
+                                  no_need_buffer_str);

       const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n";
       clear_tensor_wrappers_str += paddle::string::Sprintf(
@@ -2614,19 +2706,33 @@ static std::string GenerateGradNodeHeaderContents(
     std::string full_reserved_signature_str = "bool full_reserved";
     const char* SET_TENSOR_WRAPPER_TEMPLATE =
         " void SetTensorWrapper%s(%s, %s) {\n %s\n }\n";
-    set_tensor_wrappers_str += paddle::string::Sprintf(
-        SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
-        tensor_wrapper_arg_str, full_reserved_signature_str,
-        tensor_wrapper_body_str);
+    set_tensor_wrappers_str +=
+        paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE,
+                                tensor_wrapper_name,
+                                tensor_wrapper_arg_str,
+                                full_reserved_signature_str,
+                                tensor_wrapper_body_str);
     }
   }
   VLOG(6) << "Generated TensorWrapper";

-  std::string grad_node_str = paddle::string::Sprintf(
-      GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
-      op_type, clear_tensor_wrappers_str, op_type, op_type, op_type,
-      set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str,
-      attr_members_str);
+  std::string grad_node_str =
+      paddle::string::Sprintf(GRAD_NODE_TEMPLATE,
+                              op_type,
+                              op_type,
+                              op_type,
+                              op_type,
+                              op_type,
+                              op_type,
+                              op_type,
+                              clear_tensor_wrappers_str,
+                              op_type,
+                              op_type,
+                              op_type,
+                              set_tensor_wrappers_str,
+                              set_attr_map_str,
+                              tensor_wrapper_members_str,
+                              attr_members_str);

   return grad_node_str;
 }
@@ -2760,9 +2866,11 @@ static std::string GenerateCoreOpsReturnsInfo() {
   std::string core_ops_returns_info_init_str =
       ConvertCoreOpsInfosToString(core_ops_returns_info);

-  std::string core_ops_info_str = paddle::string::Sprintf(
-      Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str,
-      core_ops_args_type_info_init_str, core_ops_returns_info_init_str);
+  std::string core_ops_info_str =
+      paddle::string::Sprintf(Core_Ops_Returns_MAP_TEMPLATE,
+                              core_ops_args_info_init_str,
+                              core_ops_args_type_info_init_str,
+                              core_ops_returns_info_init_str);

   return core_ops_info_str;
 }
...
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -71,7 +71,7 @@ PARSE_PYTHON_C_ARGS_TEMPLATE = \
 RECORD_EVENT_TEMPLATE = \
-    " paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);"
+    "paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::UserDefined, 1);"

 RETURN_INPLACE_PYOBJECT_TEMPLATE = \
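
`RECORD_EVENT_TEMPLATE` is format-string source text that this Python generator pastes into the generated Python-C bindings, so the change lands in C++ at build time. The binding wrapper is framework glue rather than an operator run, hence `UserDefined`. A hedged example of the text the template now yields (the guard variable name and the `"{} {}"` expansion are assumptions):

```cpp
// Hypothetical expansion of RECORD_EVENT_TEMPLATE for a "matmul" binding:
paddle::platform::RecordEvent pythonc_record_event(
    "matmul pybind_imperative_func",
    paddle::platform::TracerEventType::UserDefined, 1);
```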
@@ -253,6 +253,7 @@ NAMESPACE_WRAPPER_TEMPLATE = \
 #######################
 ## Generator Classes ##
 #######################
 class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
+
     def __init__(self, forward_api_contents, namespace):
         # Members from Parent:
         #self.namespace
@@ -265,7 +266,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
         #self.forward_outputs_position_map
         #self.optional_inputs
         #self.no_need_buffers
         #self.intermediate_outputs
         #self.inplace_map
         FunctionGeneratorBase.__init__(self, forward_api_contents, namespace)
@@ -327,8 +328,8 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
         set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str)

         # Generate Dygraph Function Call Logic
-        num_args = len(forward_inputs_position_map.keys()) + len(
-            orig_forward_attrs_list)
+        num_args = len(
+            forward_inputs_position_map.keys()) + len(orig_forward_attrs_list)
         dygraph_function_call_list = ["" for i in range(num_args)]
         for name, (_, pos) in forward_inputs_position_map.items():
             dygraph_function_call_list[pos] = f"{name}"
@@ -336,7 +337,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
             dygraph_function_call_list[pos] = f"{name}"
         dygraph_function_call_str = ",".join(dygraph_function_call_list)

         # Generate Python-C Function Definitions
         if is_forward_only:
             fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
                 "paddle::experimental::", namespace, forward_api_name)
@@ -441,8 +442,9 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
 class PythonCYamlGenerator(YamlGeneratorBase):
+
     def __init__(self, path):
         # Parent members:
         # self.namespace
         # self.api_yaml_path
         # self.forward_api_list
@@ -457,8 +459,8 @@ class PythonCYamlGenerator(YamlGeneratorBase):
         forward_api_list = self.forward_api_list
         for forward_api_content in forward_api_list:
-            f_generator = PythonCSingleFunctionGenerator(forward_api_content,
-                                                         namespace)
+            f_generator = PythonCSingleFunctionGenerator(
+                forward_api_content, namespace)

             status = f_generator.run()
             if status == True:
...
@@ -30,10 +30,10 @@
 namespace egr {

 /*
 * GeneralGrad is Helpper class to implement custom grad operation between
 * outputs and inputs.
 *
 * **/
 class GeneralGrad {
  public:
   static GeneralGrad& Instance() { return *general_grad_; }
@@ -64,7 +64,8 @@ class GeneralGrad {
               paddle::platform::errors::Fatal(
                   "There is no grad op for %s:[%d] or it's"
                   "stop_gradient=True.",
-                  msg, i));
+                  msg,
+                  i));
       if (is_no_grad_vars) {
         (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta;
       } else {  // normal input
@@ -248,7 +249,8 @@ class GeneralGrad {
   std::vector<paddle::experimental::Tensor> GetResults(
       const std::vector<paddle::experimental::Tensor>& inputs,
-      bool allow_unused, bool create_graph) {
+      bool allow_unused,
+      bool create_graph) {
     VLOG(6) << "Running in GetResults";
     if (inputs.empty()) return {};
@@ -276,7 +278,8 @@ class GeneralGrad {
         tensor_auto_grad_meta->SetStopGradient(!create_graph);
         results.emplace_back(iter->second);
       } else {
-        PADDLE_ENFORCE_EQ(allow_unused, true,
+        PADDLE_ENFORCE_EQ(allow_unused,
+                          true,
                           paddle::platform::errors::InvalidArgument(
                               "The %d-th input does not appear in the backward "
                               "graph. Please check the input tensor or set "
@@ -493,7 +496,8 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
 void EnforceGradNodeHasInput(GradNodeBase* node) {
   VLOG(6) << "Running in EnforceGradNodeHasInput";
   PADDLE_ENFORCE_NE(
-      node->IsTensorWrappersCleared(), true,
+      node->IsTensorWrappersCleared(),
+      true,
       paddle::platform::errors::Fatal(
           "The TensorWrappers of %s do not exist. This may be because:\n"
           "You calculate backward twice for the same subgraph without "
@@ -509,10 +513,13 @@ void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
   for (auto in : inputs) {
     AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
     PADDLE_ENFORCE_EQ(
-        visisted_ins.count(auto_grad_meta), 0,
+        visisted_ins.count(auto_grad_meta),
+        0,
         paddle::platform::errors::AlreadyExists(
-            "%s contain duplicate tensor %s, please check %s carefully.", msg,
-            in.name(), msg));
+            "%s contain duplicate tensor %s, please check %s carefully.",
+            msg,
+            in.name(),
+            msg));
     visisted_ins.insert(auto_grad_meta);
   }
 }
@@ -522,7 +529,8 @@ GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();
 std::vector<paddle::experimental::Tensor> RunBackward(
     const std::vector<paddle::experimental::Tensor>& tensors,  // output
     const std::vector<paddle::experimental::Tensor>& grad_tensors,
-    bool retain_graph, bool create_graph = false,
+    bool retain_graph,
+    bool create_graph = false,
     const std::vector<paddle::experimental::Tensor>& inputs = {},
     bool allow_unused = false,
     const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
@@ -631,8 +639,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
   if (is_general_grad) {
     // Prepare several vital preprocess for GeneralGrad
-    GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue,
-                                                   node_input_buffers_dict);
+    GeneralGrad::Instance().PreparedForGeneralGrad(
+        inputs, no_grad_vars, &queue, node_input_buffers_dict);
   }

   VLOG(6) << " startup_ops' size is :" << queue.size();
@@ -651,7 +659,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
     paddle::platform::RecordEvent node_record_event(
         std::string((*node).name()) + " grad_node",
-        paddle::platform::TracerEventType::Operator, 1);
+        paddle::platform::TracerEventType::Operator,
+        1);

     if (queue.size() > 1 && node_in_degree_map[node] != 0) {
       queue.pop();
@@ -716,7 +725,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
                 "Number of edges should be either empty ( for leaf node "
                 ") or the same as number of output grad tensors, but we "
                 "got edges size is: %d, grad_output size is: %d",
-                edges.size(), grad_output_tensors.size()));
+                edges.size(),
+                grad_output_tensors.size()));

     for (size_t i = 0; i < edges.size(); i++) {
       for (size_t j = 0; j < edges[i].size(); j++) {
@@ -739,7 +749,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
         }

         PADDLE_ENFORCE_LT(
-            j, grad_output_tensors[i].size(),
+            j,
+            grad_output_tensors[i].size(),
             paddle::platform::errors::Fatal(
                 "Rank of grad_output_tensors should be less than "
                 "grad_output_tensors[i].size(), which is: %d. This error may "
@@ -771,9 +782,10 @@ std::vector<paddle::experimental::Tensor> RunBackward(
         VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
                 << ", rank: " << edge_rank.second;

-        node_input_buffers_dict[next_node]->add(
-            edge_rank.first, edge_rank.second, grad_output_tensor,
-            create_graph);
+        node_input_buffers_dict[next_node]->add(edge_rank.first,
+                                                edge_rank.second,
+                                                grad_output_tensor,
+                                                create_graph);

         // Update queue
         node_in_degree_map[next_node]--;
@@ -810,7 +822,7 @@ void Backward(
     bool retain_graph) {
   VLOG(6) << "Run in Backward";
   paddle::platform::RecordEvent backward_record_event(
-      "backward", paddle::platform::TracerEventType::Operator, 1);
+      "backward", paddle::platform::TracerEventType::UserDefined, 1);
   RunBackward(tensors, grad_tensors, retain_graph);
   phi::autotune::AutoTuneStatus::Instance().Update();
 }
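
`Backward` is a public entry point that wraps the whole backward pass, so its umbrella span moves to `UserDefined`; the per-node `" grad_node"` spans opened inside `RunBackward` (reflowed a few hunks up) keep `TracerEventType::Operator`. A minimal sketch of the resulting nesting, reusing only calls visible in this file (include path assumed):

```cpp
#include <string>
#include "paddle/fluid/platform/profiler/event_tracing.h"  // assumed path

void SketchBackwardSpans(const std::string& node_name) {
  // Outer umbrella span: framework-level work, now typed UserDefined.
  paddle::platform::RecordEvent backward_record_event(
      "backward", paddle::platform::TracerEventType::UserDefined, 1);
  // Inside RunBackward's node loop, each grad node still records Operator:
  paddle::platform::RecordEvent node_record_event(
      node_name + " grad_node",
      paddle::platform::TracerEventType::Operator, 1);
}  // both guards close when they go out of scope
```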
@@ -819,14 +831,22 @@ std::vector<paddle::experimental::Tensor> Grad(
     const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
     const std::vector<paddle::experimental::Tensor>& inputs,
     const std::vector<paddle::experimental::Tensor>& grad_tensors,
-    bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused,
+    bool retain_graph,
+    bool create_graph,
+    bool only_inputs,
+    bool allow_unused,
     const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
   VLOG(6) << "Run in Grad";

   DuplicateCheck(inputs, true /* is_input */);
   DuplicateCheck(tensors, false /* is_input */);

-  return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs,
-                     allow_unused, no_grad_vars);
+  return RunBackward(tensors,
+                     grad_tensors,
+                     retain_graph,
+                     create_graph,
+                     inputs,
+                     allow_unused,
+                     no_grad_vars);
 }

 }  // namespace egr
@@ -588,7 +588,7 @@ void ChromeTracingLogger::StartLog() {
       std::string(
           R"JSON(
   {
-    "id": %d, "name": "%s", "totalGlobalMem": %u,
+    "id": %d, "name": "%s", "totalGlobalMem": %llu,
     "computeMajor": %d, "computeMinor": %d,
     "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
     "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
@@ -618,7 +618,7 @@ void ChromeTracingLogger::StartLog() {
       std::string(
          R"JSON(
   {
-    "id": %d, "name": "%s", "totalGlobalMem": %u,
+    "id": %d, "name": "%s", "totalGlobalMem": %llu,
     "computeMajor": %d, "computeMinor": %d,
     "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
     "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
...
@@ -19,6 +19,7 @@ import paddle.profiler as profiler
 class HostPythonNode:
+
     def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
         self.name = name
         self.type = type
@@ -32,6 +33,7 @@ class HostPythonNode:
 class DevicePythonNode:
+
     def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
                  stream_id):
         self.name = name
@@ -44,6 +46,7 @@ class DevicePythonNode:
 class TestProfilerStatistic(unittest.TestCase):
+
     def test_statistic_case1(self):
         root_node = HostPythonNode('Root Node',
                                    profiler.TracerEventType.UserDefined, 0,
@@ -54,14 +57,16 @@ class TestProfilerStatistic(unittest.TestCase):
         dataloader_node = HostPythonNode('Dataloader',
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
-        yolonet_node = HostPythonNode(
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
+        yolonet_node = HostPythonNode('Yolov3Net',
+                                      profiler.TracerEventType.Forward, 50, 110,
+                                      1000, 1001)

-        userdefined_node = HostPythonNode('Communication Time',
-                                          profiler.TracerEventType.UserDefined,
-                                          100, 110, 1000, 1001)
+        userdefined_node = HostPythonNode(
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
+            100, 110, 1000, 1001)
         communication_node = HostPythonNode(
             'Communication', profiler.TracerEventType.Communication, 105, 110,
@@ -72,8 +77,9 @@ class TestProfilerStatistic(unittest.TestCase):
         optimization_node = HostPythonNode(
             'Optimization', profiler.TracerEventType.Optimization, 220, 300,
             1000, 1001)
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 40,
+                                     1000, 1001)
         sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                               profiler.TracerEventType.Operator,
                                               60, 100, 1000, 1001)
@@ -92,10 +98,12 @@ class TestProfilerStatistic(unittest.TestCase):
         conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                            profiler.TracerEventType.CudaRuntime,
                                            35, 40, 1000, 1001)
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
-        conv2d_memcpy = DevicePythonNode(
-            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
+                                         profiler.TracerEventType.Memcpy, 50,
+                                         60, 0, 0, 0)
         sync_batch_norm_infer_shape = HostPythonNode(
             'sync_batch_norm::infer_shape',
             profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
@@ -146,8 +154,8 @@ class TestProfilerStatistic(unittest.TestCase):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
@@ -180,7 +188,7 @@ class TestProfilerStatistic(unittest.TestCase):
                 0, profiler.TracerEventType.Memcpy), 60)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.UserDefined), 25)
+                profiler.TracerEventType.UserDefined), 15)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
                 profiler.TracerEventType.Communication), 5)
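
The expected `UserDefined` CPU sum drops from 25 to 15 because the 'Communication Time' node above (start_ns 100, end_ns 110) is now typed `PythonUserDefined` and leaves that bucket; nothing else about the timeline changed. The arithmetic as a self-checking sketch (values copied from this test):

```cpp
#include <cassert>

int main() {
  const long long old_user_defined_sum = 25;      // pre-change expectation
  const long long reclassified_span = 110 - 100;  // 'Communication Time' node
  assert(old_user_defined_sum - reclassified_span == 15);
  return 0;
}
```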
...@@ -200,8 +208,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -200,8 +208,9 @@ class TestProfilerStatistic(unittest.TestCase):
0) 0)
self.assertEqual( self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] self.assertEqual(
.general_gpu_time, 60) event_summary.memory_manipulation_items['AsyncMemcpy'].
general_gpu_time, 60)
print( print(
profiler.profiler_statistic._build_table( profiler.profiler_statistic._build_table(
statistic_data, statistic_data,
@@ -222,14 +231,16 @@ class TestProfilerStatistic(unittest.TestCase):
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
-        yolonet_node = HostPythonNode(
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
+        yolonet_node = HostPythonNode('Yolov3Net',
+                                      profiler.TracerEventType.Forward, 50, 110,
+                                      1000, 1001)
-        userdefined_node = HostPythonNode('Communication Time',
-                                          profiler.TracerEventType.UserDefined,
-                                          100, 110, 1000, 1001)
+        userdefined_node = HostPythonNode(
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
+            100, 110, 1000, 1001)
         allreduce_launchkernel0 = HostPythonNode(
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
             1000, 1001)
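Editor's note: besides the yapf reflow, this hunk retags the test's user-defined node from `TracerEventType.UserDefined` to `TracerEventType.PythonUserDefined`, matching the profiler fix so that only Python-side user events are aggregated into the user-defined summary. A minimal sketch of that distinction, with a reduced stand-in enum rather than Paddle's actual classes:

```python
# Sketch only: TracerEventType here is a stand-in, not paddle.profiler's enum.
from enum import Enum


class TracerEventType(Enum):
    UserDefined = 0        # C++-side record, e.g. framework-internal markers
    PythonUserDefined = 1  # recorded from the Python API
    Operator = 2


def userdefined_names(events):
    """Names that a summary like userdefined_items would now keep."""
    return [
        name for name, etype in events
        if etype == TracerEventType.PythonUserDefined
    ]


events = [('Communication Time', TracerEventType.PythonUserDefined),
          ('internal marker', TracerEventType.UserDefined)]
assert userdefined_names(events) == ['Communication Time']
```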
@@ -263,8 +274,9 @@ class TestProfilerStatistic(unittest.TestCase):
         optimization_node = HostPythonNode(
             'Optimization', profiler.TracerEventType.Optimization, 220, 300,
             1000, 1001)
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 40,
+                                     1000, 1001)
         sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                               profiler.TracerEventType.Operator,
                                               60, 100, 1000, 1001)
@@ -283,10 +295,12 @@ class TestProfilerStatistic(unittest.TestCase):
         conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                            profiler.TracerEventType.CudaRuntime,
                                            35, 40, 1000, 1001)
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
-        conv2d_memcpy = DevicePythonNode(
-            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
+                                         profiler.TracerEventType.Memcpy, 50,
+                                         60, 0, 0, 0)
         sync_batch_norm_infer_shape = HostPythonNode(
             'sync_batch_norm::infer_shape',
             profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
@@ -363,8 +377,8 @@ class TestProfilerStatistic(unittest.TestCase):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
         distributed_summary = statistic_data.distributed_summary
@@ -398,7 +412,7 @@ class TestProfilerStatistic(unittest.TestCase):
                 0, profiler.TracerEventType.Memcpy), 60)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.UserDefined), 25)
+                profiler.TracerEventType.UserDefined), 15)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
                 profiler.TracerEventType.Communication), 5)
@@ -433,8 +447,9 @@ class TestProfilerStatistic(unittest.TestCase):
             0)
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
-        self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
-                         .general_gpu_time, 60)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
@@ -454,8 +469,9 @@ class TestProfilerStatistic(unittest.TestCase):
         dataloader_node = HostPythonNode('Dataloader',
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
         backward_node = HostPythonNode('Gradient Backward',
                                        profiler.TracerEventType.Backward, 120,
@@ -463,12 +479,13 @@ class TestProfilerStatistic(unittest.TestCase):
         optimization_node = HostPythonNode(
             'Optimization', profiler.TracerEventType.Optimization, 220, 300,
             1000, 1001)
-        userdefined_node = HostPythonNode('Communication Time',
-                                          profiler.TracerEventType.UserDefined,
-                                          60, 70, 1000, 1001)
+        userdefined_node = HostPythonNode(
+            'Communication Time', profiler.TracerEventType.PythonUserDefined,
+            60, 70, 1000, 1001)
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 25,
+                                     1000, 1001)
         conv2d_infer_shape = HostPythonNode(
             'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
@@ -480,8 +497,9 @@ class TestProfilerStatistic(unittest.TestCase):
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25,
             1000, 1001)
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         35, 0, 0, 0)
         another_kernel = DevicePythonNode(
             'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
             profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
@@ -500,15 +518,16 @@ class TestProfilerStatistic(unittest.TestCase):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
         self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
         self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
-        self.assertEqual(event_summary.userdefined_items['Communication Time']
-                         .general_gpu_time, 0)
+        self.assertEqual(
+            event_summary.userdefined_items['Communication Time'].
+            general_gpu_time, 0)
         for sort_key in [
                 profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
                 profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
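Editor's note: this third test case deliberately builds nodes whose start and end timestamps coincide (conv2d at 25-25, the kernels at 35-35), so all the asserted durations are 0 and the summary code must tolerate empty totals. A minimal sketch of the edge case, with illustrative names only:

```python
# Sketch only: safe_ratio is a hypothetical helper, not Paddle's API. It shows
# the guard needed when start_ns == end_ns makes every total zero.
def safe_ratio(part_ns, total_ns):
    """Return part/total, treating an empty total as ratio 0."""
    return float(part_ns) / total_ns if total_ns else 0.0


events = [(25, 25), (35, 35)]  # (start_ns, end_ns) pairs with zero duration
total = sum(end - start for start, end in events)
assert total == 0 and safe_ratio(total, total) == 0.0
```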
@@ -516,12 +535,11 @@ class TestProfilerStatistic(unittest.TestCase):
                 profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
         ]:
             print(
-                profiler.profiler_statistic._build_table(
-                    statistic_data,
-                    sorted_by=sort_key,
-                    op_detail=True,
-                    thread_sep=False,
-                    time_unit='ms'))
+                profiler.profiler_statistic._build_table(statistic_data,
+                                                         sorted_by=sort_key,
+                                                         op_detail=True,
+                                                         thread_sep=False,
+                                                         time_unit='ms'))


 if __name__ == '__main__':
...
@@ -197,8 +197,8 @@ class TimeRangeSummary:
     def __init__(self):
         self.CPUTimeRange = collections.defaultdict(list)
         self.GPUTimeRange = collections.defaultdict(
-            lambda: collections.defaultdict(list)
-        )  # GPU events should be divided into different devices
+            lambda: collections.defaultdict(
+                list))  # GPU events should be divided into different devices
         self.CPUTimeRangeSum = collections.defaultdict(int)
         self.GPUTimeRangeSum = collections.defaultdict(
             lambda: collections.defaultdict(int))
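Editor's note: the reflowed expression is the same nested-defaultdict layout as before; only yapf's line breaking changed. For readers unfamiliar with the pattern, a minimal standalone sketch of why it is used here:

```python
# Sketch only: GPU time ranges are keyed first by device_id, then by event
# type, so new devices and types can be appended to without initialization.
import collections

gpu_time_range = collections.defaultdict(
    lambda: collections.defaultdict(list))  # device_id -> event_type -> ranges
gpu_time_range[0]['Kernel'].append((35, 50))  # no setup needed for device 0
assert gpu_time_range[0]['Kernel'] == [(35, 50)]
```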
@@ -212,8 +212,8 @@ class TimeRangeSummary:
         for threadid, hostnodes in thread2hostnodes.items():
             CPUTimeRange = collections.defaultdict(list)
             GPUTimeRange = collections.defaultdict(
-                lambda: collections.defaultdict(lambda: collections.defaultdict(list))
-            )  # device_id/type/stream_id
+                lambda: collections.defaultdict(lambda: collections.defaultdict(
                    list)))  # device_id/type/stream_id
             for hostnode in hostnodes[1:]:  #skip root node
                 CPUTimeRange[hostnode.type].append(
                     (hostnode.start_ns, hostnode.end_ns))
@@ -235,8 +235,8 @@ class TimeRangeSummary:
         for device_id, device_time_ranges in GPUTimeRange.items():
             for event_type, event_time_ranges in device_time_ranges.items():
                 for stream_id, time_ranges in event_time_ranges.items():
-                    time_ranges = merge_self_ranges(
-                        time_ranges, is_sorted=False)
+                    time_ranges = merge_self_ranges(time_ranges,
+                                                    is_sorted=False)
                     self.GPUTimeRange[device_id][event_type] = merge_ranges(
                         self.GPUTimeRange[device_id][event_type],
                         time_ranges,
@@ -310,25 +310,27 @@ class DistributedSummary:
                     for devicenode in runtimenode.device_node:
                         if devicenode.type == TracerEventType.Kernel:
                             if 'nccl' in devicenode.name.lower():
-                                self.gpu_communication_range.append((
-                                    devicenode.start_ns, devicenode.end_ns))
+                                self.gpu_communication_range.append(
+                                    (devicenode.start_ns,
+                                     devicenode.end_ns))
                             else:
-                                self.computation_range.append((
-                                    devicenode.start_ns, devicenode.end_ns))
+                                self.computation_range.append(
+                                    (devicenode.start_ns,
+                                     devicenode.end_ns))
         self.cpu_calls = len(set(self.cpu_communication_range))
         self.gpu_calls = len(set(self.gpu_communication_range))
         self.cpu_communication_range = merge_self_ranges(
             self.cpu_communication_range, is_sorted=False)
         self.gpu_communication_range = merge_self_ranges(
             self.gpu_communication_range, is_sorted=False)
-        self.communication_range = merge_ranges(
-            self.cpu_communication_range,
-            self.gpu_communication_range,
-            is_sorted=True)
-        self.computation_range = merge_self_ranges(
-            self.computation_range, is_sorted=False)
-        self.overlap_range = intersection_ranges(
-            self.communication_range, self.computation_range, is_sorted=True)
+        self.communication_range = merge_ranges(self.cpu_communication_range,
+                                                self.gpu_communication_range,
+                                                is_sorted=True)
+        self.computation_range = merge_self_ranges(self.computation_range,
+                                                   is_sorted=False)
+        self.overlap_range = intersection_ranges(self.communication_range,
+                                                 self.computation_range,
+                                                 is_sorted=True)


 class EventSummary:
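Editor's note: the `intersection_ranges` call above computes communication/computation overlap as the pairwise intersection of two lists of already-merged intervals. A minimal sketch of that idea under the assumption of sorted, disjoint half-open ranges (the helper below is illustrative, not Paddle's implementation):

```python
# Sketch only: two-pointer intersection of sorted, disjoint (start, end) lists.
def intersect_ranges(a, b):
    out, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        start = max(a[i][0], b[j][0])
        end = min(a[i][1], b[j][1])
        if start < end:  # the intervals actually overlap
            out.append((start, end))
        # advance whichever interval ends first
        if a[i][1] < b[j][1]:
            i += 1
        else:
            j += 1
    return out


assert intersect_ranges([(0, 10), (20, 30)], [(5, 25)]) == [(5, 10), (20, 25)]
```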
@@ -337,6 +339,7 @@ class EventSummary:
     """

     class DeviceItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -360,6 +363,7 @@ class EventSummary:
             self.add_gpu_time(node.end_ns - node.start_ns)

     class OperatorItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -430,6 +434,7 @@ class EventSummary:
                 self.devices[name].add_item(devicenode)

     class GeneralItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -513,7 +518,8 @@ class EventSummary:
                         or 'memset' in host_statistic_node.name.lower():
                     self.add_memory_manipulation_item(host_statistic_node)
                 else:
-                    self.add_userdefined_item(host_statistic_node)
+                    if host_statistic_node.type == TracerEventType.PythonUserDefined:
+                        self.add_userdefined_item(host_statistic_node)
             self.add_kernel_item(host_statistic_nodes[0])

         for threadid, root_statistic_node in node_statistic_trees.items():
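Editor's note: the hunk above is the core behavioral fix of this cherry-pick. Previously every non-memcpy/memset host event fell through into the user-defined summary; now it is added only when it carries the Python-side event type. A minimal sketch of the before/after dispatch, using stand-in dicts and strings rather than Paddle's node classes:

```python
# Sketch only: 'node' and 'summary' are illustrative stand-ins. The string
# comparison mirrors the new TracerEventType.PythonUserDefined guard.
def dispatch(node, summary):
    name = node['name'].lower()
    if 'memcpy' in name or 'memset' in name:
        summary['memory'].append(node['name'])
    elif node['type'] == 'PythonUserDefined':  # the new guard
        summary['userdefined'].append(node['name'])
    # other host events now fall through instead of being double-counted


summary = {'memory': [], 'userdefined': []}
dispatch({'name': 'AsyncMemcpy', 'type': 'UserDefined'}, summary)
dispatch({'name': 'op marker', 'type': 'UserDefined'}, summary)  # now skipped
dispatch({'name': 'Communication Time', 'type': 'PythonUserDefined'}, summary)
assert summary == {'memory': ['AsyncMemcpy'],
                   'userdefined': ['Communication Time']}
```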
@@ -688,13 +694,14 @@ def _build_table(statistic_data,
     append(row_format.format(*headers))
     append(header_sep)
     row_values = [
-        'CPU(Process)', format_ratio(
-            float(statistic_data.extra_info['Process Cpu Utilization']))
+        'CPU(Process)',
+        format_ratio(float(
+            statistic_data.extra_info['Process Cpu Utilization']))
     ]
     append(row_format.format(*row_values))
     row_values = [
-        'CPU(System)', format_ratio(
-            float(statistic_data.extra_info['System Cpu Utilization']))
+        'CPU(System)',
+        format_ratio(float(statistic_data.extra_info['System Cpu Utilization']))
     ]
     append(row_format.format(*row_values))
     for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
@@ -783,20 +790,22 @@ def _build_table(statistic_data,
             TracerEventType.
             Communication] = statistic_data.distributed_summary.gpu_calls

-    sorted_items = sorted(
-        cpu_type_time.items(), key=lambda x: x[1], reverse=True)
+    sorted_items = sorted(cpu_type_time.items(),
+                          key=lambda x: x[1],
+                          reverse=True)

     event_type, time = sorted_items[0]
     row_values = [
         '{}'.format(str(event_type).split('.')[1]), cpu_call_times[event_type],
-        format_time(
-            time, unit=time_unit), format_ratio(float(time) / total_time)
+        format_time(time, unit=time_unit),
+        format_ratio(float(time) / total_time)
     ]
     append(row_format.format(*row_values))
     for event_type, time in sorted_items[1:]:
         row_values = [
             '  {}'.format(str(event_type).split('.')[1]),
-            cpu_call_times[event_type], format_time(
-                time, unit=time_unit), format_ratio(float(time) / total_time)
+            cpu_call_times[event_type],
+            format_time(time, unit=time_unit),
+            format_ratio(float(time) / total_time)
         ]
         append(row_format.format(*row_values))
     append(header_sep)
@@ -806,8 +815,9 @@ def _build_table(statistic_data,
     for event_type, time in gpu_type_time.items():
         row_values = [
             '  {}'.format(str(event_type).split('.')[1]),
-            gpu_call_times[event_type], format_time(
-                time, unit=time_unit), format_ratio(float(time) / total_time)
+            gpu_call_times[event_type],
+            format_time(time, unit=time_unit),
+            format_ratio(float(time) / total_time)
         ]
         append(row_format.format(*row_values))
@@ -851,24 +861,16 @@ def _build_table(statistic_data,
             row_values = [
                 '{}'.format(name), item.call,
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.cpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_cpu_time, unit=time_unit),
-                    format_time(
-                        item.max_cpu_time, unit=time_unit),
-                    format_time(
-                        item.min_cpu_time, unit=time_unit),
+                    format_time(item.cpu_time, unit=time_unit),
+                    format_time(item.avg_cpu_time, unit=time_unit),
+                    format_time(item.max_cpu_time, unit=time_unit),
+                    format_time(item.min_cpu_time, unit=time_unit),
                     format_ratio(float(item.cpu_time) / total_time)),
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.gpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_gpu_time, unit=time_unit),
-                    format_time(
-                        item.max_gpu_time, unit=time_unit),
-                    format_time(
-                        item.min_gpu_time, unit=time_unit),
+                    format_time(item.gpu_time, unit=time_unit),
+                    format_time(item.avg_gpu_time, unit=time_unit),
+                    format_time(item.max_gpu_time, unit=time_unit),
+                    format_time(item.min_gpu_time, unit=time_unit),
                     format_ratio(gpu_ratio))
             ]
             all_row_values.append(row_values)
@@ -884,12 +886,10 @@ def _build_table(statistic_data,
         gpu_ratio = float(other_gpu_time) / gpu_total_time
         row_values = [
             '  Others', '-', '{} / - / - / - / {}'.format(
-                format_time(
-                    other_time, unit=time_unit),
+                format_time(other_time, unit=time_unit),
                 format_ratio(float(other_time) / total_time)),
             '{} / - / - / - / {}'.format(
-                format_time(
-                    other_gpu_time, unit=time_unit),
+                format_time(other_gpu_time, unit=time_unit),
                 format_ratio(gpu_ratio))
         ]
         all_row_values.append(row_values)
@@ -971,28 +971,28 @@ def _build_table(statistic_data,
         overlap_time = sum_ranges(
             statistic_data.distributed_summary.overlap_range)
         row_values = [
-            'ProfileStep', format_time(
-                total_time, unit=time_unit),
+            'ProfileStep',
+            format_time(total_time, unit=time_unit),
             format_ratio(float(total_time) / total_time)
         ]
         append(row_format.format(*row_values))
         row_values = [
-            '  Communication', format_time(
-                communication_time, unit=time_unit),
+            '  Communication',
+            format_time(communication_time, unit=time_unit),
             format_ratio(float(communication_time) / total_time)
         ]
         append(row_format.format(*row_values))
         row_values = [
-            '  Computation', format_time(
-                computation_time, unit=time_unit),
+            '  Computation',
+            format_time(computation_time, unit=time_unit),
             format_ratio(float(computation_time) / total_time)
         ]
         append(row_format.format(*row_values))
         row_values = [
-            '  Overlap', format_time(
-                overlap_time, unit=time_unit),
+            '  Overlap',
+            format_time(overlap_time, unit=time_unit),
             format_ratio(float(overlap_time) / total_time)
         ]
         append(row_format.format(*row_values))
@@ -1026,39 +1026,35 @@ def _build_table(statistic_data,
     for thread_id, items in thread_items.items():
         all_row_values.append("Thread: {}".format(thread_id))
         if sorted_by == SortedKeys.CPUTotal:
-            sorted_items = sorted(
-                items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].cpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.CPUAvg:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].avg_cpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].avg_cpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.CPUMax:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].max_cpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].max_cpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.CPUMin:
-            sorted_items = sorted(
-                items.items(), key=lambda x: x[1].min_cpu_time)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].min_cpu_time)
         elif sorted_by == SortedKeys.GPUTotal:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].general_gpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].general_gpu_time,
+                                  reverse=True)
        elif sorted_by == SortedKeys.GPUAvg:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].avg_general_gpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].avg_general_gpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.GPUMax:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].max_general_gpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].max_general_gpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.GPUMin:
-            sorted_items = sorted(
-                items.items(), key=lambda x: x[1].min_general_gpu_time)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].min_general_gpu_time)

         total_op_cpu_time = 0
         total_op_gpu_time = 0
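Editor's note: the `elif` ladder above is unchanged in behavior; only the argument layout moved. As a hedged aside (not part of this patch), the same dispatch is often written as a key-function table, which keeps each sort key on one line. A sketch with illustrative names:

```python
# Sketch only: Item and SORT_KEYS are hypothetical stand-ins for the
# profiler's per-event items and SortedKeys dispatch.
from dataclasses import dataclass


@dataclass
class Item:
    cpu_time: int = 0
    min_cpu_time: int = 0


SORT_KEYS = {
    'CPUTotal': (lambda x: x[1].cpu_time, True),  # (key function, reverse)
    'CPUMin': (lambda x: x[1].min_cpu_time, False),
}


def sort_items(items, sorted_by):
    key, reverse = SORT_KEYS[sorted_by]
    return sorted(items.items(), key=key, reverse=reverse)


items = {'a': Item(cpu_time=5), 'b': Item(cpu_time=9)}
assert [name for name, _ in sort_items(items, 'CPUTotal')] == ['b', 'a']
```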
@@ -1077,24 +1073,16 @@ def _build_table(statistic_data,
                 gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time
             row_values = [
                 name, item.call, '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.cpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_cpu_time, unit=time_unit),
-                    format_time(
-                        item.max_cpu_time, unit=time_unit),
-                    format_time(
-                        item.min_cpu_time, unit=time_unit),
+                    format_time(item.cpu_time, unit=time_unit),
+                    format_time(item.avg_cpu_time, unit=time_unit),
+                    format_time(item.max_cpu_time, unit=time_unit),
+                    format_time(item.min_cpu_time, unit=time_unit),
                     format_ratio(cpu_ratio)),
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.max_general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.min_general_gpu_time, unit=time_unit),
+                    format_time(item.general_gpu_time, unit=time_unit),
+                    format_time(item.avg_general_gpu_time, unit=time_unit),
+                    format_time(item.max_general_gpu_time, unit=time_unit),
+                    format_time(item.min_general_gpu_time, unit=time_unit),
                     format_ratio(gpu_ratio))
             ]
             all_row_values.append(row_values)
@@ -1117,28 +1105,24 @@ def _build_table(statistic_data,
                     row_values = [
                         '  {}'.format(innerop_name), innerop_node.call,
                         '{} / {} / {} / {} / {}'.format(
-                            format_time(
-                                innerop_node.cpu_time, unit=time_unit),
-                            format_time(
-                                innerop_node.avg_cpu_time, unit=time_unit),
-                            format_time(
-                                innerop_node.max_cpu_time, unit=time_unit),
-                            format_time(
-                                innerop_node.min_cpu_time, unit=time_unit),
+                            format_time(innerop_node.cpu_time,
+                                        unit=time_unit),
+                            format_time(innerop_node.avg_cpu_time,
+                                        unit=time_unit),
+                            format_time(innerop_node.max_cpu_time,
+                                        unit=time_unit),
+                            format_time(innerop_node.min_cpu_time,
+                                        unit=time_unit),
                             format_ratio(cpu_ratio)),
                         '{} / {} / {} / {} / {}'.format(
-                            format_time(
-                                innerop_node.general_gpu_time,
-                                unit=time_unit),
-                            format_time(
-                                innerop_node.avg_general_gpu_time,
-                                unit=time_unit),
-                            format_time(
-                                innerop_node.max_general_gpu_time,
-                                unit=time_unit),
-                            format_time(
-                                innerop_node.min_general_gpu_time,
-                                unit=time_unit),
+                            format_time(innerop_node.general_gpu_time,
+                                        unit=time_unit),
+                            format_time(innerop_node.avg_general_gpu_time,
+                                        unit=time_unit),
+                            format_time(innerop_node.max_general_gpu_time,
+                                        unit=time_unit),
+                            format_time(innerop_node.min_general_gpu_time,
+                                        unit=time_unit),
                             format_ratio(gpu_ratio))
                     ]
                     all_row_values.append(row_values)
@@ -1148,8 +1132,8 @@ def _build_table(statistic_data,
                             gpu_ratio = 0
                         else:
                             gpu_ratio = float(
-                                device_node.
-                                gpu_time) / innerop_node.general_gpu_time
+                                device_node.gpu_time
+                            ) / innerop_node.general_gpu_time
                         if len(device_node_name) + 4 > name_column_width:
                             device_node_name = device_node_name[:
                                                                 name_column_width
@@ -1159,17 +1143,14 @@ def _build_table(statistic_data,
                             '    {}'.format(device_node_name),
                             device_node.call, '- / - / - / - / -',
                             '{} / {} / {} / {} / {}'.format(
-                                format_time(
-                                    device_node.gpu_time, unit=time_unit),
-                                format_time(
-                                    device_node.avg_gpu_time,
-                                    unit=time_unit),
-                                format_time(
-                                    device_node.max_gpu_time,
-                                    unit=time_unit),
-                                format_time(
-                                    device_node.min_gpu_time,
-                                    unit=time_unit),
+                                format_time(device_node.gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.avg_gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.max_gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.min_gpu_time,
+                                            unit=time_unit),
                                 format_ratio(gpu_ratio))
                         ]
                         all_row_values.append(row_values)
@@ -1188,14 +1169,14 @@ def _build_table(statistic_data,
                     '  {}'.format(device_node_name), device_node.call,
                     '- / - / - / - / -',
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            device_node.gpu_time, unit=time_unit),
-                        format_time(
-                            device_node.avg_gpu_time, unit=time_unit),
-                        format_time(
-                            device_node.max_gpu_time, unit=time_unit),
-                        format_time(
-                            device_node.min_gpu_time, unit=time_unit),
+                        format_time(device_node.gpu_time,
+                                    unit=time_unit),
+                        format_time(device_node.avg_gpu_time,
+                                    unit=time_unit),
+                        format_time(device_node.max_gpu_time,
+                                    unit=time_unit),
+                        format_time(device_node.min_gpu_time,
+                                    unit=time_unit),
                         format_ratio(gpu_ratio))
                 ]
                 all_row_values.append(row_values)
@@ -1249,21 +1230,20 @@ def _build_table(statistic_data,
     all_row_values = []
     kernel_items = statistic_data.event_summary.kernel_items
     if sorted_by == SortedKeys.GPUAvg:
-        sorted_items = sorted(
-            kernel_items.items(),
-            key=lambda x: x[1].avg_gpu_time,
-            reverse=True)
+        sorted_items = sorted(kernel_items.items(),
+                              key=lambda x: x[1].avg_gpu_time,
+                              reverse=True)
     elif sorted_by == SortedKeys.GPUMax:
-        sorted_items = sorted(
-            kernel_items.items(),
-            key=lambda x: x[1].max_gpu_time,
-            reverse=True)
+        sorted_items = sorted(kernel_items.items(),
+                              key=lambda x: x[1].max_gpu_time,
+                              reverse=True)
     elif sorted_by == SortedKeys.GPUMin:
-        sorted_items = sorted(
-            kernel_items.items(), key=lambda x: x[1].min_gpu_time)
+        sorted_items = sorted(kernel_items.items(),
+                              key=lambda x: x[1].min_gpu_time)
     else:
-        sorted_items = sorted(
-            kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+        sorted_items = sorted(kernel_items.items(),
+                              key=lambda x: x[1].gpu_time,
+                              reverse=True)

     total_kernel_gpu_time = 0
     for name, item in sorted_items:
@@ -1277,14 +1257,10 @@ def _build_table(statistic_data,
             name,
             item.call,
             '{} / {} / {} / {} / {}'.format(
-                format_time(
-                    item.gpu_time, unit=time_unit),
-                format_time(
-                    item.avg_gpu_time, unit=time_unit),
-                format_time(
-                    item.max_gpu_time, unit=time_unit),
-                format_time(
-                    item.min_gpu_time, unit=time_unit),
+                format_time(item.gpu_time, unit=time_unit),
+                format_time(item.avg_gpu_time, unit=time_unit),
+                format_time(item.max_gpu_time, unit=time_unit),
+                format_time(item.min_gpu_time, unit=time_unit),
                 format_ratio(gpu_ratio)),
         ]
         all_row_values.append(row_values)
@@ -1349,24 +1325,16 @@ def _build_table(statistic_data,
             name,
            item.call,
             '{} / {} / {} / {} / {}'.format(
-                format_time(
-                    item.cpu_time, unit=time_unit),
-                format_time(
-                    item.avg_cpu_time, unit=time_unit),
-                format_time(
-                    item.max_cpu_time, unit=time_unit),
-                format_time(
-                    item.min_cpu_time, unit=time_unit),
+                format_time(item.cpu_time, unit=time_unit),
+                format_time(item.avg_cpu_time, unit=time_unit),
+                format_time(item.max_cpu_time, unit=time_unit),
+                format_time(item.min_cpu_time, unit=time_unit),
                 format_ratio(float(item.cpu_time) / total_time)),
             '{} / {} / {} / {} / {}'.format(
-                format_time(
-                    item.general_gpu_time, unit=time_unit),
-                format_time(
-                    item.avg_general_gpu_time, unit=time_unit),
-                format_time(
-                    item.max_general_gpu_time, unit=time_unit),
-                format_time(
-                    item.min_general_gpu_time, unit=time_unit),
+                format_time(item.general_gpu_time, unit=time_unit),
+                format_time(item.avg_general_gpu_time, unit=time_unit),
+                format_time(item.max_general_gpu_time, unit=time_unit),
+                format_time(item.min_general_gpu_time, unit=time_unit),
                 format_ratio(gpu_ratio)),
         ]
         all_row_values.append(row_values)
@@ -1429,39 +1397,35 @@ def _build_table(statistic_data,
     for thread_id, items in userdefined_thread_items.items():
         all_row_values.append("Thread: {}".format(thread_id))
         if sorted_by == SortedKeys.CPUTotal:
-            sorted_items = sorted(
-                items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].cpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.CPUAvg:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].avg_cpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].avg_cpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.CPUMax:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].max_cpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].max_cpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.CPUMin:
-            sorted_items = sorted(
-                items.items(), key=lambda x: x[1].min_cpu_time)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].min_cpu_time)
         elif sorted_by == SortedKeys.GPUTotal:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].general_gpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].general_gpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.GPUAvg:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].avg_general_gpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].avg_general_gpu_time,
+                                  reverse=True)
        elif sorted_by == SortedKeys.GPUMax:
-            sorted_items = sorted(
-                items.items(),
-                key=lambda x: x[1].max_general_gpu_time,
-                reverse=True)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].max_general_gpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.GPUMin:
-            sorted_items = sorted(
-                items.items(), key=lambda x: x[1].min_general_gpu_time)
+            sorted_items = sorted(items.items(),
+                                  key=lambda x: x[1].min_general_gpu_time)

         for name, item in sorted_items:
             if gpu_total_time == 0:
@@ -1472,24 +1436,16 @@ def _build_table(statistic_data,
                 name,
                 item.call,
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.cpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_cpu_time, unit=time_unit),
-                    format_time(
-                        item.max_cpu_time, unit=time_unit),
-                    format_time(
-                        item.min_cpu_time, unit=time_unit),
+                    format_time(item.cpu_time, unit=time_unit),
+                    format_time(item.avg_cpu_time, unit=time_unit),
+                    format_time(item.max_cpu_time, unit=time_unit),
+                    format_time(item.min_cpu_time, unit=time_unit),
                     format_ratio(float(item.cpu_time) / total_time)),
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.max_general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.min_general_gpu_time, unit=time_unit),
+                    format_time(item.general_gpu_time, unit=time_unit),
+                    format_time(item.avg_general_gpu_time, unit=time_unit),
+                    format_time(item.max_general_gpu_time, unit=time_unit),
+                    format_time(item.min_general_gpu_time, unit=time_unit),
                     format_ratio(gpu_ratio)),
             ]
             all_row_values.append(row_values)
...