Unverified commit 0afef498 authored by HongyuJia, committed by GitHub

[Opt CustomOP] Optimize the perf and impl of custom grad operator (#52915)

Parent 8a850af6
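For readers unfamiliar with the feature being optimized: a "custom grad operator" is the backward kernel a user registers next to a custom forward op through Paddle's C++ extension API; the eager nodes patched below (`RunCustomOpNode`, `RunCustomOpDoubleGradNode`) are what dispatch to that kernel during backward. The sketch below shows a minimal registration of such a pair; `custom_relu`, the kernel function names, and the float-only loops are illustrative placeholders, not code from this commit.

```cpp
#include <vector>
#include "paddle/extension.h"

// Forward kernel: out = max(x, 0). Float32 only, CPU only, for brevity.
std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
  auto out = paddle::empty_like(x);
  const float* x_data = x.data<float>();
  float* out_data = out.data<float>();
  for (int64_t i = 0; i < x.numel(); ++i) {
    out_data[i] = x_data[i] > 0.f ? x_data[i] : 0.f;
  }
  return {out};
}

// Backward kernel: grad_x = grad_out * (x > 0).
std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
                                         const paddle::Tensor& grad_out) {
  auto grad_x = paddle::empty_like(x);
  const float* x_data = x.data<float>();
  const float* go_data = grad_out.data<float>();
  float* gx_data = grad_x.data<float>();
  for (int64_t i = 0; i < x.numel(); ++i) {
    gx_data[i] = x_data[i] > 0.f ? go_data[i] : 0.f;
  }
  return {grad_x};
}

PD_BUILD_OP(custom_relu)
    .Inputs({"X"})
    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(ReluForward));

PD_BUILD_GRAD_OP(custom_relu)
    .Inputs({"X", paddle::Grad("Out")})
    .Outputs({paddle::Grad("X")})
    .SetKernelFn(PD_KERNEL(ReluBackward));
```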
@@ -15,6 +15,7 @@
 #include "paddle/fluid/eager/custom_operator/custom_operator_node.h"
 #include "paddle/fluid/framework/custom_operator.h"
+#include "paddle/fluid/framework/custom_operator_utils.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/phi/api/ext/op_meta_info.h"
 #include "paddle/phi/core/dense_tensor.h"
@@ -164,14 +165,16 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
     bool create_graph,
     bool is_new_grad) {  // NOLINT
   paddle::CustomOpKernelContext ctx;
-  auto grad_inputs_name = paddle::OpMetaInfoHelper::GetInputs(
-      egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
-  auto grad_outputs_names = paddle::OpMetaInfoHelper::GetOutputs(
-      egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
-  const auto& grad_inplace_map = paddle::OpMetaInfoHelper::GetInplaceMap(
-      egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
-  auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
-  auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap();
+  const auto& meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
+  const auto& vec_map = meta_info_map.at(op_type_);
+  const auto& grad_inputs_name =
+      paddle::OpMetaInfoHelper::GetInputs(vec_map[1]);
+  const auto& grad_outputs_names =
+      paddle::OpMetaInfoHelper::GetOutputs(vec_map[1]);
+  const auto& grad_inplace_map =
+      paddle::OpMetaInfoHelper::GetInplaceMap(vec_map[1]);
+  const auto& map =
+      egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
   paddle::small_vector<std::vector<paddle::Tensor>, kSlotSmallVectorSize>
       tmp_ins(grad_inputs_name.size());
@@ -180,8 +183,8 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
   auto hooked_grads = ApplyGradientHooks(grads);
   for (size_t i = 0; i < hooked_grads.size(); i++) {
     if (map[0][1].find(i) != map[0][1].end()) {
-      VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[0][1][i];
-      tmp_ins[map[0][1][i]] = hooked_grads[i];
+      VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[0][1].at(i);
+      tmp_ins[map[0][1].at(i)] = hooked_grads[i];
     }
   }
@@ -213,7 +216,7 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
   VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
   for (size_t i = 0; i < OutputMeta().size(); i++) {
     if (map[0][0].find(i) != map[0][0].end()) {
-      int grad_output_idx = map[0][0][i];
+      int grad_output_idx = map[0][0].at(i);
       VLOG(7) << "Insert grad outputs: " << i
               << " with size: " << OutputMeta()[grad_output_idx].size()
               << " to tmp_outputs: " << grad_output_idx;
@@ -238,58 +241,47 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
   // handle inplace map
   ctx.UpdatePlainOutputs(
       grad_inputs_name, grad_outputs_names, grad_inplace_map);
-  (*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[1]))(&ctx);
+  (*paddle::OpMetaInfoHelper::GetKernelFn(vec_map[1]))(&ctx);
   ctx.AssignInplaceOutputs();
   // handle optional None output when construct backward graph
   for (size_t i = 0; i < ctx.OutputRange().size(); i++) {
     if (ctx.OutputRangeAt(i).first + 1 == ctx.OutputRangeAt(i).second) {
-      size_t idx = ctx.OutputRangeAt(i).first;
-      paddle::Tensor* out_tensor = ctx.MutableOutputAt(idx);
+      paddle::Tensor* out_tensor =
+          ctx.MutableOutputAt(ctx.OutputRangeAt(i).first);
       if (!out_tensor->initialized()) {
-        PADDLE_ENFORCE(grad_outputs_names.at(idx).find(
-                           paddle::kOptionalSuffix) != std::string::npos,
-                       phi::errors::InvalidArgument(
-                           "Custom operator's %d-th output is not initialized. "
-                           "Please check your implementation again. If you are "
-                           "using inplace optional outputs, then you must use "
-                           "`paddle::Optional` to decorate this output",
-                           idx));
+        PADDLE_ENFORCE(
+            paddle::framework::detail::IsOptionalVar(grad_outputs_names.at(i)),
+            phi::errors::InvalidArgument(
+                "Custom grad operator's %d-th output is not initialized. "
+                "Please check your implementation again. If you are "
+                "using inplace optional outputs, then you must use "
+                "`paddle::Optional` to decorate this output",
+                i));
         // We can also consider using `autograd_meta` to tolerant nullptr.
         out_tensor->set_autograd_meta(std::make_shared<egr::AutogradMeta>());
       }
     }
   }
-  VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op";
-  std::vector<std::vector<egr::AutogradMeta*>> ins_auto_grad_metas;
-  std::vector<std::vector<egr::AutogradMeta*>> outs_auto_grad_metas;
-  VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size();
-  ins_auto_grad_metas.resize(ctx.InputRange().size());
-  VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size();
-  outs_auto_grad_metas.resize(ctx.OutputRange().size());
-
-  for (size_t i = 0; i < ctx.InputRange().size(); i++) {
-    ins_auto_grad_metas[i] =
-        egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween(
-            ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second));
-  }
-  for (size_t i = 0; i < ctx.OutputRange().size(); i++) {
-    outs_auto_grad_metas[i] =
-        egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen(
-            ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second));
-  }
+  VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom grad Op";
+  size_t slot_ins_num = ctx.InputRange().size();
+  size_t slot_outs_num = ctx.OutputRange().size();
+  VLOG(7) << "We got slot num of ins is: " << slot_ins_num;
+  VLOG(7) << "We got slot num of outs is: " << slot_outs_num;
+  std::vector<egr::AutogradMeta*> ins_auto_grad_metas =
+      egr::EagerUtils::nullable_autograd_meta(*ctx.AllMutableInput());
+  std::vector<egr::AutogradMeta*> outs_auto_grad_metas =
+      egr::EagerUtils::unsafe_autograd_meta(*ctx.AllMutableOutput());
   bool require_any_grad = false;
   bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;
   for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) {
     require_any_grad =
         require_any_grad || egr::EagerUtils::ComputeRequireGrad(
-                                trace_backward, &(ins_auto_grad_metas[i]));
+                                trace_backward, ins_auto_grad_metas[i]);
   }
-  auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
-  const auto& vec_map = meta_info_map.at(op_type_);
   if (require_any_grad && (vec_map.size() > 2)) {
     paddle::platform::RecordEvent node_creation_record_event(
         "Custom Op " + op_type_ + " double_grad node_creation",
@@ -298,34 +290,39 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
     VLOG(6) << " Construct Grad for Custom Op: " << op_type_;
     ConstructFwdAndBwdMap(vec_map, op_type_);
     for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) {
-      egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i]));
+      egr::EagerUtils::PassStopGradient(false, outs_auto_grad_metas[i]);
     }
+    // NOTE(HongyuJia): Does here needs to be consistent with forward process,
+    // PassStopGradient to ins_auto_grad_metas?
     auto grad_node = std::make_shared<egr::RunCustomOpDoubleGradNode>(
-        outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type_);
-    auto slot_map =
-        egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
+        slot_outs_num, slot_ins_num, op_type_);
+    const auto& slot_map = map;
     // Prepare Grad outputs
     size_t no_grad_cnt = 0;
-    for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) {
+    for (size_t i = 0; i < slot_ins_num; i++) {
       const std::vector<paddle::Tensor>& in_tensors = ctx.InputsBetween(
          ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second);
       if (slot_map[1][0].find(i) != slot_map[1][0].end()) {
-        grad_node->SetGradOutMeta(in_tensors, slot_map[1][0][i]);
+        grad_node->SetGradOutMeta(in_tensors, slot_map[1][0].at(i));
       } else {
-        grad_node->SetGradOutMeta(in_tensors,
-                                  ins_auto_grad_metas.size() - 1 - no_grad_cnt);
+        grad_node->SetGradOutMeta(in_tensors, slot_ins_num - 1 - no_grad_cnt);
         no_grad_cnt++;
       }
     }
     // Prepare Grad inputs with grad of fwd outputs
-    for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) {
-      const std::vector<paddle::Tensor>& out_tensors = ctx.OutputsBetweeen(
-          ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second);
-      egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i);
-      egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node);
+    for (size_t i = 0; i < slot_outs_num; i++) {
+      const auto& size_pair = ctx.OutputRangeAt(i);
+      const std::vector<paddle::Tensor>& out_tensors =
+          ctx.OutputsBetweeen(size_pair.first, size_pair.second);
+      for (size_t j = size_pair.first; j < size_pair.second; j++) {
+        // SetOutRankWithSlot: slot_id = i, rank = j - size_pair.first
+        outs_auto_grad_metas[j]->SetSingleOutRankWithSlot(i,
+                                                          j - size_pair.first);
+        egr::EagerUtils::SetHistory(outs_auto_grad_metas[j], grad_node);
+      }
       grad_node->SetGradInMeta(out_tensors, i);
     }
@@ -349,9 +346,7 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
                                      ctx.InputRangeAt(it->first).second));
     }
-    auto attrs_names =
-        paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type_)[2]);
-    std::vector<paddle::any> attrs(attrs_names.size());
+    std::vector<paddle::any> attrs(attrs_.size());
     // Prepare attrs for Grad node
     for (auto it = slot_map[1][4].begin(); it != slot_map[1][4].end(); it++) {
       VLOG(7) << "Prepare fwd attrs: " << it->first
@@ -371,14 +366,16 @@ RunCustomOpDoubleGradNode::operator()(
     bool create_graph,
     bool is_new_grad) {  // NOLINT
   paddle::CustomOpKernelContext ctx;
-  auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
+  const auto& meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
   const auto& vec_map = meta_info_map.at(op_type_);
-  auto grad_inputs_name = paddle::OpMetaInfoHelper::GetInputs(vec_map[2]);
-  auto grad_outputs_names = paddle::OpMetaInfoHelper::GetOutputs(vec_map[2]);
+  const auto& grad_inputs_name =
+      paddle::OpMetaInfoHelper::GetInputs(vec_map[2]);
+  const auto& grad_outputs_names =
+      paddle::OpMetaInfoHelper::GetOutputs(vec_map[2]);
   const auto& grad_inplace_map =
       paddle::OpMetaInfoHelper::GetInplaceMap(vec_map[2]);
-  auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
-  auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap();
+  const auto& map =
+      egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
   paddle::small_vector<std::vector<paddle::Tensor>, kSlotSmallVectorSize>
       tmp_ins(grad_inputs_name.size());
@@ -389,8 +386,8 @@ RunCustomOpDoubleGradNode::operator()(
   for (size_t i = 0; i < hooked_grads.size(); i++) {
     if (map[1][1].find(i) != map[1][1].end()) {
-      VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][1][i];
-      tmp_ins[map[1][1][i]] = hooked_grads[i];
+      VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][1].at(i);
+      tmp_ins[map[1][1].at(i)] = hooked_grads[i];
     }
   }
@@ -416,13 +413,9 @@ RunCustomOpDoubleGradNode::operator()(
       tmp_outs(grad_outputs_names.size());
   VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
-  for (const auto& name : grad_outputs_names) {
-    VLOG(6) << "Prepare Grad outputs name is: " << name;
-  }
   for (size_t i = 0; i < OutputMeta().size(); i++) {
     if (map[1][0].find(i) != map[1][0].end()) {
-      int grad_output_idx = map[1][0][i];
+      int grad_output_idx = map[1][0].at(i);
       VLOG(7) << "Insert grad outputs: " << i
               << " with size: " << OutputMeta()[grad_output_idx].size()
               << " to tmp_outputs: " << grad_output_idx;
@@ -441,12 +434,12 @@ RunCustomOpDoubleGradNode::operator()(
     VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size();
     ctx.EmplaceBackOutputs(tmp_outs[i]);
   }
-  VLOG(7) << "Run Kernel of Grad Custom Op: " << name();
+  VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad_grad";
   // handle inplace map
   ctx.UpdatePlainOutputs(
       grad_inputs_name, grad_outputs_names, grad_inplace_map);
-  (*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[2]))(&ctx);
+  (*paddle::OpMetaInfoHelper::GetKernelFn(vec_map[2]))(&ctx);
   ctx.AssignInplaceOutputs();
   return outs;
......
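A note on the perf side of this change: the old code bound the result of `egr::Controller::Instance().GetOpMetaInfoMap()` to plain `auto` locals (`meta_info_map`, `kernel_map`), which copies the whole registry map on every backward invocation if the getter returns a reference, and then repeated the `.at(op_type_)` lookup for every field. The new code binds with `const auto&` once and reuses the cached `vec_map` entry for the input names, output names, inplace map, and kernel function. The following standalone sketch illustrates only that binding pattern; `OpMetaInfo`, `MetaInfoMap`, and `GetOpMetaInfoMap` here are hypothetical stand-ins, not the real `egr::Controller` API.

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-ins for the real registry types; only the binding
// pattern is the point here.
struct OpMetaInfo {};
using MetaInfoMap = std::unordered_map<std::string, std::vector<OpMetaInfo>>;

// Assumed to return a reference to a long-lived registry, like the real getter.
const MetaInfoMap& GetOpMetaInfoMap() {
  static MetaInfoMap registry = {{"custom_relu", {OpMetaInfo{}, OpMetaInfo{}}}};
  return registry;
}

int main() {
  const std::string op_type = "custom_relu";

  // Old pattern: `auto` deep-copies the whole map each time this runs.
  auto copied_map = GetOpMetaInfoMap();
  const auto& vec_from_copy = copied_map.at(op_type);

  // New pattern: `const auto&` binds to the existing registry, and the
  // per-op vector is looked up once and reused for every later access.
  const auto& meta_info_map = GetOpMetaInfoMap();
  const auto& vec_map = meta_info_map.at(op_type);

  std::cout << vec_from_copy.size() << " " << vec_map.size() << std::endl;
  return 0;
}
```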