Unverified commit 00ecb98f authored by Jiabin Yang, committed by GitHub

support custom operator run in double grad mode (#42653)

Parent 6c696db1
......@@ -77,7 +77,8 @@ class Controller {
op_meta_info_map_.insert(map.begin(), map.end());
}
std::unordered_map<std::string, std::vector<std::unordered_map<int, int>>>&
std::unordered_map<std::string,
std::vector<std::vector<std::unordered_map<int, int>>>>&
GetCustomEdgesSlotMap() {
return custom_edges_slot_map_;
}
......@@ -89,8 +90,10 @@ class Controller {
new paddle::imperative::Tracer()};
std::unordered_map<std::string, std::vector<paddle::OpMetaInfo>>
op_meta_info_map_;
/* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/
std::unordered_map<std::string, std::vector<std::unordered_map<int, int>>>
/* op_type : {{{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}},
* {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}}*/
std::unordered_map<std::string,
std::vector<std::vector<std::unordered_map<int, int>>>>
custom_edges_slot_map_;
DISABLE_COPY_AND_ASSIGN(Controller);
};
......
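The new `custom_edges_slot_map_` layout adds one level of nesting: each op_type now stores one group of five slot maps per grad level, with index 0 used by the first-order grad node and index 1 by the double-grad node. A minimal standalone sketch of that shape, using standard C++ only; the op name "custom_tanh" and the slot values are hypothetical:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// One group of five {slot -> slot} maps:
// {grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}
using SlotMaps = std::vector<std::unordered_map<int, int>>;

int main() {
  std::unordered_map<std::string, std::vector<SlotMaps>> slot_map;
  slot_map["custom_tanh"].emplace_back(5);  // level 0: first-order grad
  slot_map["custom_tanh"].emplace_back(5);  // level 1: double grad
  // Record that forward input 0 produces grad output 0 of the double-grad op:
  slot_map["custom_tanh"][1][0][0] = 0;
  std::cout << slot_map["custom_tanh"][1][0].at(0) << std::endl;  // prints 0
  return 0;
}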
......@@ -698,8 +698,6 @@ std::vector<paddle::experimental::Tensor> RunBackward(
}
}
VLOG(6) << "Running GradNode:" << node->name();
// Check input
EnforceGradNodeHasInput(node);
......
......@@ -15,10 +15,151 @@
#include "paddle/fluid/eager/custom_operator/custom_operator_node.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/op_meta_info_helper.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/core/dense_tensor.h"
namespace egr {
static void ConstructFwdAndBwdMap(
const std::vector<paddle::OpMetaInfo>& vec_map,
const std::string& op_type) {
auto& in_out_map = egr::Controller::Instance().GetCustomEdgesSlotMap();
if (in_out_map.find(op_type) != in_out_map.end()) {
if (in_out_map[op_type].size() == 2) {
VLOG(7) << "Find Exist CustomEdgesSlotMap Skip >>>> ";
return;
}
}
VLOG(7) << "Construct DoubleGrad's CustomEdgesSlotMap ";
auto inputs_names =
paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[1]);
auto outputs_names =
paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[1]);
auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]);
auto grad_outputs_names =
paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]);
auto grad_inputs_names =
paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]);
auto grad_attrs_names =
paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[2]);
std::vector<std::unordered_map<int, int>> res(5);
in_out_map[op_type].push_back(res);
// Prepare pos map for grad_outputs
VLOG(7) << "Prepare pos map for grad_outputs";
PADDLE_ENFORCE_LE(
grad_outputs_names.size(), inputs_names.size(),
paddle::platform::errors::InvalidArgument(
"Grad outputs num should be less equal than forward inputs num."));
for (size_t i = 0; i < grad_outputs_names.size(); i++) {
auto end = grad_outputs_names[i].find("@GRAD@GRAD");
if (end != std::string::npos) {
for (size_t j = 0; j < inputs_names.size(); j++) {
if (grad_outputs_names[i].substr(0, end + 5) == inputs_names[j]) {
VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
<< "'s No." << j << " inputs: " << inputs_names[j]
<< " related to No." << i
<< " grad_outputs: " << grad_outputs_names[i];
in_out_map[op_type][1][0][j] = i;
}
}
} else {
size_t end_n = grad_outputs_names[i].find("@GRAD@NEW");
if (end_n != std::string::npos) {
for (size_t j = 0; j < inputs_names.size(); j++) {
if (grad_outputs_names[i].substr(0, end_n) == inputs_names[j]) {
VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
<< "'s No." << j << " inputs: " << inputs_names[j]
<< " related to No." << i
<< " grad_outputs: " << grad_outputs_names[i];
in_out_map[op_type][1][0][j] = i;
}
}
} else {
size_t end_one_grad = grad_outputs_names[i].find("@GRAD");
if (end_one_grad != std::string::npos) {
for (size_t j = 0; j < inputs_names.size(); j++) {
if (grad_outputs_names[i].substr(0, end_one_grad) ==
inputs_names[j]) {
VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
<< "'s No." << j << " inputs: " << inputs_names[j]
<< " related to No." << i
<< " grad_outputs: " << grad_outputs_names[i];
in_out_map[op_type][1][0][j] = i;
}
}
} else {
PADDLE_THROW(paddle::platform::errors::NotFound(
"All Grad outputs should be end of @GRAD@GRAD or @GRAD@NEW or "
"@GRAD and we got %s is not one of them, "
"please check your op and change to fit the rule.",
grad_outputs_names[i]));
}
}
}
}
// Prepare pos map for grad_inputs
for (size_t i = 0; i < grad_inputs_names.size(); i++) {
size_t end = grad_inputs_names[i].find("@GRAD@GRAD");
if (end != std::string::npos) {
for (size_t j = 0; j < outputs_names.size(); j++) {
if (grad_inputs_names[i].substr(0, end + 5) == outputs_names[j]) {
VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
<< "'s No." << j << " outputs: " << outputs_names[j]
<< " related to No." << i
<< " grad_inputs's grad: " << grad_inputs_names[i];
in_out_map[op_type][1][1][j] = i;
}
}
} else {
if (std::find(outputs_names.begin(), outputs_names.end(),
grad_inputs_names[i]) != outputs_names.end()) {
for (size_t j = 0; j < outputs_names.size(); j++) {
if (grad_inputs_names[i] == outputs_names[j]) {
VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
<< "'s No." << j << " outputs: " << outputs_names[j]
<< " related to No." << i
<< " grad_inputs fwd outputs: " << grad_inputs_names[i];
in_out_map[op_type][1][2][j] = i;
}
}
} else {
for (size_t j = 0; j < inputs_names.size(); j++) {
if (grad_inputs_names[i] == inputs_names[j]) {
VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
<< "'s No." << j << " inputs: " << inputs_names[j]
<< " related to No." << i
<< " grad_inputs fwd inputs: " << grad_inputs_names[i];
in_out_map[op_type][1][3][j] = i;
}
}
}
}
}
// Prepare pos map for grad_attrs
for (size_t i = 0; i < grad_attrs_names.size(); i++) {
auto end =
std::find(attrs_names.begin(), attrs_names.end(), grad_attrs_names[i]);
PADDLE_ENFORCE_NE(end, attrs_names.end(),
paddle::platform::errors::NotFound(
"All Grad attrs should be one of forward attrs and "
"we got %s is not one of them, please check your "
"op and change to fit the rule.",
grad_attrs_names[i]));
for (size_t j = 0; j < attrs_names.size(); j++) {
if (grad_attrs_names[i] == attrs_names[j]) {
VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
<< "'s No." << j << " attrs: " << attrs_names[j]
<< " related to No." << i
<< " grad_attrs: " << grad_attrs_names[i];
in_out_map[op_type][1][4][j] = i;
}
}
}
}
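The matching in ConstructFwdAndBwdMap above relies purely on a naming convention: a double-grad output named "X@GRAD@GRAD" is matched to the grad-op input "X@GRAD" (hence the substr(0, end + 5)), while "X@GRAD@NEW" and "X@GRAD" are matched to "X". A minimal standalone sketch of that suffix rule, standard C++ only; "X" is a placeholder name:

#include <iostream>
#include <string>

// Returns the input name a grad-output name refers to, following the suffix
// convention used above; returns "" if the name carries no @GRAD suffix.
std::string MatchedInputName(const std::string& grad_out) {
  size_t pos = grad_out.find("@GRAD@GRAD");
  if (pos != std::string::npos) return grad_out.substr(0, pos + 5);  // keep "@GRAD"
  pos = grad_out.find("@GRAD@NEW");
  if (pos != std::string::npos) return grad_out.substr(0, pos);
  pos = grad_out.find("@GRAD");
  if (pos != std::string::npos) return grad_out.substr(0, pos);
  return "";
}

int main() {
  std::cout << MatchedInputName("X@GRAD@GRAD") << std::endl;  // X@GRAD
  std::cout << MatchedInputName("X@GRAD@NEW") << std::endl;   // X
  std::cout << MatchedInputName("X@GRAD") << std::endl;       // X
  return 0;
}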
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
kSlotSmallVectorSize>
RunCustomOpNode::operator()(
......@@ -38,10 +179,11 @@ RunCustomOpNode::operator()(
tmp_ins(grad_inputs_name.size());
VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size()
<< ", whose grad_inputs_name size is: " << grad_inputs_name.size();
for (size_t i = 0; i < grads.size(); i++) {
if (map[1].find(i) != map[1].end()) {
VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i];
tmp_ins[map[1][i]] = grads[i];
auto hooked_grads = ApplyGradientHooks(grads);
for (size_t i = 0; i < hooked_grads.size(); i++) {
if (map[0][1].find(i) != map[0][1].end()) {
VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[0][1][i];
tmp_ins[map[0][1][i]] = hooked_grads[i];
}
}
......@@ -69,28 +211,218 @@ RunCustomOpNode::operator()(
tmp_outs(grad_outputs_names.size());
VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
for (size_t i = 0; i < OutputMeta().size(); i++) {
if (map[0].find(i) != map[0].end()) {
if (map[0][0].find(i) != map[0][0].end()) {
VLOG(7) << "Insert grad outputs: " << i
<< " with size: " << OutputMeta()[i].size()
<< " to tmp_outputs: " << map[0][i];
<< " to tmp_outputs: " << map[0][0][i];
for (size_t j = 0; j < OutputMeta()[i].size(); j++) {
outs[i].emplace_back(/* init it in case of copying a nullptr shared_ptr */
std::make_shared<phi::DenseTensor>(
phi::DataType::UNDEFINED),
egr::Controller::Instance().GenerateUniqueName(
"custom_tmp_grad"));
egr::EagerUtils::autograd_meta(&(outs[i][j]));
}
tmp_outs[map[0][i]] = outs[i];
tmp_outs[map[0][0][i]] = outs[i];
}
}
for (size_t i = 0; i < tmp_outs.size(); i++) {
VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size();
ctx.EmplaceBackOutputs(tmp_outs[i]);
}
VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_;
VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad";
(*paddle::framework::OpMetaInfoHelper::GetKernelFn(
kernel_map.at(op_type_)[1]))(&ctx);
VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op";
std::vector<std::vector<egr::AutogradMeta*>> ins_auto_grad_metas;
std::vector<std::vector<egr::AutogradMeta*>> outs_auto_grad_metas;
VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size();
ins_auto_grad_metas.resize(ctx.InputRange().size());
VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size();
outs_auto_grad_metas.resize(ctx.OutputRange().size());
for (size_t i = 0; i < ctx.InputRange().size(); i++) {
ins_auto_grad_metas[i] =
egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween(
ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second));
}
for (size_t i = 0; i < ctx.OutputRange().size(); i++) {
outs_auto_grad_metas[i] =
egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen(
ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second));
}
bool require_any_grad = false;
bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;
for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) {
require_any_grad =
require_any_grad || egr::EagerUtils::ComputeRequireGrad(
trace_backward, &(ins_auto_grad_metas[i]));
}
if (require_any_grad) {
auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
const auto& vec_map = meta_info_map.at(op_type_);
paddle::platform::RecordEvent node_creation_record_event(
"Custom Op " + op_type_ + " double_grad node_creation",
paddle::platform::TracerEventType::OperatorInner, 1);
VLOG(6) << " Construct Grad for Custom Op: " << op_type_;
ConstructFwdAndBwdMap(vec_map, op_type_);
for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) {
egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i]));
}
auto grad_node = std::make_shared<egr::RunCustomOpDoubleGradNode>(
outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type_);
auto slot_map =
egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
// Prepare Grad outputs
size_t no_grad_cnt = 0;
for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) {
const std::vector<paddle::experimental::Tensor>& in_tensors =
ctx.InputsBetween(ctx.InputRangeAt(i).first,
ctx.InputRangeAt(i).second);
if (slot_map[1][0].find(i) != slot_map[1][0].end()) {
grad_node->SetGradOutMeta(in_tensors, slot_map[1][0][i]);
} else {
grad_node->SetGradOutMeta(in_tensors,
ins_auto_grad_metas.size() - 1 - no_grad_cnt);
no_grad_cnt++;
}
}
// Prepare Grad inputs with grad of fwd outputs
for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) {
const std::vector<paddle::experimental::Tensor>& out_tensors =
ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first,
ctx.OutputRangeAt(i).second);
egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i);
egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node);
grad_node->SetGradInMeta(out_tensors, i);
egr::EagerUtils::CheckAndRetainGrad(out_tensors);
}
// Prepare Grad inputs with fwd outputs
for (auto it = slot_map[1][2].begin(); it != slot_map[1][2].end(); it++) {
VLOG(7) << "Prepare fwd_outs: " << it->first
<< " to grad_inputs: " << it->second;
grad_node->fwd_outs[it->second] =
egr::RunCustomOpNode::ConstructTensorWrapper(
ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first,
ctx.OutputRangeAt(it->first).second));
}
// Prepare Grad inputs with fwd inputs
for (auto it = slot_map[1][3].begin(); it != slot_map[1][3].end(); it++) {
VLOG(7) << "Prepare fwd_ins: " << it->first
<< " to grad_inputs: " << it->second;
grad_node->fwd_ins[it->second] =
egr::RunCustomOpNode::ConstructTensorWrapper(
ctx.InputsBetween(ctx.InputRangeAt(it->first).first,
ctx.InputRangeAt(it->first).second));
}
auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(
meta_info_map.at(op_type_)[2]);
std::vector<paddle::any> attrs(attrs_names.size());
// Prepare attrs for Grad node
for (auto it = slot_map[1][4].begin(); it != slot_map[1][4].end(); it++) {
VLOG(7) << "Prepare fwd attrs: " << it->first
<< " to grad_attrs: " << it->second;
attrs[it->second] = attrs_[it->first];
}
grad_node->SetAttrs(attrs);
}
return outs;
}
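In the node-creation branch above, grad-output slots for the new double-grad node are assigned in two ways: inputs that appear in slot_map[1][0] keep their mapped slot, and the remaining no-grad inputs are packed backwards from the last slot via no_grad_cnt. A standalone sketch of just that placement logic, standard C++ only; the map contents are hypothetical:

#include <iostream>
#include <unordered_map>
#include <vector>

int main() {
  const size_t num_inputs = 4;
  // Hypothetical slot_map[1][0]: inputs 0 and 2 have mapped grad-output slots.
  std::unordered_map<int, int> mapped = {{0, 0}, {2, 1}};
  std::vector<int> slot_of_input(num_inputs);
  size_t no_grad_cnt = 0;
  for (size_t i = 0; i < num_inputs; i++) {
    auto it = mapped.find(static_cast<int>(i));
    if (it != mapped.end()) {
      slot_of_input[i] = it->second;
    } else {
      // No mapped slot: place it at the tail, counting back from the last slot.
      slot_of_input[i] = static_cast<int>(num_inputs - 1 - no_grad_cnt);
      no_grad_cnt++;
    }
  }
  for (size_t i = 0; i < num_inputs; i++) {
    std::cout << "input " << i << " -> grad-output slot " << slot_of_input[i]
              << std::endl;  // 0->0, 1->3, 2->1, 3->2
  }
  return 0;
}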
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
kSlotSmallVectorSize>
RunCustomOpDoubleGradNode::operator()(
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
kSlotSmallVectorSize>& grads,
bool create_graph, bool is_new_grad) { // NOLINT
paddle::CustomOpKernelContext ctx;
auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
const auto& vec_map = meta_info_map.at(op_type_);
auto grad_inputs_name =
paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]);
auto grad_outputs_names =
paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]);
auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap();
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
kSlotSmallVectorSize>
tmp_ins(grad_inputs_name.size());
VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size()
<< ", whose grad_inputs_name size is: " << grad_inputs_name.size();
auto hooked_grads = ApplyGradientHooks(grads);
for (size_t i = 0; i < hooked_grads.size(); i++) {
if (map[1][1].find(i) != map[1][1].end()) {
VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][1][i];
tmp_ins[map[1][1][i]] = hooked_grads[i];
}
}
for (auto it : fwd_outs) {
VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first;
tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second));
}
for (auto it : fwd_ins) {
VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first;
tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second));
}
VLOG(6) << "Prepare Grad inputs";
for (const auto& in : tmp_ins) {
ctx.EmplaceBackInputs(in);
}
VLOG(6) << "Prepare Grad attrs";
ctx.EmplaceBackAttrs(attrs_);
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
kSlotSmallVectorSize>
outs(OutputMeta().size());
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
kSlotSmallVectorSize>
tmp_outs(grad_outputs_names.size());
VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
for (const auto& name : grad_outputs_names) {
VLOG(6) << "Prepare Grad outputs name is: " << name;
}
for (size_t i = 0; i < OutputMeta().size(); i++) {
if (map[1][0].find(i) != map[1][0].end()) {
VLOG(7) << "Insert grad outputs: " << i
<< " with size: " << OutputMeta()[i].size()
<< " to tmp_outputs: " << map[1][0][i];
for (size_t j = 0; j < OutputMeta()[i].size(); j++) {
outs[i].emplace_back(/* init it in case of copying a nullptr shared_ptr */
std::make_shared<phi::DenseTensor>(
phi::DataType::UNDEFINED),
egr::Controller::Instance().GenerateUniqueName(
"custom_tmp_grad"));
}
tmp_outs[map[1][0][i]] = outs[i];
}
}
for (size_t i = 0; i < tmp_outs.size(); i++) {
VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size();
ctx.EmplaceBackOutputs(tmp_outs[i]);
}
VLOG(7) << "Run Kernel of Grad Custom Op: " << name();
(*paddle::framework::OpMetaInfoHelper::GetKernelFn(
kernel_map.at(op_type_)[2]))(&ctx);
return outs;
}
} // namespace egr
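RunCustomOpDoubleGradNode::operator() above fills the double-grad kernel's input slots from three sources: the hooked incoming grads (via map[1][1]), the forward outputs saved in fwd_outs, and the forward inputs saved in fwd_ins. A standalone sketch of that assembly by slot index, standard C++ only; strings stand in for tensors and all names and slot numbers are hypothetical:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::string> incoming_grads = {"ddOut"};           // hooked grads
  std::unordered_map<int, int> grads_to_slot = {{0, 1}};         // like map[1][1]
  std::unordered_map<int, std::string> fwd_outs = {{2, "Out"}};  // slot -> saved fwd output
  std::unordered_map<int, std::string> fwd_ins = {{0, "X"}};     // slot -> saved fwd input

  std::vector<std::string> tmp_ins(3);
  for (size_t i = 0; i < incoming_grads.size(); i++) {
    auto it = grads_to_slot.find(static_cast<int>(i));
    if (it != grads_to_slot.end()) tmp_ins[it->second] = incoming_grads[i];
  }
  for (const auto& it : fwd_outs) tmp_ins[it.first] = it.second;
  for (const auto& it : fwd_ins) tmp_ins[it.first] = it.second;

  for (size_t i = 0; i < tmp_ins.size(); i++) {
    std::cout << "double-grad kernel input slot " << i << ": " << tmp_ins[i]
              << std::endl;  // slot 0: X, slot 1: ddOut, slot 2: Out
  }
  return 0;
}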
......@@ -67,7 +67,11 @@ class RunCustomOpNode : public GradNodeBase {
return res;
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
void ClearTensorWrappers() override {
fwd_outs.clear();
fwd_ins.clear();
grads2grad_in_map.clear();
}
void SetAttrs(const std::vector<paddle::any>& attr) { attrs_ = attr; }
......@@ -87,4 +91,75 @@ class RunCustomOpNode : public GradNodeBase {
std::string op_type_{""};
};
class RunCustomOpDoubleGradNode : public GradNodeBase {
public:
// Constructor: configure fwd input tensors to grad node
explicit RunCustomOpDoubleGradNode(size_t bwd_in_slot_num,
size_t bwd_out_slot_num,
const std::string& op_type)
: GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) {
VLOG(6) << "Construct RunCustomOpDoubleGradNode for op: " << op_type;
}
~RunCustomOpDoubleGradNode() override {
VLOG(6) << "Destruct RunCustomOpDoubleGradNode for op: " << op_type_;
}
// Functor: perform backward computations
virtual paddle::small_vector<std::vector<paddle::experimental::Tensor>,
kSlotSmallVectorSize>
operator()( // NOLINT
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
kSlotSmallVectorSize>& grads, // NOLINT
bool create_graph = false,
bool is_new_grad = false) // NOLINT
override;
std::string name() {
return paddle::string::Sprintf("RunCustomOpDoubleGradNode: %s_grad_grad",
op_type_);
}
static std::vector<egr::TensorWrapper> ConstructTensorWrapper(
const std::vector<paddle::experimental::Tensor>& fwd_var) {
std::vector<egr::TensorWrapper> res;
for (auto const& var : fwd_var) {
res.emplace_back(var);
}
return res;
}
static std::vector<paddle::experimental::Tensor> Recover(
std::vector<egr::TensorWrapper>* fwd_var) {
std::vector<paddle::experimental::Tensor> res;
for (size_t i = 0; i < fwd_var->size(); i++) {
res.emplace_back(fwd_var->at(i).recover());
}
return res;
}
void ClearTensorWrappers() override {
fwd_outs.clear();
fwd_ins.clear();
grads2grad_in_map.clear();
}
void SetAttrs(const std::vector<paddle::any>& attr) { attrs_ = attr; }
std::shared_ptr<GradNodeBase> Copy() const override {
auto copied_node = std::shared_ptr<RunCustomOpDoubleGradNode>(
new RunCustomOpDoubleGradNode(*this));
return copied_node;
}
public:
std::unordered_map<int, std::vector<egr::TensorWrapper>> fwd_outs;
std::unordered_map<int, std::vector<egr::TensorWrapper>> fwd_ins;
std::unordered_map<int, int> grads2grad_in_map;
private:
std::vector<paddle::any> attrs_;
std::string op_type_{""};
};
} // namespace egr
......@@ -119,18 +119,24 @@ class TensorWrapper {
paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_;
std::shared_ptr<GradNodeBase> new_grad_node = weak_grad_node_.lock();
auto* intermediate_autograd_meta =
EagerUtils::unsafe_autograd_meta(intermidiate_tensor_);
auto p_ab_autograd_meta =
std::make_shared<AutogradMeta>(*intermediate_autograd_meta);
if (new_grad_node) {
VLOG(3) << "Recovered TensorWrapper with GradNode "
<< new_grad_node->name() << " addr: " << new_grad_node.get();
p_ab_autograd_meta->SetGradNode(new_grad_node);
} else {
VLOG(3) << "Recovered TensorWrapper with Empth GradNode";
VLOG(3) << "Recovered TensorWrapper with Empty GradNode";
}
auto* intermediate_autograd_meta =
EagerUtils::nullable_autograd_meta(intermidiate_tensor_);
if (intermediate_autograd_meta) {
auto p_ab_autograd_meta =
std::make_shared<AutogradMeta>(*intermediate_autograd_meta);
if (new_grad_node) {
p_ab_autograd_meta->SetGradNode(new_grad_node);
}
recovered_tensor.set_autograd_meta(p_ab_autograd_meta);
}
recovered_tensor.set_autograd_meta(p_ab_autograd_meta);
return recovered_tensor;
}
}
......
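The recover() change above swaps the unconditional unsafe_autograd_meta access for nullable_autograd_meta, so a wrapped tensor that carries no autograd info is returned as-is instead of tripping the unsafe lookup. A standalone sketch of that guard pattern, standard C++ only; Tensor and Meta are hypothetical stand-ins, not Paddle types:

#include <iostream>
#include <memory>

struct Meta { int slot = 0; };                  // hypothetical autograd meta
struct Tensor { std::shared_ptr<Meta> meta; };  // hypothetical tensor

Tensor Recover(const Tensor& wrapped, const std::shared_ptr<Meta>& graph_info) {
  Tensor recovered = wrapped;
  if (wrapped.meta) {                           // nullable check, not unsafe access
    auto copied = std::make_shared<Meta>(*wrapped.meta);
    if (graph_info) copied->slot = graph_info->slot;
    recovered.meta = copied;
  }
  return recovered;                             // meta-less tensors pass through
}

int main() {
  Tensor plain;                                 // no autograd meta attached
  std::cout << (Recover(plain, nullptr).meta ? "has meta" : "no meta")
            << std::endl;                       // prints "no meta"
  return 0;
}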
......@@ -157,7 +157,7 @@ void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
if (autograd_meta->GradNode()) {
VLOG(7) << "Should not set grad node twice, original node is:"
<< autograd_meta->GradNode()->name()
<< "current is: " << grad_node->name();
<< " current is: " << grad_node->name();
}
autograd_meta->SetGradNode(grad_node);
}
......
......@@ -207,7 +207,8 @@ static void ConstructFwdAndBwdMap(
auto grad_attrs_names =
paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]);
std::vector<std::unordered_map<int, int>> res(5);
in_out_map.insert({op_type, res});
in_out_map.insert({op_type, {res}});
// Prepare pos map for grad_outputs
VLOG(7) << "Prepare pos map for grad_outputs";
PADDLE_ENFORCE_LE(
......@@ -227,7 +228,7 @@ static void ConstructFwdAndBwdMap(
VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j
<< " inputs: " << inputs_names[j] << " related to No." << i
<< " grad_outputs: " << grad_outputs_names[i];
in_out_map[op_type][0][j] = i;
in_out_map[op_type][0][0][j] = i;
}
}
}
......@@ -240,7 +241,7 @@ static void ConstructFwdAndBwdMap(
VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j
<< " outputs: " << outputs_names[j] << " related to No."
<< i << " grad_inputs's grad: " << grad_inputs_names[i];
in_out_map[op_type][1][j] = i;
in_out_map[op_type][0][1][j] = i;
}
}
} else {
......@@ -252,7 +253,7 @@ static void ConstructFwdAndBwdMap(
<< " outputs: " << outputs_names[j] << " related to No."
<< i
<< " grad_inputs fwd outputs: " << grad_inputs_names[i];
in_out_map[op_type][2][j] = i;
in_out_map[op_type][0][2][j] = i;
}
}
} else {
......@@ -262,7 +263,7 @@ static void ConstructFwdAndBwdMap(
<< " inputs: " << inputs_names[j] << " related to No."
<< i
<< " grad_inputs fwd inputs: " << grad_inputs_names[i];
in_out_map[op_type][3][j] = i;
in_out_map[op_type][0][3][j] = i;
}
}
}
......@@ -284,7 +285,7 @@ static void ConstructFwdAndBwdMap(
VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j
<< " attrs: " << attrs_names[j] << " related to No." << i
<< " grad_attrs: " << grad_attrs_names[i];
in_out_map[op_type][4][j] = i;
in_out_map[op_type][0][4][j] = i;
}
}
}
......@@ -402,8 +403,8 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
ctx.InputsBetween(ctx.InputRangeAt(i).first,
ctx.InputRangeAt(i).second);
if (slot_map[0].find(i) != slot_map[0].end()) {
grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]);
if (slot_map[0][0].find(i) != slot_map[0][0].end()) {
grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]);
} else {
grad_node->SetGradOutMeta(in_tensors,
ins_auto_grad_metas.size() - 1 - no_grad_cnt);
......@@ -423,7 +424,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
}
// Prepare Grad inputs with fwd outputs
for (auto it = slot_map[2].begin(); it != slot_map[2].end(); it++) {
for (auto it = slot_map[0][2].begin(); it != slot_map[0][2].end(); it++) {
VLOG(7) << "Prepare fwd_outs: " << it->first
<< " to grad_inputs: " << it->second;
grad_node->fwd_outs[it->second] =
......@@ -433,7 +434,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
}
// Prepare Grad inputs with fwd inputs
for (auto it = slot_map[3].begin(); it != slot_map[3].end(); it++) {
for (auto it = slot_map[0][3].begin(); it != slot_map[0][3].end(); it++) {
VLOG(7) << "Prepare fwd_ins: " << it->first
<< " to grad_inputs: " << it->second;
grad_node->fwd_ins[it->second] =
......@@ -446,7 +447,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
meta_info_map.at(op_type)[1]);
std::vector<paddle::any> attrs(attrs_names.size());
// Prepare attrs for Grad node
for (auto it = slot_map[4].begin(); it != slot_map[4].end(); it++) {
for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) {
VLOG(7) << "Prepare fwd attrs: " << it->first
<< " to grad_attrs: " << it->second;
attrs[it->second] = res_attrs[it->first];
......
......@@ -21,8 +21,7 @@ import paddle.static as static
from paddle.utils.cpp_extension import load, get_build_directory
from paddle.utils.cpp_extension.extension_utils import run_cmd
from utils import paddle_includes, extra_cc_args, extra_nvcc_args
from paddle.fluid.framework import _test_eager_guard, _enable_legacy_dygraph
_enable_legacy_dygraph()
from paddle.fluid.framework import _test_eager_guard
# Because Windows don't use docker, the shared lib already exists in the
# cache dir, it will not be compiled again unless the shared lib is removed.
......@@ -64,7 +63,7 @@ class TestCustomTanhDoubleGradJit(unittest.TestCase):
self.dtypes = ['float32', 'float64']
self.devices = ['cpu']
def test_func_double_grad_dynamic(self):
def func_double_grad_dynamic(self):
for device in self.devices:
for dtype in self.dtypes:
x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
......@@ -85,6 +84,11 @@ class TestCustomTanhDoubleGradJit(unittest.TestCase):
"custom op out grad: {},\n paddle api out grad: {}".format(
dout, pd_dout))
def test_func_double_grad_dynamic(self):
with _test_eager_guard():
self.func_double_grad_dynamic()
self.func_double_grad_dynamic()
if __name__ == "__main__":
unittest.main()