Fix higher order deriv with inplace (#44020)

* fix deriv with inplace
* fix double grad bugs
* remove additional file
* fix compat dygraph mode
* fix yaml remove additional yaml
* fix slice double grad error and auto code gen logic error for higher order differentiate
* fix fix_higher_order_deriv
* remove additional include
* fix fix_higher_order_deriv

Fix higher order deriv with inplace (#44020)
* fix deriv with inplace
* fix double grad bugs
* remove additional file
* fix compat dygraph mode
* fix yaml remove additional yaml
* fix slice double grad error and auto code gen logic error for higher order differentiate
* fix fix_higher_order_deriv
* remove additional include
* fix fix_higher_order_deriv
a7c98ddb · Jiabin Yang · GitHub · aa0c885a · a7c98ddb · a7c98ddb
21 changed files
--- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt
@@ -7,6 +7,6 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
  cc_library(
    final_dygraph_node
    SRCS nodes.cc
-    DEPS ${eager_deps})
+    DEPS ${eager_deps} ${eager_manual_nodes})
  add_dependencies(final_dygraph_node eager_final_state_codegen)
 endif()
--- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt
@@ -7,6 +7,6 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
  cc_library(
    final_dygraph_function
    SRCS dygraph_functions.cc
-    DEPS ${eager_deps})
+    DEPS ${eager_deps} ${eager_manual_functions})
  add_dependencies(final_dygraph_function eager_final_state_codegen)
 endif()
--- a/paddle/fluid/eager/api/manual/CMakeLists.txt
+++ b/paddle/fluid/eager/api/manual/CMakeLists.txt
@@ -6,4 +6,11 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
  set(fluid_manual_nodes
      ${fluid_manual_nodes}
      PARENT_SCOPE)
+  add_subdirectory(eager_manual)
+  set(eager_manual_functions
+      ${eager_manual_functions}
+      PARENT_SCOPE)
+  set(eager_manual_nodes
+      ${eager_manual_nodes}
+      PARENT_SCOPE)
 endif()
--- a/paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt
+++ b/paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt
+# Build the hand-written ("manual") eager forward functions and grad nodes.
+# Each subdirectory publishes the name of its library target into this scope
+# through set(... PARENT_SCOPE).
+add_subdirectory(forwards)
+add_subdirectory(nodes)
+# Re-export both target lists one level further up so that the directories
+# which add this one can reference them as well.
+set(eager_manual_functions
+    ${eager_manual_functions}
+    PARENT_SCOPE)
+set(eager_manual_nodes
+    ${eager_manual_nodes}
+    PARENT_SCOPE)
--- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h
+++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/api/include/tensor.h"
+
+// Hand-written eager (dygraph) forward entry point for conv2d.
+//
+// The definition lives in
+// paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc.
+// There, every argument is forwarded unchanged to
+// paddle::experimental::conv2d, and the non-tensor arguments are additionally
+// stored as attributes on the backward node.
+//
+// NOTE: the parameter name `paddding_algorithm` (three 'd's) is spelled the
+// same way in the definition and in the grad-node setters; keep the spelling
+// in sync if it is ever corrected.
+//
+// Returns the output tensor of the convolution.
+paddle::experimental::Tensor conv2d_final_state_dygraph_function(
+    const paddle::experimental::Tensor& input,
+    const paddle::experimental::Tensor& filter,
+    std::vector<int> strides,
+    std::vector<int> paddings,
+    std::string paddding_algorithm,
+    int groups,
+    std::vector<int> dilations,
+    std::string data_format,
+    bool use_addto,
+    int workspace_size_MB,
+    bool exhaustive_search);
--- a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt
+# Library target that holds the hand-written conv2d forward function.
+cc_library(
+  conv2d_fwd_function
+  SRCS conv2d_fwd_function.cc
+  DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+
+# Make sure the eager_codegen target is built before this library.
+add_dependencies(conv2d_fwd_function eager_codegen)
+
+# Publish the target name to the parent directory. PARENT_SCOPE sets the
+# variable only in the parent scope, not in this one.
+set(eager_manual_functions
+    conv2d_fwd_function
+    PARENT_SCOPE)
--- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/eager/amp_utils.h"
+#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h"
+#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h"
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/eager/eager_amp_auto_cast.h"
+#include "paddle/fluid/eager/nan_inf_utils.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
+
+#pragma GCC diagnostic ignored "-Wunused-variable"
+DECLARE_bool(check_nan_inf);
+
+// Hand-written eager (dygraph) forward function for conv2d.
+//
+// The function proceeds in three stages:
+//   1. AMP: when the controller's AMP level is not O0, `input` and `filter`
+//      are passed through egr::EagerAmpAutoCast and this function is
+//      re-entered under an AutoCastGuard constructed with AmpLevel::O0.
+//   2. Forward: paddle::experimental::conv2d is called with all arguments
+//      forwarded unchanged; the result is checked for NaN/Inf when
+//      FLAGS_check_nan_inf is set.
+//   3. Autograd: when egr::EagerUtils::ComputeRequireGrad reports true, a
+//      Conv2dGradNodeFinal is created, filled with the attributes and the
+//      input tensor wrappers, and registered as the history of the output.
+//
+// Returns the tensor produced by paddle::experimental::conv2d.
+paddle::experimental::Tensor conv2d_final_state_dygraph_function(
+    const paddle::experimental::Tensor& input,
+    const paddle::experimental::Tensor& filter,
+    std::vector<int> strides,
+    std::vector<int> paddings,
+    std::string paddding_algorithm,
+    int groups,
+    std::vector<int> dilations,
+    std::string data_format,
+    bool use_addto,
+    int workspace_size_MB,
+    bool exhaustive_search) {
+  // Dygraph Record Event
+  paddle::platform::RecordEvent dygraph_entrance_record_event(
+      "conv2d dygraph", paddle::platform::TracerEventType::Operator, 1);
+
+  // AMP Logic
+  if (egr::Controller::Instance().GetAMPLevel() !=
+      paddle::imperative::AmpLevel::O0) {
+    VLOG(5) << "Check and Prepare For AMP";
+    auto op_name = phi::TransToFluidOpName("conv2d");
+    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                         egr::kSlotSmallVectorSize>
+        amp_tensors_vector = {{input}, {filter}};
+
+    auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector);
+
+    auto NEW_input =
+        egr::EagerAmpAutoCast("input", input, amp_dst_dtype, op_name);
+    auto NEW_filter =
+        egr::EagerAmpAutoCast("filter", filter, amp_dst_dtype, op_name);
+
+    {
+      // The guard is constructed with AmpLevel::O0 and lives for the whole
+      // recursive call, whose AMP check above is therefore expected to be
+      // skipped.
+      paddle::imperative::AutoCastGuard guard(
+          egr::Controller::Instance().GetCurrentTracer(),
+          paddle::imperative::AmpLevel::O0);
+      return conv2d_final_state_dygraph_function(NEW_input,
+                                                 NEW_filter,
+                                                 strides,
+                                                 paddings,
+                                                 paddding_algorithm,
+                                                 groups,
+                                                 dilations,
+                                                 data_format,
+                                                 use_addto,
+                                                 workspace_size_MB,
+                                                 exhaustive_search);
+    }
+  }
+
+  // Get Input AutoGradMeta
+  egr::AutogradMeta* input_autograd_meta =
+      egr::EagerUtils::nullable_autograd_meta(input);
+  egr::AutogradMeta* filter_autograd_meta =
+      egr::EagerUtils::nullable_autograd_meta(filter);
+  // Forward API Call
+  VLOG(3) << "Final State Running: "
+          << "conv2d_final_state_dygraph_function";
+  auto api_result = paddle::experimental::conv2d(input,
+                                                 filter,
+                                                 strides,
+                                                 paddings,
+                                                 paddding_algorithm,
+                                                 groups,
+                                                 dilations,
+                                                 data_format,
+                                                 use_addto,
+                                                 workspace_size_MB,
+                                                 exhaustive_search);
+  // Check NaN and Inf if needed
+  if (FLAGS_check_nan_inf) {
+    egr::CheckTensorHasNanOrInf("conv2d", api_result);
+  }
+
+  // Get Outputs
+  auto& out = api_result;
+
+  // Get Output AutoGradMeta
+  egr::AutogradMeta* out_autograd_meta = egr::EagerUtils::autograd_meta(&out);
+  bool trace_backward = egr::Controller::Instance().HasGrad();
+  bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
+      trace_backward, input_autograd_meta, filter_autograd_meta);
+
+  // Check Inplace if needed
+
+  // Node Creation
+  if (require_any_grad) {
+    paddle::platform::RecordEvent node_creation_record_event(
+        "conv2d node_creation",
+        paddle::platform::TracerEventType::OperatorInner,
+        1);
+
+    egr::EagerUtils::PassStopGradient(false, out_autograd_meta);
+
+    // Node Construction. (1, 2) are the backward input / output slot counts
+    // taken by the Conv2dGradNodeFinal constructor.
+    auto grad_node = std::make_shared<Conv2dGradNodeFinal>(1, 2);
+    // SetAttributes if needed
+    grad_node->SetAttributestrides(strides);
+    grad_node->SetAttributepaddings(paddings);
+    grad_node->SetAttributepaddding_algorithm(paddding_algorithm);
+    grad_node->SetAttributegroups(groups);
+    grad_node->SetAttributedilations(dilations);
+    grad_node->SetAttributedata_format(data_format);
+    grad_node->SetAttributeuse_addto(use_addto);
+    grad_node->SetAttributeworkspace_size_MB(workspace_size_MB);
+    grad_node->SetAttributeexhaustive_search(exhaustive_search);
+    // Set TensorWrappers for Forward Inputs if needed
+    grad_node->SetTensorWrapperinput(input);
+    grad_node->SetTensorWrapperfilter(filter);
+    // SetGradOutMeta & SetEdges
+    grad_node->SetGradOutMeta(input, 0);
+    grad_node->SetGradOutMeta(filter, 1);
+    // SetOutRank & SetHistory & SetGradInMeta & RetainGrad
+    if (out_autograd_meta) {
+      egr::EagerUtils::SetOutRankWithSlot(out_autograd_meta, 0);
+      egr::EagerUtils::SetHistory(out_autograd_meta, grad_node);
+    }
+    grad_node->SetGradInMeta(out, 0);
+    egr::EagerUtils::CheckAndRetainGrad(out);
+    // Set TensorWrappers for Forward Outputs if needed
+  }
+
+  // Returns
+  return out;
+}
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt
+# Library target that holds the hand-written conv2d grad nodes.
+cc_library(
+  conv2d_nodes
+  SRCS conv2d_nodes.cc
+  DEPS ${eager_deps} ${fluid_deps})
+
+# Publish the target name to the parent directory. PARENT_SCOPE sets the
+# variable only in the parent scope, not in this one.
+set(eager_manual_nodes
+    conv2d_nodes
+    PARENT_SCOPE)
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "glog/logging.h"
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/eager/nan_inf_utils.h"
+#include "paddle/fluid/eager/to_static/run_program_op_node.h"
+#include "paddle/fluid/eager/utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/api/all.h"
+#include "paddle/phi/api/backward/backward_api.h"
+#include "paddle/phi/api/backward/sparse_bw_api.h"
+
+#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h"
+#include "paddle/phi/api/include/sparse_api.h"
+DECLARE_bool(check_nan_inf);
+
+// First-order backward computation for conv2d.
+//
+// grads:        incoming gradients; slot 0, element 0 is used as `grad_out`
+//               after the gradient hooks have been applied.
+// create_graph: when true and grad mode is on (Controller::HasGrad()), a
+//               Conv2dDoubleGradNodeFinal is attached to the produced
+//               gradients so that they can be differentiated again.
+// is_new_grad:  not read by this function.
+//
+// Returns two slots: returns[0] is `grad_input`, returns[1] is `grad_filter`.
+// A slot whose output meta is missing or marked stop-gradient is handed to
+// paddle::experimental::conv2d_grad as nullptr.
+paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                     egr::kSlotSmallVectorSize>
+Conv2dGradNodeFinal::operator()(
+    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                         egr::kSlotSmallVectorSize>& grads,
+    bool create_graph,
+    bool is_new_grad) {
+  // Fill Zero For GradIn Tensors
+  VLOG(3) << " Running Conv2dGradNodeFinal: " << this;
+  // Apply Gradient Hooks
+  auto hooked_grads = ApplyGradientHooks(grads);
+
+  // Collect GradIn Tensors, Attrs and Recovered TensorWrappers
+  auto input = egr::EagerUtils::RecoverTensorWrapper(&this->input_);
+  auto filter = egr::EagerUtils::RecoverTensorWrapper(&this->filter_);
+  auto& grad_out = hooked_grads[0][0];
+  auto& strides = this->strides_;
+  auto& paddings = this->paddings_;
+  auto& paddding_algorithm = this->paddding_algorithm_;
+  auto& groups = this->groups_;
+  auto& dilations = this->dilations_;
+  auto& data_format = this->data_format_;
+  auto& use_addto = this->use_addto_;
+  auto& workspace_size_MB = this->workspace_size_MB_;
+  auto& exhaustive_search = this->exhaustive_search_;
+  // Prepare Grad function call
+
+  const auto& out_metas = OutputMeta();
+  paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                       egr::kSlotSmallVectorSize>
+      returns(2);
+  for (int i = 0; i < 2; ++i) {
+    // Every slot holds at least one tensor so returns[i][0] is always valid.
+    returns[i].resize(out_metas[i].empty() ? 1 : out_metas[i].size());
+  }
+
+  auto* api_output_0 =
+      (out_metas[0].empty() || out_metas[0][0].IsStopGradient())
+          ? nullptr
+          : &returns[0][0];
+  auto* api_output_1 =
+      (out_metas[1].empty() || out_metas[1][0].IsStopGradient())
+          ? nullptr
+          : &returns[1][0];
+  // Runtime check if we need next grad
+  bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;
+
+  // Inplace Check
+
+  // Inplace Strategy
+
+  // Call grad_api function
+  VLOG(3) << "Final State Running: Conv2dGradNodeFinal";
+
+  paddle::experimental::conv2d_grad(input,
+                                    filter,
+                                    grad_out,
+                                    strides,
+                                    paddings,
+                                    paddding_algorithm,
+                                    groups,
+                                    dilations,
+                                    data_format,
+                                    use_addto,
+                                    workspace_size_MB,
+                                    exhaustive_search,
+                                    api_output_0,
+                                    api_output_1);
+  // Check NaN and Inf if needed
+  if (FLAGS_check_nan_inf) {
+    egr::CheckTensorHasNanOrInf("conv2d_grad", returns);
+  }
+
+  // Get GradOut autograd_meta
+
+  auto& grad_input = returns[0][0];
+  egr::AutogradMeta* grad_input_autograd_meta =
+      grad_input.initialized() ? egr::EagerUtils::autograd_meta(&grad_input)
+                               : nullptr;
+  if (grad_input_autograd_meta)
+    grad_input_autograd_meta->SetStopGradient(false);
+  VLOG(3) << "Conv2dGradNodeFinal grad_input_autograd_meta: "
+          << grad_input_autograd_meta;
+
+  auto& grad_filter = returns[1][0];
+  egr::AutogradMeta* grad_filter_autograd_meta =
+      grad_filter.initialized() ? egr::EagerUtils::autograd_meta(&grad_filter)
+                                : nullptr;
+  if (grad_filter_autograd_meta)
+    grad_filter_autograd_meta->SetStopGradient(false);
+  VLOG(3) << "Conv2dGradNodeFinal grad_filter_autograd_meta: "
+          << grad_filter_autograd_meta;
+
+  // Create Grad Node
+  if (trace_backward) {
+    paddle::platform::RecordEvent node_creation_record_event(
+        "conv2d_grad node_creation",
+        paddle::platform::TracerEventType::OperatorInner,
+        1);
+
+    // Node Construction. (2, 3) are the backward input / output slot counts
+    // taken by the Conv2dDoubleGradNodeFinal constructor.
+    auto grad_node = std::make_shared<Conv2dDoubleGradNodeFinal>(2, 3);
+    // SetAttributes if needed
+    grad_node->SetAttributestrides(strides);
+    grad_node->SetAttributepaddings(paddings);
+    grad_node->SetAttributepaddding_algorithm(paddding_algorithm);
+    grad_node->SetAttributegroups(groups);
+    grad_node->SetAttributedilations(dilations);
+    grad_node->SetAttributedata_format(data_format);
+    grad_node->SetAttributeuse_addto(use_addto);
+    grad_node->SetAttributeworkspace_size_MB(workspace_size_MB);
+    grad_node->SetAttributeexhaustive_search(exhaustive_search);
+    // Set TensorWrappers for Forward Inputs if needed
+    grad_node->SetTensorWrapperinput(input);
+    grad_node->SetTensorWrapperfilter(filter);
+    grad_node->SetTensorWrappergrad_out(grad_out);
+    // SetGradOutMeta & SetEdges
+    // The guards are crossed on purpose: slot 0 (input) is keyed on
+    // grad_filter, slots 1 and 2 (filter, grad_out) are keyed on grad_input.
+    // NOTE(review): slot 2 (grad_out) gets its meta only when grad_input
+    // exists, although grad_out is presumably also an operand of the
+    // grad_filter computation -- confirm this is intended.
+    if (grad_filter_autograd_meta) {
+      grad_node->SetGradOutMeta(input, 0);
+    }
+    if (grad_input_autograd_meta) {
+      grad_node->SetGradOutMeta(filter, 1);
+      grad_node->SetGradOutMeta(grad_out, 2);
+    }
+    // SetOutRank & SetHistory & SetGradInMeta & RetainGrad
+    if (grad_input_autograd_meta) {
+      egr::EagerUtils::SetOutRankWithSlot(grad_input_autograd_meta, 0);
+    }
+    if (grad_filter_autograd_meta) {
+      egr::EagerUtils::SetOutRankWithSlot(grad_filter_autograd_meta, 1);
+    }
+    if (grad_input_autograd_meta) {
+      egr::EagerUtils::SetHistory(grad_input_autograd_meta, grad_node);
+    }
+    if (grad_filter_autograd_meta) {
+      egr::EagerUtils::SetHistory(grad_filter_autograd_meta, grad_node);
+    }
+    grad_node->SetGradInMeta(grad_input, 0);
+    grad_node->SetGradInMeta(grad_filter, 1);
+    egr::EagerUtils::CheckAndRetainGrad(grad_input);
+    egr::EagerUtils::CheckAndRetainGrad(grad_filter);
+    // Set TensorWrappers for Forward Outputs if needed
+  }
+
+  // Return
+  if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);
+  return returns;
+}
+
+// Second-order backward computation for conv2d.
+//
+// grads: slot 0, element 0 is used as `grad_input_grad` and slot 1,
+//        element 0 as `grad_filter_grad`. Both are treated as optional: each
+//        is wrapped in a paddle::optional only when the tensor is initialized
+//        after zero-filling and gradient hooks.
+// create_graph, is_new_grad: not read by this function; no further grad node
+//        is created here.
+//
+// Returns three slots: returns[0] `input_grad`, returns[1] `filter_grad`,
+// returns[2] `grad_out_grad`. A slot whose output meta is missing or marked
+// stop-gradient is handed to paddle::experimental::conv2d_grad_grad as
+// nullptr.
+paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                     egr::kSlotSmallVectorSize>
+Conv2dDoubleGradNodeFinal::operator()(
+    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                         egr::kSlotSmallVectorSize>& grads,
+    bool create_graph,
+    bool is_new_grad) {
+  // Fill Zero For GradIn Tensors
+  const auto& input_metas = this->InputMeta();
+  egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0][0],
+                                                     input_metas[0][0]);
+  egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[1][0],
+                                                     input_metas[1][0]);
+
+  // Apply Gradient Hooks
+  auto hooked_grads = ApplyGradientHooks(grads);
+
+  // Collect GradIn Tensors, Attrs and Recovered TensorWrappers
+  auto input = egr::EagerUtils::RecoverTensorWrapper(&this->input_);
+  auto filter = egr::EagerUtils::RecoverTensorWrapper(&this->filter_);
+  auto grad_out = egr::EagerUtils::RecoverTensorWrapper(&this->grad_out_);
+  auto& grad_input_grad = hooked_grads[0][0];
+
+  paddle::optional<paddle::experimental::Tensor> grad_input_grad_optional;
+  if (grad_input_grad.initialized())
+    grad_input_grad_optional =
+        paddle::make_optional<paddle::experimental::Tensor>(grad_input_grad);
+
+  auto& grad_filter_grad = hooked_grads[1][0];
+
+  paddle::optional<paddle::experimental::Tensor> grad_filter_grad_optional;
+  if (grad_filter_grad.initialized())
+    grad_filter_grad_optional =
+        paddle::make_optional<paddle::experimental::Tensor>(grad_filter_grad);
+
+  auto& strides = this->strides_;
+  auto& paddings = this->paddings_;
+  auto& paddding_algorithm = this->paddding_algorithm_;
+  auto& groups = this->groups_;
+  auto& dilations = this->dilations_;
+  auto& data_format = this->data_format_;
+  auto& use_addto = this->use_addto_;
+  auto& workspace_size_MB = this->workspace_size_MB_;
+  auto& exhaustive_search = this->exhaustive_search_;
+  // Prepare Grad function call
+
+  const auto& out_metas = OutputMeta();
+  paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                       egr::kSlotSmallVectorSize>
+      returns(3);
+  for (int i = 0; i < 3; ++i) {
+    // Every slot holds at least one tensor so returns[i][0] is always valid.
+    returns[i].resize(out_metas[i].empty() ? 1 : out_metas[i].size());
+  }
+
+  auto* api_output_0 =
+      (out_metas[0].empty() || out_metas[0][0].IsStopGradient())
+          ? nullptr
+          : &returns[0][0];
+  auto* api_output_1 =
+      (out_metas[1].empty() || out_metas[1][0].IsStopGradient())
+          ? nullptr
+          : &returns[1][0];
+  auto* api_output_2 =
+      (out_metas[2].empty() || out_metas[2][0].IsStopGradient())
+          ? nullptr
+          : &returns[2][0];
+  // Runtime check if we need next grad
+
+  // Inplace Check
+
+  // Inplace Strategy
+
+  // Call grad_api function
+  // The message uses the real class name (it previously named a
+  // non-existent "Conv2dGradGradNodeFinal").
+  VLOG(3) << "Final State Running: Conv2dDoubleGradNodeFinal";
+
+  paddle::experimental::conv2d_grad_grad(input,
+                                         filter,
+                                         grad_out,
+                                         grad_input_grad_optional,
+                                         grad_filter_grad_optional,
+                                         strides,
+                                         paddings,
+                                         paddding_algorithm,
+                                         groups,
+                                         dilations,
+                                         data_format,
+                                         use_addto,
+                                         workspace_size_MB,
+                                         exhaustive_search,
+                                         api_output_0,
+                                         api_output_1,
+                                         api_output_2);
+  // Check NaN and Inf if needed
+  if (FLAGS_check_nan_inf) {
+    egr::CheckTensorHasNanOrInf("conv2d_grad_grad", returns);
+  }
+
+  // Get GradOut autograd_meta
+
+  auto& input_grad = returns[0][0];
+  egr::AutogradMeta* input_grad_autograd_meta =
+      input_grad.initialized() ? egr::EagerUtils::autograd_meta(&input_grad)
+                               : nullptr;
+  if (input_grad_autograd_meta)
+    input_grad_autograd_meta->SetStopGradient(false);
+
+  auto& filter_grad = returns[1][0];
+  egr::AutogradMeta* filter_grad_autograd_meta =
+      filter_grad.initialized() ? egr::EagerUtils::autograd_meta(&filter_grad)
+                                : nullptr;
+  if (filter_grad_autograd_meta)
+    filter_grad_autograd_meta->SetStopGradient(false);
+
+  auto& grad_out_grad = returns[2][0];
+  egr::AutogradMeta* grad_out_grad_autograd_meta =
+      grad_out_grad.initialized()
+          ? egr::EagerUtils::autograd_meta(&grad_out_grad)
+          : nullptr;
+  if (grad_out_grad_autograd_meta)
+    grad_out_grad_autograd_meta->SetStopGradient(false);
+
+  // Create Grad Node
+
+  // Return
+  if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);
+  return returns;
+}
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/eager/grad_node_info.h"
+#include "paddle/fluid/eager/tensor_wrapper.h"
+
+// First-order backward node for conv2d.
+//
+// Stores the forward inputs (`input`, `filter`) as tensor wrappers together
+// with the conv2d attributes. operator() is defined in
+// paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc.
+class Conv2dGradNodeFinal : public egr::GradNodeBase {
+ public:
+  Conv2dGradNodeFinal() : egr::GradNodeBase() {}
+  Conv2dGradNodeFinal(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
+      : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
+  ~Conv2dGradNodeFinal() override = default;
+
+  // Runs the backward computation; see conv2d_nodes.cc.
+  paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                       egr::kSlotSmallVectorSize>
+  operator()(
+      paddle::small_vector<std::vector<paddle::experimental::Tensor>,  // NOLINT
+                           egr::kSlotSmallVectorSize>& grads,          // NOLINT
+      bool create_graph = false,                                       // NOLINT
+      bool is_new_grad = false) override;                              // NOLINT
+  std::string name() override { return "Conv2dGradNodeFinal"; }
+
+  // Drops the stored forward tensors and records that they were cleared.
+  void ClearTensorWrappers() override {
+    input_.clear();
+    filter_.clear();
+
+    SetIsTensorWrappersCleared(true);
+  }
+
+  // Returns a copy-constructed duplicate of this node.
+  std::shared_ptr<GradNodeBase> Copy() const override {
+    auto copied_node = std::make_shared<Conv2dGradNodeFinal>(*this);
+    VLOG(3) << "Copy Conv2dGradNodeFinal: " << this
+            << " to: " << copied_node.get();
+    return copied_node;
+  }
+
+  // SetTensorWrapperX, SetTensorWrapperY, ...
+  void SetTensorWrapperinput(const paddle::experimental::Tensor& input) {
+    input_ = egr::TensorWrapper(input, false);
+  }
+  void SetTensorWrapperfilter(const paddle::experimental::Tensor& filter) {
+    filter_ = egr::TensorWrapper(filter, false);
+  }
+
+  // SetAttributes
+  void SetAttributestrides(const std::vector<int>& strides) {
+    strides_ = strides;
+  }
+  void SetAttributepaddings(const std::vector<int>& paddings) {
+    paddings_ = paddings;
+  }
+  void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) {
+    paddding_algorithm_ = paddding_algorithm;
+  }
+  void SetAttributegroups(const int& groups) { groups_ = groups; }
+  void SetAttributedilations(const std::vector<int>& dilations) {
+    dilations_ = dilations;
+  }
+  void SetAttributedata_format(const std::string& data_format) {
+    data_format_ = data_format;
+  }
+  void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; }
+  void SetAttributeworkspace_size_MB(const int& workspace_size_MB) {
+    workspace_size_MB_ = workspace_size_MB;
+  }
+  void SetAttributeexhaustive_search(const bool& exhaustive_search) {
+    exhaustive_search_ = exhaustive_search;
+  }
+
+ private:
+  // TensorWrappers
+  egr::TensorWrapper input_;
+  egr::TensorWrapper filter_;
+
+  // Attributes
+  // The scalar attributes carry default initializers so that a node which is
+  // default-constructed, or copied through Copy() before every setter ran,
+  // never holds indeterminate values. The defaults are placeholders only;
+  // the SetAttribute* setters are expected to overwrite them.
+  std::vector<int> strides_;
+  std::vector<int> paddings_;
+  std::string paddding_algorithm_;
+  int groups_ = 1;
+  std::vector<int> dilations_;
+  std::string data_format_;
+  bool use_addto_ = false;
+  int workspace_size_MB_ = 0;
+  bool exhaustive_search_ = false;
+};
+
+// Second-order backward node for conv2d.
+//
+// Stores `input`, `filter` and `grad_out` as tensor wrappers together with
+// the conv2d attributes. operator() is defined in
+// paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc.
+class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase {
+ public:
+  Conv2dDoubleGradNodeFinal() : egr::GradNodeBase() {}
+  Conv2dDoubleGradNodeFinal(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
+      : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
+  ~Conv2dDoubleGradNodeFinal() override = default;
+
+  // Runs the second-order backward computation; see conv2d_nodes.cc.
+  paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                       egr::kSlotSmallVectorSize>
+  operator()(
+      paddle::small_vector<std::vector<paddle::experimental::Tensor>,  // NOLINT
+                           egr::kSlotSmallVectorSize>& grads,          // NOLINT
+      bool create_graph = false,                                       // NOLINT
+      bool is_new_grad = false) override;                              // NOLINT
+  std::string name() override { return "Conv2dDoubleGradNodeFinal"; }
+
+  // Drops the stored tensors and records that they were cleared.
+  void ClearTensorWrappers() override {
+    input_.clear();
+    filter_.clear();
+    grad_out_.clear();
+
+    SetIsTensorWrappersCleared(true);
+  }
+
+  // Returns a copy-constructed duplicate of this node. The copy is logged in
+  // the same way Conv2dGradNodeFinal::Copy() logs it.
+  std::shared_ptr<GradNodeBase> Copy() const override {
+    auto copied_node = std::make_shared<Conv2dDoubleGradNodeFinal>(*this);
+    VLOG(3) << "Copy Conv2dDoubleGradNodeFinal: " << this
+            << " to: " << copied_node.get();
+    return copied_node;
+  }
+
+  // SetTensorWrapperX, SetTensorWrapperY, ...
+  void SetTensorWrapperinput(const paddle::experimental::Tensor& input) {
+    input_ = egr::TensorWrapper(input, false);
+  }
+  void SetTensorWrapperfilter(const paddle::experimental::Tensor& filter) {
+    filter_ = egr::TensorWrapper(filter, false);
+  }
+  void SetTensorWrappergrad_out(const paddle::experimental::Tensor& grad_out) {
+    grad_out_ = egr::TensorWrapper(grad_out, false);
+  }
+
+  // SetAttributes
+  void SetAttributestrides(const std::vector<int>& strides) {
+    strides_ = strides;
+  }
+  void SetAttributepaddings(const std::vector<int>& paddings) {
+    paddings_ = paddings;
+  }
+  void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) {
+    paddding_algorithm_ = paddding_algorithm;
+  }
+  void SetAttributegroups(const int& groups) { groups_ = groups; }
+  void SetAttributedilations(const std::vector<int>& dilations) {
+    dilations_ = dilations;
+  }
+  void SetAttributedata_format(const std::string& data_format) {
+    data_format_ = data_format;
+  }
+  void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; }
+  void SetAttributeworkspace_size_MB(const int& workspace_size_MB) {
+    workspace_size_MB_ = workspace_size_MB;
+  }
+  void SetAttributeexhaustive_search(const bool& exhaustive_search) {
+    exhaustive_search_ = exhaustive_search;
+  }
+
+ private:
+  // TensorWrappers
+  egr::TensorWrapper input_;
+  egr::TensorWrapper filter_;
+  egr::TensorWrapper grad_out_;
+
+  // Attributes
+  // The scalar attributes carry default initializers so that a node which is
+  // default-constructed, or copied through Copy() before every setter ran,
+  // never holds indeterminate values. The defaults are placeholders only;
+  // the SetAttribute* setters are expected to overwrite them.
+  std::vector<int> strides_;
+  std::vector<int> paddings_;
+  std::string paddding_algorithm_;
+  int groups_ = 1;
+  std::vector<int> dilations_;
+  std::string data_format_;
+  bool use_addto_ = false;
+  int workspace_size_MB_ = 0;
+  bool exhaustive_search_ = false;
+};
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -40,6 +40,8 @@ from codegen_utils import AssertMessage, GetIndent
 # keeping the code compatible, here we also skip inplace check in new dygraph temporarily,
 # and this will be fixed in the futrue.
 inplace_check_blacklist = set(["assign_out_"])
+# --- Black list of ops for which backward code generation is NOT needed
+black_ops_list = ["conv2d", "conv2d_grad", "conv2d_grad_grad"]


 ###########
@@ -154,9 +156,7 @@ paddle::small_vector<std::vector<paddle::experimental::Tensor>, egr::kSlotSmallV
 {}
  // Prepare Grad function call
 {}
-  // Get GradIn autograd_meta
-{}
-  // Compute Require Grad
+  // Runtime check if we need next grad
 {}
  // Inplace Check
 {}
@@ -229,6 +229,27 @@ FORWARD_BODY_TEMPLATE = \
  }}
 """

+HIHGER_ORDER_DERIVATIVE_VALUE_TEMPLATE = \
+"""  if(trace_backward) {{
+{}
+    // Node Construction
+{}
+    // SetAttributes if needed
+{}
+    // Set TensorWrappers for Forward Inputs if needed
+{}
+    // SetGradOutMeta & SetEdges
+{}
+    // SetOutRank & SetHistory & SetGradInMeta & RetainGrad
+{}
+{}
+{}
+{}
+    // Set TensorWrappers for Forward Outputs if needed
+{}
+  }}
+"""
+
 NAMESPACE_WRAPPER_TEMPLATE = \
 """
 namespace {} {{
@@ -252,7 +273,7 @@ NODE_CC_FILE_TEMPLATE = \
 #include "paddle/fluid/eager/nan_inf_utils.h"

 #include "paddle/phi/api/include/sparse_api.h"
-
+#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h"
 DECLARE_bool(check_nan_inf);
 {}
 """
@@ -279,7 +300,7 @@ FORWARD_CC_FILE_TEMPLATE = \
 #include "paddle/fluid/eager/eager_amp_auto_cast.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/fluid/eager/nan_inf_utils.h"
-
+#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h"
 DECLARE_bool(check_nan_inf);
 {}
 {}
@@ -294,7 +315,7 @@ FORWARD_H_FILE_TEMPLATE = \
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/eager/to_static/run_program_op_func.h"
-
+#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h"
 {}
 {}
 """
@@ -584,7 +605,6 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):

        self.backward_api_name = forward_api_contents['backward']
        self.backward_forward_str = grad_api_contents['forward']
-
        backward_args_str = grad_api_contents['args']
        backward_returns_str = grad_api_contents['output']

@@ -663,7 +683,7 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
                backward_output_pos
            ]

-    def GenerateNodeCreationCodes(self):
+    def GenerateNodeCreationCodes(self, for_backward=False):
        forward_api_name = self.forward_api_name
        forward_inputs_position_map = self.forward_inputs_position_map
        forward_outputs_position_map = self.forward_outputs_position_map
@@ -794,13 +814,21 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):

        node_event_name = forward_api_name + " node_creation"
        node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::OperatorInner, 1);\n"
+        if not for_backward:
+            self.node_creation_str = FORWARD_BODY_TEMPLATE.format(
+                node_creation_event_str, pass_stop_gradient_args_str,
+                node_construction_str, set_attributes_str,
+                set_input_tensor_wrappers_str, set_grad_out_meta_str,
+                set_out_rank_str, set_history_str, set_grad_in_meta_str,
+                set_retain_grad_str, set_output_tensor_wrappers_str)
+        else:
+            self.node_creation_str = HIHGER_ORDER_DERIVATIVE_VALUE_TEMPLATE.format(
+                node_creation_event_str, node_construction_str,
+                set_attributes_str, set_input_tensor_wrappers_str,
+                set_grad_out_meta_str, set_out_rank_str, set_history_str,
+                set_grad_in_meta_str, set_retain_grad_str,
+                set_output_tensor_wrappers_str)

-        self.node_creation_str = FORWARD_BODY_TEMPLATE.format(
-            node_creation_event_str, pass_stop_gradient_args_str,
-            node_construction_str, set_attributes_str,
-            set_input_tensor_wrappers_str, set_grad_out_meta_str,
-            set_out_rank_str, set_history_str, set_grad_in_meta_str,
-            set_retain_grad_str, set_output_tensor_wrappers_str)
        self.grad_node_out_list = grad_node_out_list

    def run(self):
@@ -1234,7 +1262,7 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
            next_node_generator = DygraphFunctionGeneratorBase(
                forward_api_contents, backward_api_contents, namespace)
            next_node_generator.run()
-            next_node_generator.GenerateNodeCreationCodes()
+            next_node_generator.GenerateNodeCreationCodes(True)

            next_grad_node_creation_str = next_node_generator.node_creation_str
            next_grad_node_out_list = next_node_generator.grad_node_out_list
@@ -1342,6 +1370,7 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
        inplace_grad_input_str = ""
        inplaced_tensor_wrapper = False
        inplace_check_str = ""
+        optional_inplace_var_name = []
        # Grad Ins from TensorWrappers
        for name, (_, is_fwd_input,
                   grad_api_position), in backward_forward_inputs_map.items():
@@ -1351,6 +1380,13 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
            is_optional = (name in self.optional_inputs)
            tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});"
            if backward_inplace_map and name in backward_inplace_map.keys():
+                if len(next_grad_node_creation_str) > 0:
+                    if (transformed_tensor_name
+                            in backward_forward_inputs_map_next) and (
+                                backward_forward_inputs_map_next[
+                                    transformed_tensor_name][1]):
+                        optional_inplace_var_name.append(
+                            transformed_tensor_name)
                tensor_wrapper_intermidiate_tensor_str = f"(&this->{tensor_wrapper_name})->get_intermidiate_tensor()"
                inplace_check_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format(
                    transformed_tensor_name, transformed_tensor_name, name,
@@ -1371,7 +1407,6 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):

            get_grad_in_args_list.append(tensor_wrapper_recover_str)

-        optional_inplace_check = False
        # Grad Ins from grads
        for name, (ttype, fwd_position,
                   grad_api_position) in backward_grad_inputs_map.items():
@@ -1388,7 +1423,8 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
                                in backward_forward_inputs_map_next) and (
                                    backward_forward_inputs_map_next[
                                        transformed_tensor_name][1]):
-                            optional_inplace_check = False
+                            optional_inplace_var_name.append(
+                                transformed_tensor_name)
                    grads_tensor_str = f"grads[{fwd_position}][0]"
                    inplace_check_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format(
                        transformed_tensor_name, transformed_tensor_name, name,
@@ -1441,25 +1477,25 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
            transformed_tensor_name = self.TransformToNextGradName(name)
            out_index = out_index + 1
            grad_api_args.append(f"api_output_{out_index}")
-            if not optional_inplace_check:
-                optional_inplace_str = "VLOG(6) << \"No Inplace should happend for wrappered input\";"
+            if inplace_grad_input_str in optional_inplace_var_name:
+                optional_inplace_str = "VLOG(6) << \"No Inplace should happend for wrappered input: {inplace_grad_input_str}\";"
            else:
                optional_inplace_str = f"""if (api_output_{out_index} != nullptr && can_be_inplaced) {{
-            egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index});
-            }}"""
+      egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index});
+    }}"""
            if IsPlainTensorType(ttype):

                if backward_inplace_map and name in backward_inplace_map.values(
                ):
-                    inplace_str = f"""if (api_output_{out_index} != nullptr && can_be_inplaced) {{
-    egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index});
-  }}"""
+                    inplace_str = f""" if (api_output_{out_index} != nullptr && can_be_inplaced) {{
+      egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index});
+    }}"""
                    if len(next_grad_node_creation_str) > 0:
                        inplace_for_grad_outs_str += f"""
-  if (!require_any_grad) {{
-    {inplace_str}
-  }}else{{
+  if (trace_backward) {{
    {optional_inplace_str}
+  }} else {{
+    {inplace_str}
  }}"""
                    else:
                        inplace_for_grad_outs_str += inplace_str
@@ -1490,84 +1526,53 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
            backward_api_name, "returns")

        # Prepare for Node Creation if Necessary
-        inputs_autograd_meta_str = ""
        outputs_autograd_meta_str = ""
-        compute_require_grad_str = ""
+        compute_require_next_grad_str = ""
        if len(next_grad_node_creation_str) > 0:
-            # 1. Get Grad Input AutoGradMeta
-            inputs_autograd_meta_list = []
-            compute_require_grad_args_list = ["trace_backward"]
-            for name, (ttype, pos,
-                       grad_api_position) in backward_grad_inputs_map.items():
-                transformed_tensor_name = self.TransformToNextGradName(name)
-                if transformed_tensor_name in next_grad_node_out_list:
-                    input_autograd_meta_name = GetAutoGradMetaName(
-                        transformed_tensor_name)
-                    if IsPlainTensorType(ttype):
-                        input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});"
-                    else:
-                        assert IsVectorTensorType(ttype)
-                        input_autograd_meta_vec_name = GetAutoGradMetaVectorName(
-                            transformed_tensor_name)
-                        input_autograd_meta = f"{indent}std::vector<egr::AutogradMeta*> {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n"
-                        input_autograd_meta += f"{indent}std::vector<egr::AutogradMeta*>* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};"
-
-                    inputs_autograd_meta_list.append(input_autograd_meta)
-                    compute_require_grad_args_list.append(
-                        input_autograd_meta_name)
-
-            # 2. Get TensorWrapper AutoGradMeta
-            for name, (ttype, _, pos), in backward_forward_inputs_map.items():
-                transformed_tensor_name = self.TransformToNextGradName(name)
-                if transformed_tensor_name in next_grad_node_out_list:
-                    input_autograd_meta_name = GetAutoGradMetaName(
-                        transformed_tensor_name)
-                    if IsPlainTensorType(ttype):
-                        input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});"
-                    else:
-                        assert IsVectorTensorType(ttype)
-                        input_autograd_meta_vec_name = GetAutoGradMetaVectorName(
-                            transformed_tensor_name)
-                        input_autograd_meta = f"{indent}std::vector<egr::AutogradMeta*> {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n"
-                        input_autograd_meta += f"{indent}std::vector<egr::AutogradMeta*>* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};"
-
-                    inputs_autograd_meta_list.append(input_autograd_meta)
-                    compute_require_grad_args_list.append(
-                        input_autograd_meta_name)
-
-            inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list)
-            compute_require_grad_args_str = ",".join(
-                compute_require_grad_args_list)
-
-            # 3. Get Output AutoGradMeta
-            outputs_autograd_meta_list = []
-            num_fwd_outputs = len(backward_grad_outputs_map.keys())
-            for name, (rtype, pos,
-                       grad_api_position) in backward_grad_outputs_map.items():
-                transformed_tensor_name = self.TransformToNextGradName(name)
-
-                output_autograd_meta_name = GetAutoGradMetaName(
-                    transformed_tensor_name)
-                output_autograd_meta_vec_name = GetAutoGradMetaVectorName(
-                    transformed_tensor_name)
-                if IsPlainTensorType(rtype):
-                    output_autograd_meta = f"""
+            compute_require_next_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n"
+
+        # 3. Get Output AutoGradMeta
+        outputs_autograd_meta_list = []
+        # TODO(jiabin): Optimize this with SetStopGradient instead of Pass Stop gradient
+
+        num_fwd_outputs = len(backward_grad_outputs_map.keys())
+        for name, (rtype, pos,
+                   grad_api_position) in backward_grad_outputs_map.items():
+            transformed_tensor_name = self.TransformToNextGradName(name)
+
+            output_autograd_meta_name = GetAutoGradMetaName(
+                transformed_tensor_name)
+            output_autograd_meta_vec_name = GetAutoGradMetaVectorName(
+                transformed_tensor_name)
+            if IsPlainTensorType(rtype):
+                output_autograd_meta = f"""
  auto& {transformed_tensor_name} = returns[{pos}][0];
-  egr::AutogradMeta* {output_autograd_meta_name} = returns[{pos}][0].initialized() ? egr::EagerUtils::autograd_meta(&{transformed_tensor_name}) : nullptr;"""
+  egr::AutogradMeta* {output_autograd_meta_name} = returns[{pos}][0].initialized() ? egr::EagerUtils::autograd_meta(&{transformed_tensor_name}) : nullptr;
+  if ({output_autograd_meta_name}) {output_autograd_meta_name}->SetStopGradient(false);
+  """

+            else:
+                assert IsVectorTensorType(rtype)
+                if len(next_grad_node_creation_str) > 0:
+                    output_autograd_meta = f"""
+    auto& {transformed_tensor_name} = returns[{pos}];
+    std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});
+    std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};
+    for(auto* meta : {output_autograd_meta_vec_name}){{
+        meta->SetStopGradient(false);
+    }}
+"""
                else:
-                    assert IsVectorTensorType(rtype)
                    output_autograd_meta = f"""
-  auto& {transformed_tensor_name} = returns[{pos}];
-  std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});
-  std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};
+    auto& {transformed_tensor_name} = returns[{pos}];
+    std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});
+    for(auto* meta : {output_autograd_meta_vec_name}){{
+        meta->SetStopGradient(false);
+    }}
 """
+            outputs_autograd_meta_list.append(output_autograd_meta)

-                outputs_autograd_meta_list.append(output_autograd_meta)
-            outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list)
-
-            compute_require_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n"
-            compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});"
+        outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list)

        returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n"
        returns_str += f"{indent}return returns;\n"
@@ -1576,11 +1581,10 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):

        self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format(
            grad_node_name, fill_zero_str, get_grad_in_args_str,
-            grad_function_prepare_str, inputs_autograd_meta_str,
-            compute_require_grad_str, inplace_check_str,
-            inplace_for_grad_outs_str, grad_node_name, grad_function_call_str,
-            check_nan_inf_str, outputs_autograd_meta_str,
-            next_grad_node_creation_str, returns_str)
+            grad_function_prepare_str, compute_require_next_grad_str,
+            inplace_check_str, inplace_for_grad_outs_str, grad_node_name,
+            grad_function_call_str, check_nan_inf_str,
+            outputs_autograd_meta_str, next_grad_node_creation_str, returns_str)

    def run(self):
        super().run()
@@ -1631,6 +1635,7 @@ class DygraphForwardAndNodesGenerator(GeneratorBase):
        if 'backward' not in forward_api_contents.keys(): return None

        backward_api_name = forward_api_contents['backward']
+        if backward_api_name in black_ops_list: return None
        assert backward_api_name in grad_api_dict.keys(), AssertMessage(
            backward_api_name, grad_api_dict.keys())
        backward_api_contents = grad_api_dict[backward_api_name]
@@ -1646,7 +1651,7 @@ class DygraphForwardAndNodesGenerator(GeneratorBase):
            backward_api_contents = self.GetBackwardAPIContents(
                forward_api_contents)
            if backward_api_contents is None: continue
-
+            if forward_api_contents['api'] in black_ops_list: continue
            # Generate Dygraph Forward Function
            function_generator = DygraphForwardFunctionGenerator(
                forward_api_contents, backward_api_contents, namespace)

--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -52,7 +52,14 @@ class GeneralGrad {
        AutogradMeta* auto_grad_meta =
            EagerUtils::unsafe_autograd_meta(inputs[i]);
        auto* target_node = auto_grad_meta->GetMutableGradNode().get();
-
+        VLOG(8) << "Get no grad vars' grad_node: " << target_node->name()
+                << ", " << target_node << " with output rank info: "
+                << auto_grad_meta->OutRankInfo().first << ", "
+                << auto_grad_meta->OutRankInfo().second;
+        if (is_no_grad_vars) {
+          (no_grad_var_nodes_inputmeta_map_)[target_node] = auto_grad_meta;
+          continue;
+        }
        if (orig_to_copied_node_mapping_.count(target_node)) {
          target_node = orig_to_copied_node_mapping_[target_node].get();
        } else {
@@ -67,11 +74,8 @@ class GeneralGrad {
                                    "stop_gradient=True.",
                                    msg,
                                    i));
-        if (is_no_grad_vars) {
-          (no_grad_var_nodes_inputmeta_map_)[target_node] = auto_grad_meta;
-        } else {  // normal input
-          (input_target_nodes_inputmeta_map_)[target_node] = auto_grad_meta;
-        }
+        // normal input
+        (input_target_nodes_inputmeta_map_)[target_node] = auto_grad_meta;
      }
    }
  }
@@ -305,8 +309,6 @@ class GeneralGrad {
      const std::unordered_map<GradNodeBase*,
                               std::unique_ptr<GradTensorHolder>>&
          node_input_buffers_dict) {
-    // Get no_grad_vars's GradNodes and InputMeta Info
-    GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */);
    // Get inputs's GradNodes and InputMeta Info
    GetTargetNodesInfo(inputs, false /* is_no_grad_vars */);
    // Purify potentialstartup_ops, remove those nodes that are the same as
@@ -402,6 +404,21 @@ class GeneralGrad {

          std::shared_ptr<GradNodeBase> orig_next_node =
              orig_edge.GetMutableGradNode();
+
+          if (no_grad_var_nodes_inputmeta_map_.count(orig_next_node.get()) &&
+              (no_grad_var_nodes_inputmeta_map_[orig_next_node.get()]
+                   ->OutRankInfo() == orig_edge.GetEdgeRankInfo())) {
+            VLOG(3) << "Get no grad edge from grad_node: " << orig_node->name()
+                    << " : " << orig_node << " to:" << orig_next_node->name()
+                    << ", " << orig_next_node.get()
+                    << " with output rank info: "
+                    << orig_edge.GetEdgeRankInfo().first << ", "
+                    << orig_edge.GetEdgeRankInfo().second;
+            // Stop no grad var's preceding node
+            copied_node->MutableOutputMeta()[i][j].SetStopGradient(true);
+            copied_edge.Clear();
+            continue;
+          }
          if (!orig_next_node) continue;

          // Copy Next Node
@@ -638,6 +655,9 @@ std::vector<paddle::experimental::Tensor> RunBackward(
  }

  if (is_general_grad) {
+    // Get no_grad_vars's GradNodes and InputMeta Info
+    GeneralGrad::Instance().GetTargetNodesInfo(no_grad_vars,
+                                               true /* is_no_grad_vars */);
    // Copy Backward Graph
    GeneralGrad::Instance().ReconstructBackwardGraph(orig_queue);
  }
@@ -696,19 +716,6 @@ std::vector<paddle::experimental::Tensor> RunBackward(
                                                         node);
    }

-    // no_grad_vars
-    if (!no_grad_vars.empty() && is_general_grad) {
-      auto iter =
-          GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->find(node);
-      if (iter !=
-          GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->end()) {
-        VLOG(6) << "Change the input buffer[slot][rank] by Zeros";
-        auto rank_info = (iter->second)->OutRankInfo();
-        node_input_buffer->SetBufferSlotRankZeros(rank_info.first,
-                                                  rank_info.second);
-      }
-    }
-
    // Check input
    EnforceGradNodeHasInput(node);

@@ -750,7 +757,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
        // Since we make edge has as same rank as bwd outputs, we indexing them
        // with the same rank(i, j)
        auto next_node_shared = edge.GetMutableGradNode();
-        VLOG(3) << "Found pending node: " << next_node_shared->name();
+        VLOG(3) << "Found pending node: " << next_node_shared->name() << ": "
+                << next_node_shared.get();
        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
@@ -800,6 +808,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(

        // Update queue
        node_in_degree_map[next_node]--;
+        VLOG(6) << next_node->name()
+                << " ref_cnt is: " << node_in_degree_map[next_node];

        PADDLE_ENFORCE(
            node_in_degree_map[next_node] >= 0,

--- a/paddle/fluid/eager/grad_node_info.h
+++ b/paddle/fluid/eager/grad_node_info.h
@@ -106,6 +106,12 @@ class Edge {
    }
  }

+  // Detach this edge: drop the reference to the target grad node and reset
+  // the slot/rank indices back to 0.
+  void Clear() {
+    grad_node_.reset();
+    in_slot_id_ = 0;
+    in_rank_ = 0;
+  }
+
 private:
  size_t in_slot_id_;
  size_t in_rank_;

--- a/paddle/fluid/eager/grad_tensor_holder.cc
+++ b/paddle/fluid/eager/grad_tensor_holder.cc
@@ -24,6 +24,7 @@
 namespace egr {

 void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) {
+  // Replace the no-grad var's buffer entry with zeros of the same shape; the
+  // new tensor keeps the default stop_gradient value (true).
   buffer_[slot_id][rank] =
       paddle::experimental::zeros_like(buffer_[slot_id][rank]);
 }
@@ -59,8 +60,15 @@ void GradTensorHolder::CopyValueFromTensor(
    if ((!buffer_tensor.defined() || !buffer_tensor.initialized())) {
      // Perform deep copy here
      buffer_tensor.copy_(t, t.place(), false);
-      buffer_tensor.set_autograd_meta(t.mutable_autograd_meta());
-
+      auto* meta = egr::EagerUtils::autograd_meta(&buffer_tensor);
+      auto* origin_meta = egr::EagerUtils::nullable_autograd_meta(t);
+      if (origin_meta) {
+        auto grad_node = origin_meta->GetMutableGradNode();
+        if (grad_node && grad_node.get()) {
+          meta->SetGradNode(origin_meta->GetMutableGradNode());
+        }
+        meta->WeakGrad() = origin_meta->WeakGrad();
+      }
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal(
          "Cannot copy grad_tensors' value to grad tensor holders,"
@@ -81,10 +89,10 @@ void GradTensorHolder::CopyValueFromTensor(
            "Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR "
            "now."));
      }
-      egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank]))
-          ->SetStopGradient(false);
    }
  }
+  egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank]))
+      ->SetStopGradient(false);
 }

 void GradTensorHolder::add(size_t slot_id,

--- a/paddle/fluid/eager/tensor_wrapper.h
+++ b/paddle/fluid/eager/tensor_wrapper.h
@@ -28,6 +28,7 @@
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/utils.h"
+#include "paddle/phi/api/lib/utils/allocator.h"

 namespace egr {
 class TensorWrapper {
@@ -57,9 +58,12 @@ class TensorWrapper {
        // Only Copy Meta
        phi::DenseTensor* dense_tensor =
            static_cast<phi::DenseTensor*>(tensor.impl().get());
-        auto tw_dense_tensor = std::make_shared<phi::DenseTensor>();
-        tw_dense_tensor->set_meta(dense_tensor->meta());
-        intermidiate_tensor_.set_impl(tw_dense_tensor);
+        // TODO(jiabin): It's not a good idea to set memory size to zero, find
+        // another way and change this.
+        intermidiate_tensor_.set_impl(
+            std::move(std::make_shared<phi::DenseTensor>(
+                std::make_shared<phi::Allocation>(nullptr, 0, tensor.place()),
+                std::move(dense_tensor->meta()))));
      } else {
        PADDLE_THROW(paddle::platform::errors::Fatal(
            "Unrecognized tensor type for no_need_buffer feature"));

--- a/paddle/fluid/imperative/partial_grad_engine.cc
+++ b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -98,6 +98,7 @@ static void GetGraphInfoBetweenTargets(
    auto &grad_node = output_target->GradVarBase()->GradNode();
    if (visited.count(grad_node.get()) == 0) {
      for (auto &op : *grad_node) {
+        VLOG(10) << "Pushed op: " << op.Type();
        q.emplace(&op, grad_node.get());
      }
    }
@@ -141,6 +142,8 @@ static void GetGraphInfoBetweenTargets(
    for (auto &pending_node : node->GradPendingNodes()) {
      for (auto &pending_op : *pending_node) {
        preceding_ops[&pending_op].insert(op);
+        VLOG(10) << "Find preceding op of: " << pending_op.Type()
+                 << " is: " << op->Type();
      }
      if (visited.count(pending_node.get()) == 0) {
        visited.insert(pending_node.get());
@@ -175,6 +178,7 @@ static void GetGraphInfoBetweenTargets(
  std::queue<std::pair<OpBase * /*op*/, OpBase * /*pending op*/>> op_queue;
  std::unordered_set<std::pair<OpBase *, OpBase *>, HashPair> op_base_visited;
  for (auto &endpoint_op : endpoint_ops) {
+    VLOG(10) << "Emplaced endpoint op: " << endpoint_op->Type();
    op_queue.emplace(endpoint_op, nullptr);
    op_base_visited.emplace(endpoint_op, nullptr);
  }
@@ -186,14 +190,18 @@ static void GetGraphInfoBetweenTargets(

    op_queue.pop();

+    VLOG(10) << "Get op: " << op->Type();
+
    bool is_valid = false;
    for (auto &output_pair : op->GetOutsMap()) {
      if (!output_pair.second.IsGrad()) {
+        VLOG(10) << "Continueded output for : " << op->Type();
        continue;
      }

      for (auto &out_var : output_pair.second) {
        if (out_var && target_vars.count(out_var.get()) > 0) {
+          VLOG(10) << "Find target output for : " << op->Type();
          is_valid = true;
          break;
        }
@@ -211,11 +219,13 @@ static void GetGraphInfoBetweenTargets(
    is_valid = false;
    for (auto &input_pair : op->GetInsMap()) {
      if (!input_pair.second.IsGrad()) {
+        VLOG(10) << "Continueded input for : " << op->Type();
        continue;
      }

      for (auto &in_var : input_pair.second) {
        if (in_var && no_grad_var_grad.count(in_var.get()) == 0) {
+          VLOG(10) << "Find not no grad var in input for : " << op->Type();
          target_vars.insert(in_var.get());
          is_valid = true;
        }
@@ -240,7 +250,10 @@ static void GetGraphInfoBetweenTargets(
    auto iter = preceding_ops.find(op);
    if (iter != preceding_ops.end()) {
      for (auto &preceding_op : iter->second) {
+        VLOG(10) << "Scan preceding op: " << preceding_op->Type() << " for "
+                 << op->Type();
        if (op_base_visited.count(std::make_pair(preceding_op, op)) == 0) {
+          VLOG(10) << "Emplace op: " << preceding_op->Type();
          op_queue.emplace(preceding_op, op);
          op_base_visited.emplace(preceding_op, op);
        }
@@ -648,6 +661,7 @@ PartialGradTask::PartialGradTask(
                    platform::errors::Unimplemented(
                        "only_inputs=False is not supported yet"));

+  VLOG(10) << "no_grad_vars size: " << no_grad_vars.size();
  for (auto &var : no_grad_vars) {
    if (var && var->GradVarBase()) {
      no_grad_var_grad_.insert(var->GradVarBase()->SharedVar().get());
@@ -853,6 +867,7 @@ std::vector<std::shared_ptr<VarBase>> PartialGradTask::Run() {
    }

    for (auto &pending_op : iter->second) {
+      VLOG(10) << "Find pending op" << pending_op->Type();
      auto dep_iter = op_deps_.find(pending_op);
      PADDLE_ENFORCE_EQ(
          dep_iter != op_deps_.end(),
@@ -862,6 +877,7 @@ std::vector<std::shared_ptr<VarBase>> PartialGradTask::Run() {
      if (--(dep_iter->second) == 0) {
        q.push(pending_op);
      }
+      VLOG(10) << "Pending op deps: " << dep_iter->second;
    }
  }


--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -82,7 +82,7 @@ std::shared_ptr<NameVarMap<VarType>> PrepareData(
      auto& template_var = name_pair.second[i];
      SetForwardDataTypeOfGradVar(template_var);
      const auto* tensor = GetTensorFromVar(template_var->Var());
-      if (tensor && tensor->IsInitialized()) {
+      if (tensor && tensor->IsInitialized() && (tensor->memory_size() != 0)) {
        auto kernel_type_for_var = op.GetKernelTypeForVar(
            name_pair.first, *tensor, expected_kernel_key);
        if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
@@ -91,7 +91,8 @@ std::shared_ptr<NameVarMap<VarType>> PrepareData(
          VLOG(3) << "Transform Variable " << GetNameFromVar(template_var)
                  << " from " << kernel_type_for_var << " to "
                  << expected_kernel_key;
-
+          VLOG(3) << GetNameFromVar(template_var)
+                  << " memory size is: " << tensor->memory_size();
          if (CheckCachedKey(template_var, expected_kernel_key)) {
            VLOG(3) << "Hit variable_wrapper cache: key="
                    << expected_kernel_key;
@@ -634,7 +635,8 @@ void PreparePhiData(const phi::Kernel& phi_kernel,
    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
      auto& var = ins_vector[offset];
      const auto* tensor_in = GetTensorFromVar(var->Var());
-      if (tensor_in && tensor_in->IsInitialized()) {
+      if (tensor_in && tensor_in->IsInitialized() &&
+          (tensor_in->memory_size() != 0)) {
        if (in_def.backend == phi::Backend::ALL_BACKEND) {
          continue;
        }

--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -98,10 +98,11 @@ void EmptyTensorInitializer(TensorObject* self,
  }

  if (!autograd_meta->GetMutableGradNode()) {
-    VLOG(3) << "Tensor(" << name
-            << ") have not GradNode, add GradNodeAccumulation for it.";
    autograd_meta->SetGradNode(
        std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
+    VLOG(3) << "Tensor(" << name
+            << ") have not GradNode, add GradNodeAccumulation"
+            << autograd_meta->GradNode() << " for it.";
  }
 }


--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -95,6 +95,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) {
  EAGER_TRY
  VLOG(6) << "Get grad for tensor: " << self->tensor.name();
  auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor);
+  // meta may be null here (it is null-checked on the next line), so guard the
+  // dereference inside the log statement as well.
+  VLOG(6) << meta << " initialized: "
+          << (meta && meta->Grad().initialized());
  if (meta && meta->Grad().initialized()) {
    return ToPyObject(meta->Grad());
  } else {

--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -257,8 +257,8 @@ add_custom_command(
  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp}
          ${bw_api_source_file}
  COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}"
-  DEPENDS ${bw_api_yaml_file} ${legacy_bw_api_yaml_file} ${bw_api_gen_file}
-          ${api_gen_base}
+  DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
+          ${legacy_bw_api_yaml_file}
  VERBATIM)

 # generate sparse api

--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -133,6 +133,17 @@
    func : asinh_grad
  inplace : (out_grad -> x_grad)

+- backward_api : assign_double_grad
+  forward : assign_grad (Tensor grad_out) -> Tensor(grad_x)
+  args : (Tensor grad_x_grad)
+  output : Tensor(grad_out_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+  kernel :
+    func : assign
+  backward: assign_triple_grad
+  inplace : (grad_x_grad -> grad_out_grad)
+
 - backward_api : assign_grad
  forward : assign (Tensor x) -> Tensor(out)
  args : (Tensor out_grad)
@@ -141,6 +152,7 @@
    func : UnchangedInferMeta
  kernel :
    func : assign
+  backward: assign_double_grad
  inplace : (out_grad -> x_grad)

 - backward_api : assign_out__grad
@@ -153,6 +165,16 @@
    func : assign
  inplace : (out_grad -> x_grad)

+- backward_api : assign_triple_grad
+  forward : assign_double_grad (Tensor grad_out) -> Tensor(grad_x)
+  args : (Tensor grad_x_grad)
+  output : Tensor(grad_out_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+  kernel :
+    func : assign
+  inplace : (grad_x_grad -> grad_out_grad)
+
 - backward_api : atan_grad
  forward : atan (Tensor x) -> Tensor(out)
  args : (Tensor x, Tensor out_grad)
@@ -1823,6 +1845,16 @@
    func : sinh_grad
  inplace : (out_grad -> x_grad)

+- backward_api : slice_double_grad
+  forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input)
+  args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis)
+  output : Tensor(grad_out_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [grad_input_grad]
+  kernel :
+    func : slice
+
 - backward_api : slice_grad
  forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out)
  args : (Tensor input, Tensor out_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis)
@@ -1832,6 +1864,7 @@
    param : [input]
  kernel :
    func : slice_grad
+  backward : slice_double_grad
  no_need_buffer : input

 - backward_api : soft_shrink_grad