未验证 提交 c12f7d48 编写于 作者: Z zhangbo9674 提交者: GitHub

[AMP] Support amp for Intermediate_dygraph (#40623)

* approve amp for intermediate_dygraph

* add amp_utils for intermediate_dygraph

* add amp needcast check for mlu & npu

* test unittest

* add SetGradNode for set_stop_gradient && add checktensor for GradientHooks

* refine code

* refine unittest of imperative_amp for new dygraph

* inplace api skip amp

* add test_imperative_qat_amp for intermediate amp

* refine code

* refine test_amp ci strategy

* refine unittest code

* refine amp_utils code

* refine amp getpromotetype for some special op

* refine unittest code
上级 38d1fe34
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
namespace egr {
// Decides the promoted dtype for op `api_name` given the dtypes of all of its
// input tensors (`amp_tensors_vector`, one inner vector per input slot).
// Starts from `amp_dtype` (the low-precision target) and falls back to FP32
// when any relevant input is FP32, with op-specific exceptions below.
static inline paddle::experimental::DataType GetPromoteType(
    const std::string& api_name,
    const std::vector<std::vector<paddle::experimental::Tensor>>&
        amp_tensors_vector,
    const paddle::experimental::DataType& amp_dtype) {
  auto dst_type = amp_dtype;
  if (egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype() ==
      "float16") {
    if (api_name == "batch_norm" || api_name == "layer_norm" ||
        api_name == "sync_batch_norm") {
      // These ops only look at input slot 0 (X); their scale/bias stay FP32.
      if (amp_tensors_vector[0][0].dtype() ==
          paddle::experimental::DataType::FLOAT32) {
        dst_type = paddle::experimental::DataType::FLOAT32;
      }
    } else if (api_name == "fused_attention") {
      for (size_t i = 0; i < amp_tensors_vector.size(); i++) {
        // Skip the LayerNorm scale/bias input slots (3, 4, 9, 10): they are
        // kept in FP32 by design and must not force a promotion to FP32.
        // NOTE: this must be `&&`; `i != 3 || i != 4 || ...` is always true.
        if (i != 3 && i != 4 && i != 9 && i != 10) {
          if (amp_tensors_vector[i][0].dtype() ==
              paddle::experimental::DataType::FLOAT32) {
            dst_type = paddle::experimental::DataType::FLOAT32;
            break;
          }
        }
      }
    } else if (api_name == "fused_feedforward") {
      for (size_t i = 0; i < amp_tensors_vector.size(); i++) {
        // Skip the LayerNorm scale/bias input slots (7, 8, 9, 10); see the
        // fused_attention note above for why `&&` is required here.
        if (i != 7 && i != 8 && i != 9 && i != 10) {
          if (amp_tensors_vector[i][0].dtype() ==
              paddle::experimental::DataType::FLOAT32) {
            dst_type = paddle::experimental::DataType::FLOAT32;
            break;
          }
        }
      }
    } else {
      // Generic rule: any FP32 input promotes the whole op to FP32.
      for (const auto& tensors : amp_tensors_vector) {
        for (const auto& tensor : tensors) {
          if (tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
            dst_type = tensor.dtype();
            break;
          }
        }
      }
    }
  } else {
    // bfloat16 (and any non-float16) AMP dtype uses only the generic rule.
    for (const auto& tensors : amp_tensors_vector) {
      for (const auto& tensor : tensors) {
        if (tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
          dst_type = tensor.dtype();
          break;
        }
      }
    }
  }
  // NOTE(juncai): moving_average_abs_max_scale only considers the dtype of
  // input(X)
  if (api_name == "moving_average_abs_max_scale") {
    if (amp_tensors_vector[0][0].dtype() ==
        paddle::experimental::DataType::FLOAT16) {
      dst_type = paddle::experimental::DataType::FLOAT16;
    }
  }
  return dst_type;
}
// Returns the dtype op `api_name` should execute in under the current AMP
// configuration (tracer amp_dtype + controller AMP level):
//   - O1: allow-listed ops run in the low-precision dtype, block-listed ops
//     in FP32, everything else goes through GetPromoteType (with an extra
//     FP32 fallback for ops in the unsupported-fp16/bf16 list).
//   - O2: everything runs low-precision except unsupported/block-listed ops.
//   - Otherwise: FP32.
// Marked `inline`: this function is defined in a `#pragma once` header, so a
// plain external definition would violate the ODR and cause duplicate-symbol
// link errors once the header is included from more than one TU.
inline paddle::experimental::DataType GetAmpDestDtype(
    const std::string& api_name,
    const std::vector<std::vector<paddle::experimental::Tensor>>&
        amp_tensors_vector) {
  auto amp_dtype =
      egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype();
  auto amp_level = egr::Controller::Instance().GetAMPLevel();
  VLOG(6) << "AMP GetAmpDestDtype:"
          << " op(" << api_name << ") amp_dtype(" << amp_dtype << ") amp_level("
          << static_cast<int>(amp_level) << ").";
  if (amp_dtype == "float16") {
    if (amp_level == paddle::imperative::AmpLevel::O1) {
      if (paddle::imperative::AmpOperators::Instance()
              .GetMutableAllowOps()
              ->count(api_name)) {
        return paddle::experimental::DataType::FLOAT16;
      } else if (paddle::imperative::AmpOperators::Instance()
                     .GetMutableBlockOps()
                     ->count(api_name)) {
        return paddle::experimental::DataType::FLOAT32;
      } else {
        auto dst_type = GetPromoteType(api_name, amp_tensors_vector,
                                       paddle::experimental::DataType::FLOAT16);
        // Even if promotion chose FP16, ops without an FP16 kernel fall back.
        if (dst_type == paddle::experimental::DataType::FLOAT16 &&
            paddle::imperative::AmpOperators::Instance()
                .GetMutableUnsupportedFp16Ops()
                ->count(api_name)) {
          dst_type = paddle::experimental::DataType::FLOAT32;
        }
        return dst_type;
      }
    } else if (amp_level == paddle::imperative::AmpLevel::O2) {
      auto dst_type = paddle::experimental::DataType::FLOAT16;
      if (paddle::imperative::AmpOperators::Instance()
              .GetMutableUnsupportedFp16Ops()
              ->count(api_name) ||
          paddle::imperative::AmpOperators::Instance()
              .GetMutableBlockOps()
              ->count(api_name)) {
        dst_type = paddle::experimental::DataType::FLOAT32;
      }
      return dst_type;
    }
  } else if (amp_dtype == "bfloat16") {
    if (amp_level == paddle::imperative::AmpLevel::O1) {
      if (paddle::imperative::AmpOperators::Instance()
              .GetMutableAllowOps()
              ->count(api_name)) {
        return paddle::experimental::DataType::BFLOAT16;
      } else if (paddle::imperative::AmpOperators::Instance()
                     .GetMutableBlockOps()
                     ->count(api_name)) {
        return paddle::experimental::DataType::FLOAT32;
      } else {
        auto dst_type =
            GetPromoteType(api_name, amp_tensors_vector,
                           paddle::experimental::DataType::BFLOAT16);
        // Same fallback as FP16: no BF16 kernel means run the op in FP32.
        if (dst_type == paddle::experimental::DataType::BFLOAT16 &&
            paddle::imperative::AmpOperators::Instance()
                .GetMutableUnsupportedBf16Ops()
                ->count(api_name)) {
          dst_type = paddle::experimental::DataType::FLOAT32;
        }
        return dst_type;
      }
    } else if (amp_level == paddle::imperative::AmpLevel::O2) {
      auto dst_type = paddle::experimental::DataType::BFLOAT16;
      if (paddle::imperative::AmpOperators::Instance()
              .GetMutableUnsupportedBf16Ops()
              ->count(api_name) ||
          paddle::imperative::AmpOperators::Instance()
              .GetMutableBlockOps()
              ->count(api_name)) {
        dst_type = paddle::experimental::DataType::FLOAT32;
      }
      return dst_type;
    }
  }
  return paddle::experimental::DataType::FLOAT32;
}
// True when `tensor` should be cast to `dst_dtype` for AMP: it lives on a
// device with AMP kernel support and holds a float dtype different from the
// destination dtype.
static inline bool NeedCast(const paddle::experimental::Tensor& tensor,
                            const paddle::experimental::DataType& dst_dtype) {
  const auto place = tensor.inner_place();
  // CUDAPinnedPlace is included for varbases created by the dataloader.
  const bool amp_capable_place =
      paddle::platform::is_gpu_place(place) ||
      paddle::platform::is_cuda_pinned_place(place) ||
      paddle::platform::is_xpu_place(place) ||
      paddle::platform::is_mlu_place(place) ||
      paddle::platform::is_npu_place(place) ||
      paddle::platform::is_npu_pinned_place(place);
  if (!amp_capable_place) {
    return false;
  }
  const auto src_dtype = tensor.dtype();
  const bool is_float_dtype =
      src_dtype == paddle::experimental::DataType::FLOAT32 ||
      src_dtype == paddle::experimental::DataType::FLOAT16 ||
      src_dtype == paddle::experimental::DataType::BFLOAT16;
  return is_float_dtype && src_dtype != dst_dtype;
}
// Casts every tensor in a duplicable input slot (`inputs_name`) to
// `dst_dtype`, returning a new vector; tensors that do not need casting are
// passed through unchanged. `api_name` is currently unused here but kept for
// signature symmetry with AmpAutoCast.
// Marked `inline`: defined in a `#pragma once` header, so a plain external
// definition would violate the ODR across translation units.
inline std::vector<paddle::experimental::Tensor> AmpAutoCasts(
    const std::string& inputs_name,
    const std::vector<paddle::experimental::Tensor>& inputs,
    const paddle::experimental::DataType& dst_dtype, std::string api_name) {
  VLOG(6) << "AMP AmpAutoCasts:"
          << " inputs(" << inputs_name << ") dst_dtype("
          << paddle::framework::DataType2String(dst_dtype) << ").";
  std::vector<paddle::experimental::Tensor> inputs_casted;
  inputs_casted.reserve(inputs.size());
  for (auto& input : inputs) {
    if (NeedCast(input, dst_dtype)) {
      paddle::framework::AttributeMap cast_attrs = {
          {"in_dtype", paddle::framework::TransToProtoVarType(input.dtype())},
          {"out_dtype", paddle::framework::TransToProtoVarType(dst_dtype)}};
      // cast_dygraph_function returns an rvalue; no std::move needed.
      inputs_casted.emplace_back(cast_dygraph_function(input, cast_attrs));
    } else {
      inputs_casted.emplace_back(input);
    }
  }
  return inputs_casted;
}
// Casts a single input tensor to `dst_dtype` when needed. Several op/input
// combinations are exempt from FP16 casting: run_program inputs, non-X inputs
// of the norm ops, and the LayerNorm scale/bias inputs of the fused ops.
// Marked `inline`: defined in a `#pragma once` header, so a plain external
// definition would violate the ODR across translation units.
inline paddle::experimental::Tensor AmpAutoCast(
    const std::string& input_name, const paddle::experimental::Tensor& input,
    const paddle::experimental::DataType& dst_dtype, std::string api_name) {
  VLOG(6) << "AMP AmpAutoCasts:"
          << " input(" << input_name << ") dst_dtype("
          << paddle::framework::DataType2String(dst_dtype) << ").";
  if (dst_dtype == paddle::experimental::DataType::FLOAT16) {
    // run_program manages its own sub-program dtypes; never cast its inputs.
    if (api_name == "run_program") {
      return input;
    }
    // Norm ops cast only their "X" input; scale/bias/etc. stay as-is.
    if ((api_name == "batch_norm" || api_name == "layer_norm" ||
         api_name == "sync_batch_norm") &&
        input_name != "X") {
      return input;
    }
    // Fused ops keep their LayerNorm scale/bias inputs in FP32.
    if ((api_name == "fused_attention" || api_name == "fused_feedforward")) {
      if (input_name == "LnScale" || input_name == "LnBias" ||
          input_name == "Ln2Scale" || input_name == "Ln2Bias" ||
          input_name == "Ln1Scale" || input_name == "Ln1Bias") {
        return input;
      }
    }
  }
  if (NeedCast(input, dst_dtype)) {
    paddle::framework::AttributeMap cast_attrs = {
        {"in_dtype", paddle::framework::TransToProtoVarType(input.dtype())},
        {"out_dtype", paddle::framework::TransToProtoVarType(dst_dtype)}};
    return cast_dygraph_function(input, cast_attrs);
  }
  return input;
}
} // namespace egr
......@@ -1379,6 +1379,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
paddle::string::Sprintf(FORWARD_FUNCTION_TEMPLATE, op_type);
std::string dygraph_function_args_str = "";
std::string amp_function_call_args_str = "";
core_ops_args_info[op_type] = {};
core_ops_args_type_info[op_type] = {};
core_ops_args_info[op_type].resize(in_vars.size());
......@@ -1391,6 +1392,9 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
// [Generation] Get Ins Map
std::string ins_contents_str = "";
std::vector<std::string> input_args_str_list(in_vars.size());
std::vector<std::string> amp_function_call_args_str_list(in_vars.size());
std::string amp_tensors_vector_str = "";
std::string amp_auto_cast_str = "";
for (const proto::OpProto::Var& input : in_vars) {
const std::string& input_name = input.name();
size_t input_position = fwd_inputs_name_pos_map.at(input_name);
......@@ -1400,6 +1404,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
"const std::vector<paddle::experimental::Tensor>& %s";
input_args_str_list[input_position] =
paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name);
amp_function_call_args_str_list[input_position] = " NEW_" + input_name;
core_ops_args_type_info[op_type][input_position] = "list";
} else {
......@@ -1420,6 +1425,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
}
input_args_str_list[input_position] =
paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name);
amp_function_call_args_str_list[input_position] = " NEW_" + input_name;
core_ops_args_type_info[op_type][input_position] = "tensor";
}
......@@ -1431,10 +1437,31 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
"{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },";
ins_contents_str += paddle::string::Sprintf(FWD_INS_CONTENT_TEMPLATE,
input_name, input_name);
if (input.duplicable()) {
const char* AMP_TENSORS_VECTOR_TEMPLATE = "%s,";
amp_tensors_vector_str +=
paddle::string::Sprintf(AMP_TENSORS_VECTOR_TEMPLATE, input_name);
const char* AMP_AUTO_CAST_TEMPLATE =
" auto NEW_%s = egr::AmpAutoCasts(\"%s\", %s, amp_dst_dtype, "
"\"%s\");\n";
amp_auto_cast_str += paddle::string::Sprintf(
AMP_AUTO_CAST_TEMPLATE, input_name, input_name, input_name, op_type);
} else {
const char* AMP_TENSORS_VECTOR_TEMPLATE = "{%s},";
amp_tensors_vector_str +=
paddle::string::Sprintf(AMP_TENSORS_VECTOR_TEMPLATE, input_name);
const char* AMP_AUTO_CAST_TEMPLATE =
" auto NEW_%s = egr::AmpAutoCast(\"%s\", %s, amp_dst_dtype, "
"\"%s\");\n";
amp_auto_cast_str += paddle::string::Sprintf(
AMP_AUTO_CAST_TEMPLATE, input_name, input_name, input_name, op_type);
}
}
if (ins_contents_str.size() > 0)
ins_contents_str.pop_back(); // // Remove trailing ","
if (amp_tensors_vector_str.size() > 0) amp_tensors_vector_str.pop_back();
for (const std::string& arg : input_args_str_list) {
dygraph_function_args_str += arg;
dygraph_function_args_str += ",";
......@@ -1442,16 +1469,17 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
if (dygraph_function_args_str.size() > 0)
dygraph_function_args_str.pop_back();
const char* FWD_INS_MAP_TEMPLATE =
" std::map<std::string, "
"std::vector<std::shared_ptr<egr::EagerVariable>>> ins = { "
"%s };\n";
std::string ins_map_str =
paddle::string::Sprintf(FWD_INS_MAP_TEMPLATE, ins_contents_str);
generated_function_body += ins_map_str;
generated_function_body += "\n";
for (const std::string& arg : amp_function_call_args_str_list) {
amp_function_call_args_str += arg;
amp_function_call_args_str += ",";
}
if (amp_function_call_args_str.size() > 0)
amp_function_call_args_str.pop_back();
// Handle Dispensable Inputs
std::string dispensable_ins_contents_str = "";
std::string dispensable_amp_tensors_vector_str = "";
std::string dispensable_amp_auto_cast_str = "";
std::set<std::string> input_names;
for (const proto::OpProto::Var& input : in_vars) {
const std::string& input_name = input.name();
......@@ -1461,14 +1489,36 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
const char* FWD_INS_CONTENT_TEMPLATE =
" if(%s.size() > 0) "
"ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n";
generated_function_body += paddle::string::Sprintf(
dispensable_ins_contents_str += paddle::string::Sprintf(
FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name);
const char* FWD_AMP_TENSORS_VECTOR_TEMPLATE =
" if(%s.size() > 0) "
"amp_tensors_vector.push_back(%s);\n";
dispensable_amp_tensors_vector_str += paddle::string::Sprintf(
FWD_AMP_TENSORS_VECTOR_TEMPLATE, input_name, input_name);
const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE =
" auto NEW_%s = ((%s.size() > 0) ? egr::AmpAutoCasts(\"%s\", "
"%s, amp_dst_dtype, \"%s\") : %s);\n";
dispensable_amp_auto_cast_str += paddle::string::Sprintf(
DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name,
input_name, input_name, op_type, input_name);
} else {
const char* FWD_INS_CONTENT_TEMPLATE =
" if(%s.initialized()) "
"ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n";
generated_function_body += paddle::string::Sprintf(
dispensable_ins_contents_str += paddle::string::Sprintf(
FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name);
const char* FWD_AMP_TENSORS_VECTOR_TEMPLATE =
" if(%s.initialized()) "
"amp_tensors_vector.push_back({ %s });\n";
dispensable_amp_tensors_vector_str += paddle::string::Sprintf(
FWD_AMP_TENSORS_VECTOR_TEMPLATE, input_name, input_name);
const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE =
" auto NEW_%s = ((%s.initialized()) ? egr::AmpAutoCast(\"%s\", "
"%s, amp_dst_dtype, \"%s\") : %s);\n";
dispensable_amp_auto_cast_str += paddle::string::Sprintf(
DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name,
input_name, input_name, op_type, input_name);
}
}
}
......@@ -1493,6 +1543,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
std::string arg_str =
paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name);
dygraph_function_args_str += arg_str;
amp_function_call_args_str += (", " + output_var_name);
core_ops_args_type_info[op_type].push_back("list");
} else {
......@@ -1500,6 +1551,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
std::string arg_str =
paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name);
dygraph_function_args_str += arg_str;
amp_function_call_args_str += (", " + output_var_name);
core_ops_args_type_info[op_type].push_back("tensor");
}
......@@ -1544,6 +1596,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
std::string arg_str =
paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, outnum);
dygraph_function_args_str += arg_str;
amp_function_call_args_str += (", " + outnum);
const char* FWD_OUTS_CONTENT_TEMPLATE =
"{ \"%s\", egr::EagerUtils::CreateVars(%s) },";
outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE,
......@@ -1565,6 +1618,69 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
if (inplace_mapping_str.size() > 0)
inplace_mapping_str.pop_back(); // Remove trailing ","
if ((op_type != "cast") && (inplace_map.empty())) {
VLOG(6) << "Generating Dygraph Forward AMP";
const char* AMP_LOGIC_CONTEXT =
" if (egr::Controller::Instance().GetAMPLevel() != "
"paddle::imperative::AmpLevel::O0) {\n"
" VLOG(5) << \"Check and Prepare For AMP\";\n"
" \n"
"%s\n"
" }\n";
std::string amp_logic_str = "";
if (in_vars.size() != 0) {
const char* AMP_TENSORS_VECTOR_TEMPLATE =
" std::vector<std::vector<paddle::experimental::Tensor>> "
"amp_tensors_vector = { "
"%s };\n";
std::string amp_tensors_vector = paddle::string::Sprintf(
AMP_TENSORS_VECTOR_TEMPLATE, amp_tensors_vector_str);
amp_tensors_vector += dispensable_amp_tensors_vector_str;
amp_logic_str += amp_tensors_vector;
amp_logic_str += "\n";
const char* GET_AMP_GET_DST_DTYPE_CONTEXT =
" auto amp_dst_dtype = "
"egr::GetAmpDestDtype(\"%s\", "
"amp_tensors_vector);\n";
amp_logic_str +=
paddle::string::Sprintf(GET_AMP_GET_DST_DTYPE_CONTEXT, op_type);
amp_logic_str += "\n";
amp_logic_str += amp_auto_cast_str;
amp_logic_str += dispensable_amp_auto_cast_str;
amp_logic_str += "\n";
}
const char* CALL_BACK_TEMPLATE =
" {\n"
" paddle::imperative::AutoCastGuard "
"guard(egr::Controller::Instance().GetCurrentTracer(), "
"paddle::imperative::AmpLevel::O0);\n"
" return %s_dygraph_function(%s);\n"
" }";
amp_function_call_args_str += ", attr_map ";
if (amp_function_call_args_str.size() > 0) {
auto iter = amp_function_call_args_str.begin();
if ((*iter) == ',') amp_function_call_args_str.erase(iter);
}
std::string call_back_str = paddle::string::Sprintf(
CALL_BACK_TEMPLATE, op_type, amp_function_call_args_str);
amp_logic_str += call_back_str;
amp_logic_str += "\n";
std::string amp_context =
paddle::string::Sprintf(AMP_LOGIC_CONTEXT, amp_logic_str);
generated_function_body += amp_context;
generated_function_body += "\n";
}
// forward ins insert
const char* FWD_INS_MAP_TEMPLATE =
" std::map<std::string, "
"std::vector<std::shared_ptr<egr::EagerVariable>>> ins = { "
"%s };\n";
std::string ins_map_str =
paddle::string::Sprintf(FWD_INS_MAP_TEMPLATE, ins_contents_str);
ins_map_str += dispensable_ins_contents_str;
generated_function_body += ins_map_str;
generated_function_body += "\n";
// forward outs insert
const char* FWD_OUTS_MAP_TEMPLATE =
" std::map<std::string, "
"std::vector<std::shared_ptr<egr::EagerVariable>>> outs = { "
......@@ -2044,6 +2160,7 @@ static std::string GenerateSingleOpBase(
grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name,
attrs_name, attrs_name);
}
// Handle dynamic grad attributes
grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name);
generated_grad_function_body += grad_attrs_str;
......@@ -2469,6 +2586,7 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
"#include "
"\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n"
"#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
"#include \"paddle/fluid/eager/amp_utils.h\"\n"
"#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
std::string forward_cc_include_str =
paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
......
......@@ -364,6 +364,7 @@ GradNodeBase::ApplyGradientHooks(
if (!outs[i][j].defined() || !outs[i][j].initialized()) {
outs[i][j] = tensors[i][j];
}
CheckTensor(tensors[i][j], outs[i][j]);
}
}
......
......@@ -279,4 +279,29 @@ class Edge {
std::shared_ptr<GradNodeBase> grad_node_{nullptr};
};
// Validates that a gradient hook did not corrupt a tensor: the hook output
// `post` must be consistent with the hook input `pre` in initialization
// state, dtype, and place. Throws PermissionDenied on any mismatch.
inline void CheckTensor(const paddle::experimental::Tensor& pre,
                        const paddle::experimental::Tensor& post) {
  // A hook must not materialize a tensor that was uninitialized before it
  // ran. (post uninitialized while pre initialized is tolerated here —
  // presumably handled by the caller; TODO confirm.)
  if (!pre.initialized() && post.initialized()) {
    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
        "The tensor in before and after hook are not consistent"));
  }
  if (pre.initialized() && post.initialized()) {
    VLOG(4) << paddle::framework::DataType2String(pre.dtype()) << " "
            << paddle::framework::DataType2String(post.dtype());
    // Both dtype and place must survive the hook unchanged.
    PADDLE_ENFORCE_EQ(
        pre.dtype(), post.dtype(),
        paddle::platform::errors::PermissionDenied(
            "The dtype of tensor before(%s) and after(%s) hook are not "
            "consistent",
            paddle::framework::DataType2String(pre.dtype()),
            paddle::framework::DataType2String(post.dtype())));
    PADDLE_ENFORCE_EQ(
        pre.inner_place(), post.inner_place(),
        paddle::platform::errors::PermissionDenied(
            "The place of tensor before(%s) and after(%s) "
            "hook are not consistent",
            pre.inner_place().DebugString(), post.inner_place().DebugString()));
  }
}
} // namespace egr
......@@ -145,6 +145,8 @@ DataType String2DataType(const std::string& str) {
return DataType::COMPLEX64;
} else if (str == "complex128") {
return DataType::COMPLEX128;
} else if (str == "bfloat16") {
return DataType::BFLOAT16;
} else {
return DataType::UNDEFINED;
}
......@@ -174,6 +176,8 @@ std::string DataType2String(DataType dtype) {
return "complex64";
case DataType::COMPLEX128:
return "complex128";
case DataType::BFLOAT16:
return "bfloat16";
default:
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unknow phi::DataType, the int value = %d.",
......
......@@ -112,6 +112,9 @@ int tensor_properties_set_stop_gradient(TensorObject* self, PyObject* value,
EAGER_TRY
auto meta = egr::EagerUtils::autograd_meta(&self->tensor);
meta->SetStopGradient(CastPyArg2AttrBoolean(value, 0));
if (!meta->GradNode()) {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
}
return 0;
EAGER_CATCH_AND_THROW_RETURN_ZERO
}
......
......@@ -172,7 +172,7 @@ class TestImperativeQatAmp(unittest.TestCase):
acc_top1 = sum(acc_top1_list) / len(acc_top1_list)
return acc_top1
def test_ptq(self):
def ptq(self):
start_time = time.time()
self.set_vars()
......@@ -217,6 +217,11 @@ class TestImperativeQatAmp(unittest.TestCase):
end_time = time.time()
print("total time: %ss" % (end_time - start_time))
def test_ptq(self):
self.ptq()
with _test_eager_guard():
self.ptq()
if __name__ == '__main__':
unittest.main()
......@@ -21,6 +21,7 @@ from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_p
import paddle.nn as nn
from paddle.static import InputSpec
from paddle.autograd import PyLayer
from paddle.fluid.framework import _test_eager_guard, _in_eager_mode, in_dygraph_mode
if fluid.core.is_compiled_with_cuda():
fluid.set_flags({"FLAGS_cudnn_deterministic": True})
......@@ -51,7 +52,7 @@ class SimpleConv(fluid.dygraph.Layer):
class TestAutoCast(unittest.TestCase):
def test_amp_guard_white_op(self):
def amp_guard_white_op(self):
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
......@@ -66,7 +67,12 @@ class TestAutoCast(unittest.TestCase):
self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
def test_amp_guard_black_op(self):
def test_amp_guard_white_op(self):
with _test_eager_guard():
self.amp_guard_white_op()
self.amp_guard_white_op()
def amp_guard_black_op(self):
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
data = fluid.dygraph.to_variable(data)
......@@ -76,7 +82,12 @@ class TestAutoCast(unittest.TestCase):
self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
def test_custom_op_list(self):
def test_amp_guard_black_op(self):
with _test_eager_guard():
self.amp_guard_black_op()
self.amp_guard_black_op()
def custom_op_list(self):
with fluid.dygraph.guard():
tracer = fluid.framework._dygraph_tracer()
base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST
......@@ -107,7 +118,12 @@ class TestAutoCast(unittest.TestCase):
set(black_list) ==
(set(base_black_list) - {"log"}) | {"conv2d"})
def test_custom_op_list_exception(self):
def test_custom_op_list(self):
with _test_eager_guard():
self.custom_op_list()
self.custom_op_list()
def custom_op_list_exception(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
def func():
......@@ -118,7 +134,6 @@ class TestAutoCast(unittest.TestCase):
filter_size=7,
stride=2,
act='relu')
with fluid.dygraph.amp_guard(
custom_white_list=["conv2d"],
custom_black_list=["conv2d"]):
......@@ -127,7 +142,12 @@ class TestAutoCast(unittest.TestCase):
self.assertRaises(ValueError, func)
def test_amp_guard_upsupported_fp16_op(self):
def test_custom_op_list_exception(self):
with _test_eager_guard():
self.custom_op_list_exception()
self.custom_op_list_exception()
def amp_guard_upsupported_fp16_op(self):
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
......@@ -143,7 +163,6 @@ class TestAutoCast(unittest.TestCase):
out_purefp16_fp32 = paddle.expand_as(
out_purefp16_fp16,
out_purefp16_fp16) # expand_as_v2 has no fp16 kernel
self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
self.assertTrue(out_amp_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
self.assertTrue(out_amp_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
......@@ -152,7 +171,12 @@ class TestAutoCast(unittest.TestCase):
self.assertTrue(
out_purefp16_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
def test_mode_exception(self):
def test_amp_guard_upsupported_fp16_op(self):
with _test_eager_guard():
self.amp_guard_upsupported_fp16_op()
self.amp_guard_upsupported_fp16_op()
def mode_exception(self):
def func():
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
......@@ -164,9 +188,14 @@ class TestAutoCast(unittest.TestCase):
self.assertRaises(ValueError, func)
def test_mode_exception(self):
with _test_eager_guard():
self.mode_exception()
self.mode_exception()
class TestAmpScaler(unittest.TestCase):
def test_scale(self):
def scale(self):
with fluid.dygraph.guard():
data = paddle.rand([10, 1024])
scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024)
......@@ -174,7 +203,12 @@ class TestAmpScaler(unittest.TestCase):
self.assertEqual(
np.array_equal(scaled_data.numpy(), data.numpy() * 1024), True)
def test_minimize(self):
def test_scale(self):
with _test_eager_guard():
self.scale()
self.scale()
def minimize(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
def run_simple_conv(inp_np, use_scaler=True):
......@@ -223,7 +257,12 @@ class TestAmpScaler(unittest.TestCase):
np.allclose(outs_with_scaler[1][i][0].numpy(),
outs_no_scaler[1][i][0].numpy()), True)
def test_step(self):
def test_minimize(self):
with _test_eager_guard():
self.minimize()
self.minimize()
def step(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
def run_simple_conv(inp_np, use_scaler=True):
......@@ -264,7 +303,12 @@ class TestAmpScaler(unittest.TestCase):
np.allclose(outs_with_scaler[i].numpy(),
outs_no_scaler[i].numpy()), True)
def test_nan_inf(self):
def test_step(self):
with _test_eager_guard():
self.step()
self.step()
def nan_inf(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
inp_np[0][1][2][3] = np.nan
with fluid.dygraph.guard():
......@@ -294,7 +338,12 @@ class TestAmpScaler(unittest.TestCase):
self.assertTrue(
np.array_equal(param.numpy(), params_init[param.name]))
def test_step_update_exception(self):
def test_nan_inf(self):
with _test_eager_guard():
self.nan_inf()
self.nan_inf()
def step_update_exception(self):
def func1():
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
......@@ -340,6 +389,11 @@ class TestAmpScaler(unittest.TestCase):
self.assertRaises(RuntimeError, func3)
def test_step_update_exception(self):
with _test_eager_guard():
self.step_update_exception()
self.step_update_exception()
def test_get_and_set(self):
with fluid.dygraph.guard():
scaler = paddle.amp.GradScaler(
......@@ -504,6 +558,7 @@ class TestGradScalerStateDict(unittest.TestCase):
return dy_out, dy_param_value, dy_grad_value
def test_with_state_dict(self):
def func_isinstance():
with fluid.dygraph.guard():
out_use_state_dict = self.train_resnet(
enable_amp=True, use_data_loader=True, use_save_load=True)
......@@ -513,6 +568,10 @@ class TestGradScalerStateDict(unittest.TestCase):
self.assertTrue(
np.allclose(out_use_state_dict[0], out_no_state_dict[0]))
with _test_eager_guard():
func_isinstance()
func_isinstance()
class TestAmpDecorator(unittest.TestCase):
def test_mode_exception(self):
......@@ -765,17 +824,23 @@ class TestPureFp16SaveLoad(unittest.TestCase):
return dy_out, dy_param_value, dy_grad_value
def test_with_save_load(self):
def func_isinstance():
with fluid.dygraph.guard():
out_use_save_load = self.train_resnet(
enable_amp=True, use_data_loader=True, use_save_load=True)
out_no_save_load = self.train_resnet(
enable_amp=True, use_data_loader=True, use_save_load=False)
print('save_load:', out_use_save_load[0], out_no_save_load[0])
self.assertTrue(np.allclose(out_use_save_load[0], out_no_save_load[0]))
self.assertTrue(
np.allclose(out_use_save_load[0], out_no_save_load[0]))
with _test_eager_guard():
func_isinstance()
func_isinstance()
class TestPureFp16InferenceSaveLoad(unittest.TestCase):
def test_inference_save_load(self):
def inference_save_load(self):
BATCH_SIZE = 16
BATCH_NUM = 4
EPOCH_NUM = 4
......@@ -861,7 +926,15 @@ class TestPureFp16InferenceSaveLoad(unittest.TestCase):
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
print("pred.numpy()", pred.numpy())
print("results", results)
self.assertTrue(np.allclose(pred.numpy(), results, atol=1.e-5))
paddle.disable_static()
def test_inference_save_load(self):
self.inference_save_load()
with _test_eager_guard():
self.inference_save_load()
class TestResnet2(unittest.TestCase):
......@@ -987,28 +1060,47 @@ class TestResnet2(unittest.TestCase):
return dy_out, dy_param_value, dy_grad_value
def test_resnet(self):
def func_isinstance():
with fluid.dygraph.guard():
out_fp32 = self.train_resnet(enable_amp=False)
out_amp = self.train_resnet(enable_amp=True)
out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2')
print(out_fp32[0], out_amp[0], out_pure_fp16[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
self.assertTrue(
np.allclose(
out_fp32[0], out_pure_fp16[0], atol=1.e-2))
with _test_eager_guard():
func_isinstance()
func_isinstance()
def test_with_data_loader(self):
def func_isinstance():
with fluid.dygraph.guard():
out_fp32 = self.train_resnet(enable_amp=False, use_data_loader=True)
out_amp = self.train_resnet(enable_amp=True, use_data_loader=True)
out_fp32 = self.train_resnet(
enable_amp=False, use_data_loader=True)
out_amp = self.train_resnet(
enable_amp=True, use_data_loader=True)
out_pure_fp16 = self.train_resnet(
enable_amp=True, use_data_loader=True, level='O2')
print(out_fp32[0], out_amp[0], out_pure_fp16[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
self.assertTrue(
np.allclose(
out_fp32[0], out_pure_fp16[0], atol=1.e-2))
with _test_eager_guard():
func_isinstance()
func_isinstance()
def test_param_group(self):
def func_isinstance():
with fluid.dygraph.guard():
out_fp32 = self.train_resnet(
enable_amp=False, use_data_loader=True, use_param_group=True)
enable_amp=False,
use_data_loader=True,
use_param_group=True)
out_amp = self.train_resnet(
enable_amp=True, use_data_loader=True, use_param_group=True)
out_pure_fp16 = self.train_resnet(
......@@ -1018,7 +1110,13 @@ class TestResnet2(unittest.TestCase):
level='O2')
print(out_fp32[0], out_amp[0], out_pure_fp16[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
self.assertTrue(
np.allclose(
out_fp32[0], out_pure_fp16[0], atol=1.e-2))
with _test_eager_guard():
func_isinstance()
func_isinstance()
class TestResnet(unittest.TestCase):
......@@ -1102,12 +1200,19 @@ class TestResnet(unittest.TestCase):
return dy_out, dy_param_value, dy_grad_value
def test_resnet(self):
def func_isinstance():
out_fp32 = self.train_resnet(enable_amp=False)
out_amp = self.train_resnet(enable_amp=True)
out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2')
print(out_fp32[0], out_amp[0], out_pure_fp16[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2))
self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1))
self.assertTrue(
np.allclose(
out_fp32[0], out_pure_fp16[0], atol=1.e-1))
with _test_eager_guard():
func_isinstance()
func_isinstance()
class TestLayerNormFp16(unittest.TestCase):
......@@ -1116,6 +1221,7 @@ class TestLayerNormFp16(unittest.TestCase):
'''
def test_layer_norm_fp16(self):
def func_isinstance():
if fluid.is_compiled_with_cuda():
with fluid.dygraph.guard(fluid.CUDAPlace(0)):
x = paddle.rand([2, 2, 2, 3])
......@@ -1123,7 +1229,12 @@ class TestLayerNormFp16(unittest.TestCase):
with paddle.amp.auto_cast(custom_white_list=['layer_norm']):
out = layer_norm(x)
self.assertTrue(out.dtype == fluid.core.VarDesc.VarType.FP16)
self.assertTrue(
out.dtype == fluid.core.VarDesc.VarType.FP16)
with _test_eager_guard():
func_isinstance()
func_isinstance()
class TestBf16(unittest.TestCase):
......@@ -1142,6 +1253,7 @@ class TestBf16(unittest.TestCase):
return output.numpy()
def test_bf16(self):
def func_isinstance():
if fluid.core.is_compiled_with_cuda():
cudnn_version = paddle.device.get_cudnn_version()
if cudnn_version is not None and cudnn_version >= 8100:
......@@ -1155,6 +1267,10 @@ class TestBf16(unittest.TestCase):
np.allclose(
out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1))
with _test_eager_guard():
func_isinstance()
func_isinstance()
class TestAmpWithPyLyer(unittest.TestCase):
def test_pylayer(self):
......@@ -1176,7 +1292,7 @@ class TestAmpWithPyLyer(unittest.TestCase):
x.stop_gradient = False
y.stop_gradient = False
with paddle.amp.auto_cast():
# with paddle.amp.auto_cast():
res = MyMM.apply(x, y)
loss = paddle.mean(res)
loss.backward()
......@@ -1184,6 +1300,7 @@ class TestAmpWithPyLyer(unittest.TestCase):
class TestAmpWithHook(unittest.TestCase):
def test_hook_change_dtype(self):
def func_isinstance():
with paddle.fluid.dygraph.guard():
v = paddle.rand([3, 3])
v.stop_gradient = False
......@@ -1200,7 +1317,12 @@ class TestAmpWithHook(unittest.TestCase):
loss = a.sum()
self.assertRaises(RuntimeError, loss.backward)
with _test_eager_guard():
func_isinstance()
func_isinstance()
def test_hook_change_place(self):
def func_isinstance():
with paddle.fluid.dygraph.guard():
v = paddle.rand([3, 3])
v.stop_gradient = False
......@@ -1215,6 +1337,10 @@ class TestAmpWithHook(unittest.TestCase):
loss = a.sum()
self.assertRaises(RuntimeError, loss.backward)
with _test_eager_guard():
func_isinstance()
func_isinstance()
if __name__ == '__main__':
unittest.main()
......@@ -997,17 +997,17 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [
'test_parallel_executor_transformer',
'test_tensor_scalar_type_promotion_dynamic',
'test_eager_deletion_delete_vars', 'test_asp_pruning_1d',
'test_imperative_auto_mixed_precision', 'test_imperative_using_non_zero_gpu',
'test_machine_translation', 'test_flatten_op', 'test_onnx_export',
'test_optimizer_for_varbase', 'test_fusion_transpose_flatten_concat_op',
'best_fit_allocator_test', 'test_ir_fusion_group_pass',
'test_trt_quant_conv2d_dequant_fuse_pass', 'test_allclose_op',
'test_ftrl_op', 'test_elementwise_add_op', 'test_instance_norm_op',
'test_lambv2_op', 'test_yolo_box_op', 'test_parallel_executor_drop_scope',
'test_generator_dataloader', 'test_conv2d_transpose_op_depthwise_conv',
'test_imperative_save_load_v2', 'test_lookahead',
'test_moving_average_abs_max_scale_op', 'test_roi_perspective_transform_op',
'test_tensorrt_engine', 'test_affine_grid_function', 'test_nonzero_api',
'test_imperative_using_non_zero_gpu', 'test_machine_translation',
'test_flatten_op', 'test_onnx_export', 'test_optimizer_for_varbase',
'test_fusion_transpose_flatten_concat_op', 'best_fit_allocator_test',
'test_ir_fusion_group_pass', 'test_trt_quant_conv2d_dequant_fuse_pass',
'test_allclose_op', 'test_ftrl_op', 'test_elementwise_add_op',
'test_instance_norm_op', 'test_lambv2_op', 'test_yolo_box_op',
'test_parallel_executor_drop_scope', 'test_generator_dataloader',
'test_conv2d_transpose_op_depthwise_conv', 'test_imperative_save_load_v2',
'test_lookahead', 'test_moving_average_abs_max_scale_op',
'test_roi_perspective_transform_op', 'test_tensorrt_engine',
'test_affine_grid_function', 'test_nonzero_api',
'test_ir_memory_optimize_pass', 'test_reduce_mkldnn_op',
'test_bilinear_interp_op', 'test_cvm_op', 'test_scale_op', 'test_matmul_op',
'test_sequence_pool', 'test_complex_simplenet', 'test_complex_reshape',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册