From c12f7d48acac5c393ac8a45461a5ed7bcc649d5f Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Thu, 24 Mar 2022 19:25:02 +0800
Subject: [PATCH] [AMP] Support amp for Intermediate_dygraph (#40623)

* support amp for intermediate_dygraph
* add amp_utils for intermediate_dygraph
* add amp needcast check for mlu & npu
* test unittest
* add SetGradNode for set_stop_gradient && add checktensor for GradientHooks
* refine code
* refine unittest of imperative_amp for new dygraph
* inplace api skip amp
* add test_imperative_qat_amp for intermediate amp
* refine code
* refine test_amp ci strategy
* refine unittest code
* refine amp_utils code
* refine amp getpromotetype for some special op
* refine unittest code
---
 paddle/fluid/eager/amp_utils.h                | 245 ++++++++++++
 .../auto_code_generator/eager_generator.cc    | 138 ++++++-
 paddle/fluid/eager/grad_node_info.cc          |   1 +
 paddle/fluid/eager/grad_node_info.h           |  25 ++
 paddle/fluid/framework/convert_utils.cc       |   4 +
 paddle/fluid/pybind/eager_properties.cc       |   3 +
 .../slim/tests/test_imperative_qat_amp.py     |   7 +-
 .../test_imperative_auto_mixed_precision.py   | 352 ++++++++++++------
 tools/parallel_UT_rule.py                     |  22 +-
 9 files changed, 662 insertions(+), 135 deletions(-)
 create mode 100644 paddle/fluid/eager/amp_utils.h

diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h
new file mode 100644
index 00000000000..229af41a38a
--- /dev/null
+++ b/paddle/fluid/eager/amp_utils.h
@@ -0,0 +1,245 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/imperative/amp_auto_cast.h"
+
+namespace egr {
+
+static inline paddle::experimental::DataType GetPromoteType(
+    const std::string& api_name,
+    const std::vector<std::vector<paddle::experimental::Tensor>>&
+        amp_tensors_vector,
+    const paddle::experimental::DataType& amp_dtype) {
+  auto dst_type = amp_dtype;
+  if (egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype() ==
+      "float16") {
+    if (api_name == "batch_norm" || api_name == "layer_norm" ||
+        api_name == "sync_batch_norm") {
+      if (amp_tensors_vector[0][0].dtype() ==
+          paddle::experimental::DataType::FLOAT32) {
+        dst_type = paddle::experimental::DataType::FLOAT32;
+      }
+    } else if (api_name == "fused_attention") {
+      for (size_t i = 0; i < amp_tensors_vector.size(); i++) {
+        if (i != 3 && i != 4 && i != 9 && i != 10) {
+          if (amp_tensors_vector[i][0].dtype() ==
+              paddle::experimental::DataType::FLOAT32) {
+            dst_type = paddle::experimental::DataType::FLOAT32;
+            break;
+          }
+        }
+      }
+    } else if (api_name == "fused_feedforward") {
+      for (size_t i = 0; i < amp_tensors_vector.size(); i++) {
+        if (i != 7 && i != 8 && i != 9 && i != 10) {
+          if (amp_tensors_vector[i][0].dtype() ==
+              paddle::experimental::DataType::FLOAT32) {
+            dst_type = paddle::experimental::DataType::FLOAT32;
+            break;
+          }
+        }
+      }
+    } else {
+      for (const auto& tensors : amp_tensors_vector) {
+        for (const auto& tensor : tensors) {
+          if (tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
+            dst_type = tensor.dtype();
+            break;
+          }
+        }
+      }
+    }
+  } else {
+    for (const auto& tensors : amp_tensors_vector) {
+      for (const auto& tensor : tensors) {
+        if (tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
+          dst_type = tensor.dtype();
+          break;
+        }
+      }
+    }
+  }
+  // NOTE(juncai): moving_average_abs_max_scale only considers the dtype of
+  // input(X)
+  if (api_name == "moving_average_abs_max_scale") {
+    if (amp_tensors_vector[0][0].dtype() ==
+        paddle::experimental::DataType::FLOAT16) {
+      dst_type = paddle::experimental::DataType::FLOAT16;
+    }
+  }
+  return dst_type;
+}
+
+paddle::experimental::DataType GetAmpDestDtype(
+    const std::string& api_name,
+    const std::vector<std::vector<paddle::experimental::Tensor>>&
+        amp_tensors_vector) {
+  auto amp_dtype =
+      egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype();
+  auto amp_level = egr::Controller::Instance().GetAMPLevel();
+  VLOG(6) << "AMP GetAmpDestDtype:"
+          << " op(" << api_name << ") amp_dtype(" << amp_dtype << ") amp_level("
+          << static_cast<int>(amp_level) << ").";
+  if (amp_dtype == "float16") {
+    if (amp_level == paddle::imperative::AmpLevel::O1) {
+      if (paddle::imperative::AmpOperators::Instance()
+              .GetMutableAllowOps()
+              ->count(api_name)) {
+        return paddle::experimental::DataType::FLOAT16;
+      } else if (paddle::imperative::AmpOperators::Instance()
+                     .GetMutableBlockOps()
+                     ->count(api_name)) {
+        return paddle::experimental::DataType::FLOAT32;
+      } else {
+        auto dst_type = GetPromoteType(api_name, amp_tensors_vector,
+                                       paddle::experimental::DataType::FLOAT16);
+        if (dst_type == paddle::experimental::DataType::FLOAT16 &&
+            paddle::imperative::AmpOperators::Instance()
+                .GetMutableUnsupportedFp16Ops()
+                ->count(api_name)) {
+          dst_type = paddle::experimental::DataType::FLOAT32;
+        }
+        return dst_type;
+      }
+    } else if (amp_level == paddle::imperative::AmpLevel::O2) {
+      auto dst_type = paddle::experimental::DataType::FLOAT16;
+      if
(paddle::imperative::AmpOperators::Instance() + .GetMutableUnsupportedFp16Ops() + ->count(api_name) || + paddle::imperative::AmpOperators::Instance() + .GetMutableBlockOps() + ->count(api_name)) { + dst_type = paddle::experimental::DataType::FLOAT32; + } + return dst_type; + } + } else if (amp_dtype == "bfloat16") { + if (amp_level == paddle::imperative::AmpLevel::O1) { + if (paddle::imperative::AmpOperators::Instance() + .GetMutableAllowOps() + ->count(api_name)) { + return paddle::experimental::DataType::BFLOAT16; + } else if (paddle::imperative::AmpOperators::Instance() + .GetMutableBlockOps() + ->count(api_name)) { + return paddle::experimental::DataType::FLOAT32; + } else { + auto dst_type = + GetPromoteType(api_name, amp_tensors_vector, + paddle::experimental::DataType::BFLOAT16); + if (dst_type == paddle::experimental::DataType::BFLOAT16 && + paddle::imperative::AmpOperators::Instance() + .GetMutableUnsupportedBf16Ops() + ->count(api_name)) { + dst_type = paddle::experimental::DataType::FLOAT32; + } + return dst_type; + } + } else if (amp_level == paddle::imperative::AmpLevel::O2) { + auto dst_type = paddle::experimental::DataType::BFLOAT16; + if (paddle::imperative::AmpOperators::Instance() + .GetMutableUnsupportedBf16Ops() + ->count(api_name) || + paddle::imperative::AmpOperators::Instance() + .GetMutableBlockOps() + ->count(api_name)) { + dst_type = paddle::experimental::DataType::FLOAT32; + } + return dst_type; + } + } + return paddle::experimental::DataType::FLOAT32; +} + +static inline bool NeedCast(const paddle::experimental::Tensor& tensor, + const paddle::experimental::DataType& dst_dtype) { + auto place = tensor.inner_place(); + auto data_type = tensor.dtype(); + if (paddle::platform::is_gpu_place(place) || + paddle::platform::is_cuda_pinned_place(place) || + paddle::platform::is_xpu_place(place) || + paddle::platform::is_mlu_place(place) || + paddle::platform::is_npu_place(place) || + paddle::platform::is_npu_pinned_place(place)) { + // CudaPinndePlace is added for varbase created by dataloader + if ((data_type == paddle::experimental::DataType::FLOAT32 || + data_type == paddle::experimental::DataType::FLOAT16 || + data_type == paddle::experimental::DataType::BFLOAT16) && + (data_type != dst_dtype)) { + return true; + } + } + return false; +} + +std::vector AmpAutoCasts( + const std::string& inputs_name, + const std::vector& inputs, + const paddle::experimental::DataType& dst_dtype, std::string api_name) { + VLOG(6) << "AMP AmpAutoCasts:" + << " inputs(" << inputs_name << ") dst_dtype(" + << paddle::framework::DataType2String(dst_dtype) << ")."; + std::vector inputs_casted; + for (auto& input : inputs) { + if (NeedCast(input, dst_dtype)) { + paddle::framework::AttributeMap cast_attrs = { + {"in_dtype", paddle::framework::TransToProtoVarType(input.dtype())}, + {"out_dtype", paddle::framework::TransToProtoVarType(dst_dtype)}}; + inputs_casted.emplace_back( + std::move(cast_dygraph_function(input, cast_attrs))); + } else { + inputs_casted.emplace_back(input); + } + } + return inputs_casted; +} + +paddle::experimental::Tensor AmpAutoCast( + const std::string& input_name, const paddle::experimental::Tensor& input, + const paddle::experimental::DataType& dst_dtype, std::string api_name) { + VLOG(6) << "AMP AmpAutoCasts:" + << " input(" << input_name << ") dst_dtype(" + << paddle::framework::DataType2String(dst_dtype) << ")."; + if (dst_dtype == paddle::experimental::DataType::FLOAT16) { + if (api_name == "run_program") { + return input; + } + if ((api_name == "batch_norm" || 
api_name == "layer_norm" || + api_name == "sync_batch_norm") && + input_name != "X") { + return input; + } + if ((api_name == "fused_attention" || api_name == "fused_feedforward")) { + if (input_name == "LnScale" || input_name == "LnBias" || + input_name == "Ln2Scale" || input_name == "Ln2Bias" || + input_name == "Ln1Scale" || input_name == "Ln1Bias") { + return input; + } + } + } + if (NeedCast(input, dst_dtype)) { + paddle::framework::AttributeMap cast_attrs = { + {"in_dtype", paddle::framework::TransToProtoVarType(input.dtype())}, + {"out_dtype", paddle::framework::TransToProtoVarType(dst_dtype)}}; + return cast_dygraph_function(input, cast_attrs); + } + return input; +} +} // namespace egr diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 22981759642..fc3ea234929 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1379,6 +1379,7 @@ static std::pair GenerateForwardFunctionContents( paddle::string::Sprintf(FORWARD_FUNCTION_TEMPLATE, op_type); std::string dygraph_function_args_str = ""; + std::string amp_function_call_args_str = ""; core_ops_args_info[op_type] = {}; core_ops_args_type_info[op_type] = {}; core_ops_args_info[op_type].resize(in_vars.size()); @@ -1391,6 +1392,9 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Ins Map std::string ins_contents_str = ""; std::vector input_args_str_list(in_vars.size()); + std::vector amp_function_call_args_str_list(in_vars.size()); + std::string amp_tensors_vector_str = ""; + std::string amp_auto_cast_str = ""; for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); size_t input_position = fwd_inputs_name_pos_map.at(input_name); @@ -1400,6 +1404,7 @@ static std::pair GenerateForwardFunctionContents( "const std::vector& %s"; input_args_str_list[input_position] = paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); + amp_function_call_args_str_list[input_position] = " NEW_" + input_name; core_ops_args_type_info[op_type][input_position] = "list"; } else { @@ -1420,6 +1425,7 @@ static std::pair GenerateForwardFunctionContents( } input_args_str_list[input_position] = paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); + amp_function_call_args_str_list[input_position] = " NEW_" + input_name; core_ops_args_type_info[op_type][input_position] = "tensor"; } @@ -1431,10 +1437,31 @@ static std::pair GenerateForwardFunctionContents( "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; ins_contents_str += paddle::string::Sprintf(FWD_INS_CONTENT_TEMPLATE, input_name, input_name); + if (input.duplicable()) { + const char* AMP_TENSORS_VECTOR_TEMPLATE = "%s,"; + amp_tensors_vector_str += + paddle::string::Sprintf(AMP_TENSORS_VECTOR_TEMPLATE, input_name); + const char* AMP_AUTO_CAST_TEMPLATE = + " auto NEW_%s = egr::AmpAutoCasts(\"%s\", %s, amp_dst_dtype, " + "\"%s\");\n"; + amp_auto_cast_str += paddle::string::Sprintf( + AMP_AUTO_CAST_TEMPLATE, input_name, input_name, input_name, op_type); + } else { + const char* AMP_TENSORS_VECTOR_TEMPLATE = "{%s},"; + amp_tensors_vector_str += + paddle::string::Sprintf(AMP_TENSORS_VECTOR_TEMPLATE, input_name); + const char* AMP_AUTO_CAST_TEMPLATE = + " auto NEW_%s = egr::AmpAutoCast(\"%s\", %s, amp_dst_dtype, " + "\"%s\");\n"; + amp_auto_cast_str += paddle::string::Sprintf( + AMP_AUTO_CAST_TEMPLATE, input_name, input_name, input_name, op_type); + } } if (ins_contents_str.size() > 0) 
ins_contents_str.pop_back(); // // Remove trailing "," + if (amp_tensors_vector_str.size() > 0) amp_tensors_vector_str.pop_back(); + for (const std::string& arg : input_args_str_list) { dygraph_function_args_str += arg; dygraph_function_args_str += ","; @@ -1442,16 +1469,17 @@ static std::pair GenerateForwardFunctionContents( if (dygraph_function_args_str.size() > 0) dygraph_function_args_str.pop_back(); - const char* FWD_INS_MAP_TEMPLATE = - " std::map>> ins = { " - "%s };\n"; - std::string ins_map_str = - paddle::string::Sprintf(FWD_INS_MAP_TEMPLATE, ins_contents_str); - generated_function_body += ins_map_str; - generated_function_body += "\n"; + for (const std::string& arg : amp_function_call_args_str_list) { + amp_function_call_args_str += arg; + amp_function_call_args_str += ","; + } + if (amp_function_call_args_str.size() > 0) + amp_function_call_args_str.pop_back(); // Handle Dispensable Inputs + std::string dispensable_ins_contents_str = ""; + std::string dispensable_amp_tensors_vector_str = ""; + std::string dispensable_amp_auto_cast_str = ""; std::set input_names; for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); @@ -1461,14 +1489,36 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.size() > 0) " "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; - generated_function_body += paddle::string::Sprintf( + dispensable_ins_contents_str += paddle::string::Sprintf( FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); + const char* FWD_AMP_TENSORS_VECTOR_TEMPLATE = + " if(%s.size() > 0) " + "amp_tensors_vector.push_back(%s);\n"; + dispensable_amp_tensors_vector_str += paddle::string::Sprintf( + FWD_AMP_TENSORS_VECTOR_TEMPLATE, input_name, input_name); + const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE = + " auto NEW_%s = ((%s.size() > 0) ? egr::AmpAutoCasts(\"%s\", " + "%s, amp_dst_dtype, \"%s\") : %s);\n"; + dispensable_amp_auto_cast_str += paddle::string::Sprintf( + DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name, + input_name, input_name, op_type, input_name); } else { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.initialized()) " "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; - generated_function_body += paddle::string::Sprintf( + dispensable_ins_contents_str += paddle::string::Sprintf( FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); + const char* FWD_AMP_TENSORS_VECTOR_TEMPLATE = + " if(%s.initialized()) " + "amp_tensors_vector.push_back({ %s });\n"; + dispensable_amp_tensors_vector_str += paddle::string::Sprintf( + FWD_AMP_TENSORS_VECTOR_TEMPLATE, input_name, input_name); + const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE = + " auto NEW_%s = ((%s.initialized()) ? 
egr::AmpAutoCast(\"%s\", " + "%s, amp_dst_dtype, \"%s\") : %s);\n"; + dispensable_amp_auto_cast_str += paddle::string::Sprintf( + DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name, + input_name, input_name, op_type, input_name); } } } @@ -1493,6 +1543,7 @@ static std::pair GenerateForwardFunctionContents( std::string arg_str = paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); dygraph_function_args_str += arg_str; + amp_function_call_args_str += (", " + output_var_name); core_ops_args_type_info[op_type].push_back("list"); } else { @@ -1500,6 +1551,7 @@ static std::pair GenerateForwardFunctionContents( std::string arg_str = paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); dygraph_function_args_str += arg_str; + amp_function_call_args_str += (", " + output_var_name); core_ops_args_type_info[op_type].push_back("tensor"); } @@ -1544,6 +1596,7 @@ static std::pair GenerateForwardFunctionContents( std::string arg_str = paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, outnum); dygraph_function_args_str += arg_str; + amp_function_call_args_str += (", " + outnum); const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::CreateVars(%s) },"; outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, @@ -1565,6 +1618,69 @@ static std::pair GenerateForwardFunctionContents( if (inplace_mapping_str.size() > 0) inplace_mapping_str.pop_back(); // Remove trailing "," + if ((op_type != "cast") && (inplace_map.empty())) { + VLOG(6) << "Generating Dygraph Forward AMP"; + const char* AMP_LOGIC_CONTEXT = + " if (egr::Controller::Instance().GetAMPLevel() != " + "paddle::imperative::AmpLevel::O0) {\n" + " VLOG(5) << \"Check and Prepare For AMP\";\n" + " \n" + "%s\n" + " }\n"; + std::string amp_logic_str = ""; + if (in_vars.size() != 0) { + const char* AMP_TENSORS_VECTOR_TEMPLATE = + " std::vector> " + "amp_tensors_vector = { " + "%s };\n"; + std::string amp_tensors_vector = paddle::string::Sprintf( + AMP_TENSORS_VECTOR_TEMPLATE, amp_tensors_vector_str); + amp_tensors_vector += dispensable_amp_tensors_vector_str; + amp_logic_str += amp_tensors_vector; + amp_logic_str += "\n"; + const char* GET_AMP_GET_DST_DTYPE_CONTEXT = + " auto amp_dst_dtype = " + "egr::GetAmpDestDtype(\"%s\", " + "amp_tensors_vector);\n"; + amp_logic_str += + paddle::string::Sprintf(GET_AMP_GET_DST_DTYPE_CONTEXT, op_type); + amp_logic_str += "\n"; + amp_logic_str += amp_auto_cast_str; + amp_logic_str += dispensable_amp_auto_cast_str; + amp_logic_str += "\n"; + } + const char* CALL_BACK_TEMPLATE = + " {\n" + " paddle::imperative::AutoCastGuard " + "guard(egr::Controller::Instance().GetCurrentTracer(), " + "paddle::imperative::AmpLevel::O0);\n" + " return %s_dygraph_function(%s);\n" + " }"; + amp_function_call_args_str += ", attr_map "; + if (amp_function_call_args_str.size() > 0) { + auto iter = amp_function_call_args_str.begin(); + if ((*iter) == ',') amp_function_call_args_str.erase(iter); + } + std::string call_back_str = paddle::string::Sprintf( + CALL_BACK_TEMPLATE, op_type, amp_function_call_args_str); + amp_logic_str += call_back_str; + amp_logic_str += "\n"; + std::string amp_context = + paddle::string::Sprintf(AMP_LOGIC_CONTEXT, amp_logic_str); + generated_function_body += amp_context; + generated_function_body += "\n"; + } + // forward ins insert + const char* FWD_INS_MAP_TEMPLATE = + " std::map>> ins = { " + "%s };\n"; + std::string ins_map_str = + paddle::string::Sprintf(FWD_INS_MAP_TEMPLATE, ins_contents_str); + ins_map_str += dispensable_ins_contents_str; + 
generated_function_body += ins_map_str; + generated_function_body += "\n"; + // forward outs insert const char* FWD_OUTS_MAP_TEMPLATE = " std::map>> outs = { " @@ -2044,6 +2160,7 @@ static std::string GenerateSingleOpBase( grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name, attrs_name, attrs_name); } + // Handle dynamic grad attributes grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name); generated_grad_function_body += grad_attrs_str; @@ -2469,6 +2586,7 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path, "#include " "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n" "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" + "#include \"paddle/fluid/eager/amp_utils.h\"\n" "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n"; std::string forward_cc_include_str = paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 25610a3f95f..5f3dfe8e513 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -364,6 +364,7 @@ GradNodeBase::ApplyGradientHooks( if (!outs[i][j].defined() || !outs[i][j].initialized()) { outs[i][j] = tensors[i][j]; } + CheckTensor(tensors[i][j], outs[i][j]); } } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 4dec1c1f9f4..81470f38cc3 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -279,4 +279,29 @@ class Edge { std::shared_ptr grad_node_{nullptr}; }; +inline void CheckTensor(const paddle::experimental::Tensor& pre, + const paddle::experimental::Tensor& post) { + if (!pre.initialized() && post.initialized()) { + PADDLE_THROW(paddle::platform::errors::PermissionDenied( + "The tensor in before and after hook are not consistent")); + } + if (pre.initialized() && post.initialized()) { + VLOG(4) << paddle::framework::DataType2String(pre.dtype()) << " " + << paddle::framework::DataType2String(post.dtype()); + PADDLE_ENFORCE_EQ( + pre.dtype(), post.dtype(), + paddle::platform::errors::PermissionDenied( + "The dtype of tensor before(%s) and after(%s) hook are not " + "consistent", + paddle::framework::DataType2String(pre.dtype()), + paddle::framework::DataType2String(post.dtype()))); + PADDLE_ENFORCE_EQ( + pre.inner_place(), post.inner_place(), + paddle::platform::errors::PermissionDenied( + "The place of tensor before(%s) and after(%s) " + "hook are not consistent", + pre.inner_place().DebugString(), post.inner_place().DebugString())); + } +} + } // namespace egr diff --git a/paddle/fluid/framework/convert_utils.cc b/paddle/fluid/framework/convert_utils.cc index df5cc6d8204..1144bc51509 100644 --- a/paddle/fluid/framework/convert_utils.cc +++ b/paddle/fluid/framework/convert_utils.cc @@ -145,6 +145,8 @@ DataType String2DataType(const std::string& str) { return DataType::COMPLEX64; } else if (str == "complex128") { return DataType::COMPLEX128; + } else if (str == "bfloat16") { + return DataType::BFLOAT16; } else { return DataType::UNDEFINED; } @@ -174,6 +176,8 @@ std::string DataType2String(DataType dtype) { return "complex64"; case DataType::COMPLEX128: return "complex128"; + case DataType::BFLOAT16: + return "bfloat16"; default: PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Unknow phi::DataType, the int value = %d.", diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index a610c31ee89..d8c297b1a94 100644 
--- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -112,6 +112,9 @@ int tensor_properties_set_stop_gradient(TensorObject* self, PyObject* value, EAGER_TRY auto meta = egr::EagerUtils::autograd_meta(&self->tensor); meta->SetStopGradient(CastPyArg2AttrBoolean(value, 0)); + if (!meta->GradNode()) { + meta->SetGradNode(std::make_shared(meta)); + } return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py index 2dcf7a6f168..76a6e11d98d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py @@ -172,7 +172,7 @@ class TestImperativeQatAmp(unittest.TestCase): acc_top1 = sum(acc_top1_list) / len(acc_top1_list) return acc_top1 - def test_ptq(self): + def ptq(self): start_time = time.time() self.set_vars() @@ -217,6 +217,11 @@ class TestImperativeQatAmp(unittest.TestCase): end_time = time.time() print("total time: %ss" % (end_time - start_time)) + def test_ptq(self): + self.ptq() + with _test_eager_guard(): + self.ptq() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 2011a35db68..aa1b10db441 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -21,6 +21,7 @@ from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_p import paddle.nn as nn from paddle.static import InputSpec from paddle.autograd import PyLayer +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode, in_dygraph_mode if fluid.core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) @@ -51,7 +52,7 @@ class SimpleConv(fluid.dygraph.Layer): class TestAutoCast(unittest.TestCase): - def test_amp_guard_white_op(self): + def amp_guard_white_op(self): data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') with fluid.dygraph.guard(): conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) @@ -66,7 +67,12 @@ class TestAutoCast(unittest.TestCase): self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16) self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) - def test_amp_guard_black_op(self): + def test_amp_guard_white_op(self): + with _test_eager_guard(): + self.amp_guard_white_op() + self.amp_guard_white_op() + + def amp_guard_black_op(self): data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') with fluid.dygraph.guard(): data = fluid.dygraph.to_variable(data) @@ -76,7 +82,12 @@ class TestAutoCast(unittest.TestCase): self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) - def test_custom_op_list(self): + def test_amp_guard_black_op(self): + with _test_eager_guard(): + self.amp_guard_black_op() + self.amp_guard_black_op() + + def custom_op_list(self): with fluid.dygraph.guard(): tracer = fluid.framework._dygraph_tracer() base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST @@ -107,7 +118,12 @@ class TestAutoCast(unittest.TestCase): set(black_list) == (set(base_black_list) - {"log"}) | {"conv2d"}) - def test_custom_op_list_exception(self): + def test_custom_op_list(self): + with 
_test_eager_guard(): + self.custom_op_list() + self.custom_op_list() + + def custom_op_list_exception(self): inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) def func(): @@ -118,7 +134,6 @@ class TestAutoCast(unittest.TestCase): filter_size=7, stride=2, act='relu') - with fluid.dygraph.amp_guard( custom_white_list=["conv2d"], custom_black_list=["conv2d"]): @@ -127,7 +142,12 @@ class TestAutoCast(unittest.TestCase): self.assertRaises(ValueError, func) - def test_amp_guard_upsupported_fp16_op(self): + def test_custom_op_list_exception(self): + with _test_eager_guard(): + self.custom_op_list_exception() + self.custom_op_list_exception() + + def amp_guard_upsupported_fp16_op(self): data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') with fluid.dygraph.guard(): conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) @@ -143,7 +163,6 @@ class TestAutoCast(unittest.TestCase): out_purefp16_fp32 = paddle.expand_as( out_purefp16_fp16, out_purefp16_fp16) # expand_as_v2 has no fp16 kernel - self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) self.assertTrue(out_amp_fp16.dtype == fluid.core.VarDesc.VarType.FP16) self.assertTrue(out_amp_fp32.dtype == fluid.core.VarDesc.VarType.FP32) @@ -152,7 +171,12 @@ class TestAutoCast(unittest.TestCase): self.assertTrue( out_purefp16_fp32.dtype == fluid.core.VarDesc.VarType.FP32) - def test_mode_exception(self): + def test_amp_guard_upsupported_fp16_op(self): + with _test_eager_guard(): + self.amp_guard_upsupported_fp16_op() + self.amp_guard_upsupported_fp16_op() + + def mode_exception(self): def func(): data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') with fluid.dygraph.guard(): @@ -164,9 +188,14 @@ class TestAutoCast(unittest.TestCase): self.assertRaises(ValueError, func) + def test_mode_exception(self): + with _test_eager_guard(): + self.mode_exception() + self.mode_exception() + class TestAmpScaler(unittest.TestCase): - def test_scale(self): + def scale(self): with fluid.dygraph.guard(): data = paddle.rand([10, 1024]) scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024) @@ -174,7 +203,12 @@ class TestAmpScaler(unittest.TestCase): self.assertEqual( np.array_equal(scaled_data.numpy(), data.numpy() * 1024), True) - def test_minimize(self): + def test_scale(self): + with _test_eager_guard(): + self.scale() + self.scale() + + def minimize(self): inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) def run_simple_conv(inp_np, use_scaler=True): @@ -223,7 +257,12 @@ class TestAmpScaler(unittest.TestCase): np.allclose(outs_with_scaler[1][i][0].numpy(), outs_no_scaler[1][i][0].numpy()), True) - def test_step(self): + def test_minimize(self): + with _test_eager_guard(): + self.minimize() + self.minimize() + + def step(self): inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) def run_simple_conv(inp_np, use_scaler=True): @@ -264,7 +303,12 @@ class TestAmpScaler(unittest.TestCase): np.allclose(outs_with_scaler[i].numpy(), outs_no_scaler[i].numpy()), True) - def test_nan_inf(self): + def test_step(self): + with _test_eager_guard(): + self.step() + self.step() + + def nan_inf(self): inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) inp_np[0][1][2][3] = np.nan with fluid.dygraph.guard(): @@ -294,7 +338,12 @@ class TestAmpScaler(unittest.TestCase): self.assertTrue( np.array_equal(param.numpy(), params_init[param.name])) - def test_step_update_exception(self): + def test_nan_inf(self): + with _test_eager_guard(): + self.nan_inf() + self.nan_inf() + 
+ def step_update_exception(self): def func1(): model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) optimizer = paddle.optimizer.SGD(learning_rate=0.01, @@ -340,6 +389,11 @@ class TestAmpScaler(unittest.TestCase): self.assertRaises(RuntimeError, func3) + def test_step_update_exception(self): + with _test_eager_guard(): + self.step_update_exception() + self.step_update_exception() + def test_get_and_set(self): with fluid.dygraph.guard(): scaler = paddle.amp.GradScaler( @@ -504,14 +558,19 @@ class TestGradScalerStateDict(unittest.TestCase): return dy_out, dy_param_value, dy_grad_value def test_with_state_dict(self): - with fluid.dygraph.guard(): - out_use_state_dict = self.train_resnet( - enable_amp=True, use_data_loader=True, use_save_load=True) - out_no_state_dict = self.train_resnet( - enable_amp=True, use_data_loader=True, use_save_load=False) - print('save_load:', out_use_state_dict[0], out_no_state_dict[0]) - self.assertTrue( - np.allclose(out_use_state_dict[0], out_no_state_dict[0])) + def func_isinstance(): + with fluid.dygraph.guard(): + out_use_state_dict = self.train_resnet( + enable_amp=True, use_data_loader=True, use_save_load=True) + out_no_state_dict = self.train_resnet( + enable_amp=True, use_data_loader=True, use_save_load=False) + print('save_load:', out_use_state_dict[0], out_no_state_dict[0]) + self.assertTrue( + np.allclose(out_use_state_dict[0], out_no_state_dict[0])) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() class TestAmpDecorator(unittest.TestCase): @@ -765,17 +824,23 @@ class TestPureFp16SaveLoad(unittest.TestCase): return dy_out, dy_param_value, dy_grad_value def test_with_save_load(self): - with fluid.dygraph.guard(): - out_use_save_load = self.train_resnet( - enable_amp=True, use_data_loader=True, use_save_load=True) - out_no_save_load = self.train_resnet( - enable_amp=True, use_data_loader=True, use_save_load=False) - print('save_load:', out_use_save_load[0], out_no_save_load[0]) - self.assertTrue(np.allclose(out_use_save_load[0], out_no_save_load[0])) + def func_isinstance(): + with fluid.dygraph.guard(): + out_use_save_load = self.train_resnet( + enable_amp=True, use_data_loader=True, use_save_load=True) + out_no_save_load = self.train_resnet( + enable_amp=True, use_data_loader=True, use_save_load=False) + print('save_load:', out_use_save_load[0], out_no_save_load[0]) + self.assertTrue( + np.allclose(out_use_save_load[0], out_no_save_load[0])) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() class TestPureFp16InferenceSaveLoad(unittest.TestCase): - def test_inference_save_load(self): + def inference_save_load(self): BATCH_SIZE = 16 BATCH_NUM = 4 EPOCH_NUM = 4 @@ -861,7 +926,15 @@ class TestPureFp16InferenceSaveLoad(unittest.TestCase): results = exe.run(inference_program, feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) + print("pred.numpy()", pred.numpy()) + print("results", results) self.assertTrue(np.allclose(pred.numpy(), results, atol=1.e-5)) + paddle.disable_static() + + def test_inference_save_load(self): + self.inference_save_load() + with _test_eager_guard(): + self.inference_save_load() class TestResnet2(unittest.TestCase): @@ -987,38 +1060,63 @@ class TestResnet2(unittest.TestCase): return dy_out, dy_param_value, dy_grad_value def test_resnet(self): - with fluid.dygraph.guard(): - out_fp32 = self.train_resnet(enable_amp=False) - out_amp = self.train_resnet(enable_amp=True) - out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2') - print(out_fp32[0], out_amp[0], out_pure_fp16[0]) 
- self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) - self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + def func_isinstance(): + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet(enable_amp=False) + out_amp = self.train_resnet(enable_amp=True) + out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + self.assertTrue( + np.allclose( + out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() def test_with_data_loader(self): - with fluid.dygraph.guard(): - out_fp32 = self.train_resnet(enable_amp=False, use_data_loader=True) - out_amp = self.train_resnet(enable_amp=True, use_data_loader=True) - out_pure_fp16 = self.train_resnet( - enable_amp=True, use_data_loader=True, level='O2') - print(out_fp32[0], out_amp[0], out_pure_fp16[0]) - self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) - self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + def func_isinstance(): + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet( + enable_amp=False, use_data_loader=True) + out_amp = self.train_resnet( + enable_amp=True, use_data_loader=True) + out_pure_fp16 = self.train_resnet( + enable_amp=True, use_data_loader=True, level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + self.assertTrue( + np.allclose( + out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() def test_param_group(self): - with fluid.dygraph.guard(): - out_fp32 = self.train_resnet( - enable_amp=False, use_data_loader=True, use_param_group=True) - out_amp = self.train_resnet( - enable_amp=True, use_data_loader=True, use_param_group=True) - out_pure_fp16 = self.train_resnet( - enable_amp=True, - use_data_loader=True, - use_param_group=True, - level='O2') - print(out_fp32[0], out_amp[0], out_pure_fp16[0]) - self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) - self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + def func_isinstance(): + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet( + enable_amp=False, + use_data_loader=True, + use_param_group=True) + out_amp = self.train_resnet( + enable_amp=True, use_data_loader=True, use_param_group=True) + out_pure_fp16 = self.train_resnet( + enable_amp=True, + use_data_loader=True, + use_param_group=True, + level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + self.assertTrue( + np.allclose( + out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() class TestResnet(unittest.TestCase): @@ -1102,12 +1200,19 @@ class TestResnet(unittest.TestCase): return dy_out, dy_param_value, dy_grad_value def test_resnet(self): - out_fp32 = self.train_resnet(enable_amp=False) - out_amp = self.train_resnet(enable_amp=True) - out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2') - print(out_fp32[0], out_amp[0], out_pure_fp16[0]) - self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) - self.assertTrue(np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1)) + def func_isinstance(): + out_fp32 = self.train_resnet(enable_amp=False) + out_amp = self.train_resnet(enable_amp=True) + out_pure_fp16 = 
self.train_resnet(enable_amp=True, level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) + self.assertTrue( + np.allclose( + out_fp32[0], out_pure_fp16[0], atol=1.e-1)) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() class TestLayerNormFp16(unittest.TestCase): @@ -1116,14 +1221,20 @@ class TestLayerNormFp16(unittest.TestCase): ''' def test_layer_norm_fp16(self): - if fluid.is_compiled_with_cuda(): - with fluid.dygraph.guard(fluid.CUDAPlace(0)): - x = paddle.rand([2, 2, 2, 3]) - layer_norm = paddle.nn.LayerNorm(x.shape[1:]) - with paddle.amp.auto_cast(custom_white_list=['layer_norm']): - out = layer_norm(x) + def func_isinstance(): + if fluid.is_compiled_with_cuda(): + with fluid.dygraph.guard(fluid.CUDAPlace(0)): + x = paddle.rand([2, 2, 2, 3]) + layer_norm = paddle.nn.LayerNorm(x.shape[1:]) + with paddle.amp.auto_cast(custom_white_list=['layer_norm']): + out = layer_norm(x) - self.assertTrue(out.dtype == fluid.core.VarDesc.VarType.FP16) + self.assertTrue( + out.dtype == fluid.core.VarDesc.VarType.FP16) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() class TestBf16(unittest.TestCase): @@ -1142,18 +1253,23 @@ class TestBf16(unittest.TestCase): return output.numpy() def test_bf16(self): - if fluid.core.is_compiled_with_cuda(): - cudnn_version = paddle.device.get_cudnn_version() - if cudnn_version is not None and cudnn_version >= 8100: - out_fp32 = self.train(enable_amp=False) - out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') - out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') - self.assertTrue( - np.allclose( - out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1)) - self.assertTrue( - np.allclose( - out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) + def func_isinstance(): + if fluid.core.is_compiled_with_cuda(): + cudnn_version = paddle.device.get_cudnn_version() + if cudnn_version is not None and cudnn_version >= 8100: + out_fp32 = self.train(enable_amp=False) + out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') + out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1)) + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() class TestAmpWithPyLyer(unittest.TestCase): @@ -1176,44 +1292,54 @@ class TestAmpWithPyLyer(unittest.TestCase): x.stop_gradient = False y.stop_gradient = False - with paddle.amp.auto_cast(): - res = MyMM.apply(x, y) - loss = paddle.mean(res) + # with paddle.amp.auto_cast(): + res = MyMM.apply(x, y) + loss = paddle.mean(res) loss.backward() class TestAmpWithHook(unittest.TestCase): def test_hook_change_dtype(self): - with paddle.fluid.dygraph.guard(): - v = paddle.rand([3, 3]) - v.stop_gradient = False - - def foo(grad): - print('grad', grad, grad.dtype) # grad's dtype is float32 - res = paddle.mm(grad, grad) # mm runs in fp16 - print('res', res, res.dtype) # res's dtype is float16 - return res - - v.register_hook(foo) - with paddle.amp.auto_cast(): - a = paddle.mm(v, v) - loss = a.sum() - self.assertRaises(RuntimeError, loss.backward) + def func_isinstance(): + with paddle.fluid.dygraph.guard(): + v = paddle.rand([3, 3]) + v.stop_gradient = False + + def foo(grad): + print('grad', grad, grad.dtype) # grad's dtype is float32 + res = paddle.mm(grad, grad) # mm runs in fp16 + print('res', res, res.dtype) # res's dtype is float16 + return res + + 
v.register_hook(foo) + with paddle.amp.auto_cast(): + a = paddle.mm(v, v) + loss = a.sum() + self.assertRaises(RuntimeError, loss.backward) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() def test_hook_change_place(self): - with paddle.fluid.dygraph.guard(): - v = paddle.rand([3, 3]) - v.stop_gradient = False - - def foo(grad): - res = grad.cpu() # change place - return res - - v.register_hook(foo) - with paddle.amp.auto_cast(): - a = paddle.mm(v, v) - loss = a.sum() - self.assertRaises(RuntimeError, loss.backward) + def func_isinstance(): + with paddle.fluid.dygraph.guard(): + v = paddle.rand([3, 3]) + v.stop_gradient = False + + def foo(grad): + res = grad.cpu() # change place + return res + + v.register_hook(foo) + with paddle.amp.auto_cast(): + a = paddle.mm(v, v) + loss = a.sum() + self.assertRaises(RuntimeError, loss.backward) + + with _test_eager_guard(): + func_isinstance() + func_isinstance() if __name__ == '__main__': diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 7f8e516496f..f075439e54f 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -997,17 +997,17 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [ 'test_parallel_executor_transformer', 'test_tensor_scalar_type_promotion_dynamic', 'test_eager_deletion_delete_vars', 'test_asp_pruning_1d', - 'test_imperative_auto_mixed_precision', 'test_imperative_using_non_zero_gpu', - 'test_machine_translation', 'test_flatten_op', 'test_onnx_export', - 'test_optimizer_for_varbase', 'test_fusion_transpose_flatten_concat_op', - 'best_fit_allocator_test', 'test_ir_fusion_group_pass', - 'test_trt_quant_conv2d_dequant_fuse_pass', 'test_allclose_op', - 'test_ftrl_op', 'test_elementwise_add_op', 'test_instance_norm_op', - 'test_lambv2_op', 'test_yolo_box_op', 'test_parallel_executor_drop_scope', - 'test_generator_dataloader', 'test_conv2d_transpose_op_depthwise_conv', - 'test_imperative_save_load_v2', 'test_lookahead', - 'test_moving_average_abs_max_scale_op', 'test_roi_perspective_transform_op', - 'test_tensorrt_engine', 'test_affine_grid_function', 'test_nonzero_api', + 'test_imperative_using_non_zero_gpu', 'test_machine_translation', + 'test_flatten_op', 'test_onnx_export', 'test_optimizer_for_varbase', + 'test_fusion_transpose_flatten_concat_op', 'best_fit_allocator_test', + 'test_ir_fusion_group_pass', 'test_trt_quant_conv2d_dequant_fuse_pass', + 'test_allclose_op', 'test_ftrl_op', 'test_elementwise_add_op', + 'test_instance_norm_op', 'test_lambv2_op', 'test_yolo_box_op', + 'test_parallel_executor_drop_scope', 'test_generator_dataloader', + 'test_conv2d_transpose_op_depthwise_conv', 'test_imperative_save_load_v2', + 'test_lookahead', 'test_moving_average_abs_max_scale_op', + 'test_roi_perspective_transform_op', 'test_tensorrt_engine', + 'test_affine_grid_function', 'test_nonzero_api', 'test_ir_memory_optimize_pass', 'test_reduce_mkldnn_op', 'test_bilinear_interp_op', 'test_cvm_op', 'test_scale_op', 'test_matmul_op', 'test_sequence_pool', 'test_complex_simplenet', 'test_complex_reshape', -- GitLab
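
Note: the sketch below is not part of the patch. It is a minimal, hypothetical usage example showing how the eager-mode AMP path added by amp_utils.h is exercised from Python, mirroring the pattern used in the updated unit tests above (each case runs once under _test_eager_guard() for the intermediate/eager dygraph mode and once without it for the legacy mode). It assumes a CUDA build of Paddle at roughly this revision, where _test_eager_guard is importable from paddle.fluid.framework; the helper name conv_under_amp is invented for illustration.

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard  # experimental eager-mode guard


def conv_under_amp():
    # conv2d is on the AMP allow list, so its output is expected to be float16
    # under auto_cast, the same check test_amp_guard_white_op performs above.
    conv = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
    data = paddle.to_tensor(
        np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32'))
    with paddle.amp.auto_cast():
        out_amp = conv(data)
    out_fp32 = conv(data)  # outside auto_cast the op runs in float32
    return out_amp.dtype, out_fp32.dtype


if paddle.is_compiled_with_cuda():
    # New intermediate (eager) dygraph path added by this patch.
    with _test_eager_guard():
        print(conv_under_amp())
    # Legacy dygraph path, unchanged.
    print(conv_under_amp())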