From ec77defccfcf0799e4f5da195add18d321d06bcd Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Tue, 25 Apr 2023 15:09:00 +0800
Subject: [PATCH] [Cherry-pick] Add enable_tensor_checker and
 disable_tensor_checker to api list (#52936) (#53287)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

新增enable_tensor_checker, disable_tensor_checker API (#52936)
---
 paddle/fluid/eager/nan_inf_utils.cc           |  26 +-
 paddle/fluid/eager/nan_inf_utils.h            |   4 +
 .../framework/details/nan_inf_utils_detail.cc |   8 +
 .../framework/details/nan_inf_utils_detail.cu |   2 +-
 .../framework/details/nan_inf_utils_detail.h  |   4 +
 paddle/fluid/pybind/pybind.cc                 |  12 +
 python/paddle/amp/debugging.py                | 356 ++++++++++--------
 .../fluid/tests/unittests/test_nan_inf_dir.py | 145 +------
 .../tests/unittests/test_tensor_checker.py    | 111 ++++++
 9 files changed, 359 insertions(+), 309 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_checker.py

diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc
index 6eae40fca36..e71ae7cf119 100644
--- a/paddle/fluid/eager/nan_inf_utils.cc
+++ b/paddle/fluid/eager/nan_inf_utils.cc
@@ -24,8 +24,6 @@
 DECLARE_int32(check_nan_inf_level);
 namespace egr {
 
-static std::once_flag dump_list_init_flag;
-
 static std::unordered_set<std::string>& nan_inf_check_op_list() {
   static std::unordered_set<std::string> _check_op_list = {};
   return _check_op_list;
@@ -36,39 +34,32 @@ static std::unordered_set<std::string>& nan_inf_skip_op_list() {
   return _skip_op_list;
 }
 
-static void InitDumpListFormEnv() {
+void SetCheckOpList(const std::string& check_op_list = "") {
   nan_inf_check_op_list();
-  nan_inf_skip_op_list();
-  const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list");
-  const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list");
-
-  if (check_op_list) {
+  if (check_op_list.size() != 0) {
     std::stringstream ss(check_op_list);
     std::string op_type;
     LOG(INFO) << "Please set op's name according to the "
                  "paddle.amp.low_precision_op_list()";
     while (std::getline(ss, op_type, ',')) {
       nan_inf_check_op_list().emplace(op_type);
+      VLOG(4) << "Check nan inf op list: " << op_type;
     }
   }
+}
 
-  if (skip_op_list) {
+void SetSkipOpList(const std::string& skip_op_list = "") {
+  nan_inf_skip_op_list();
+  if (skip_op_list.size() != 0) {
     std::stringstream ss(skip_op_list);
     std::string op_type;
     LOG(INFO) << "Please set op's name according to the "
                  "paddle.amp.low_precision_op_list()";
     while (std::getline(ss, op_type, ',')) {
       nan_inf_skip_op_list().emplace(op_type);
+      VLOG(4) << "Skip nan inf op list: " << op_type;
     }
   }
-
-  for (auto const& key : nan_inf_check_op_list()) {
-    LOG(INFO) << "Check nan inf op list: " << key;
-  }
-
-  for (auto const& key : nan_inf_skip_op_list()) {
-    LOG(INFO) << "Skip nan inf op list: " << key;
-  }
 }
 
 bool CheckOp(const std::string& api_name) {
@@ -89,7 +80,6 @@ bool CheckOp(const std::string& api_name) {
 }
 
 void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
-  std::call_once(dump_list_init_flag, InitDumpListFormEnv);
   auto op_name = phi::TransToFluidOpName(api_name);
   if (tensor.initialized() && CheckOp(op_name)) {
     auto& tensor_name = tensor.name();
diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h
index 8d7ed7ffb76..4f412cf6db8 100644
--- a/paddle/fluid/eager/nan_inf_utils.h
+++ b/paddle/fluid/eager/nan_inf_utils.h
@@ -65,6 +65,10 @@ void CheckTensorHasNanOrInf(
 void CheckTensorHasNanOrInf(const std::string& api_name,
                             const TupleOfTensorAndVector& tensors);
 
+void SetCheckOpList(const std::string& check_op_list);
+
+void SetSkipOpList(const std::string& skip_op_list);
+
 void CheckTensorHasNanOrInf(
     const std::string& api_name,
     const paddle::small_vector<std::vector<paddle::Tensor>,
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index e3e08e8b7df..7890e37e672 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -30,6 +30,7 @@ namespace details {
 struct DebugTools {
   DebugTools() {}
   std::string path = "";
+  int stack_limit = 1;
 };
 static DebugTools debug_nan_inf;
 
@@ -45,6 +46,13 @@ std::string GetNanPath() {
   return debug_nan_inf.path + "/";
 }
 
+void SetNanInfStackLimit(const int& stack_limit) {
+  debug_nan_inf.stack_limit = stack_limit;
+  VLOG(4) << "Set the stack limit of debug tools : " << stack_limit;
+}
+
+int GetNanInfStackLimit() { return debug_nan_inf.stack_limit; }
+
 static std::once_flag white_list_init_flag;
 
 static int op_role_nan_inf_white_list = 0;
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
index dd99adfecfc..3e001299e8e 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -516,7 +516,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
                                        check_nan_inf_level,
                                        nan_inf_zero_tensor.data<int64_t>());
 
-  if (check_nan_inf_level == 0) {
+  if (check_nan_inf_level == 0 && GetNanInfStackLimit() > 0) {
     auto nan_cpu =
         phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 3);
     int64_t* nan_cpu_ptr = reinterpret_cast<int64_t*>(nan_cpu->ptr());
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h
index 8f5eb5352ac..59865162cc3 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.h
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h
@@ -40,6 +40,10 @@ void SetNanInfDebugPath(const std::string& nan_inf_path);
 
 std::string GetNanPath();
 
+void SetNanInfStackLimit(const int& stack_limit);
+
+int GetNanInfStackLimit();
+
 template <typename T,
           typename MT,
           std::enable_if_t<std::is_same<T, float>::value, bool> = true>
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index bde6357ccbe..359c2266f8e 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2670,9 +2670,21 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("use_layout_autotune",
         [] { return egr::Controller::Instance().UseLayoutAutoTune(); });
   // Add the api for nan op debug
+  m.def("set_nan_inf_stack_limit",
+        &paddle::framework::details::SetNanInfStackLimit);
+
+  // Add the api for nan op debug
   m.def("set_nan_inf_debug_path",
         &paddle::framework::details::SetNanInfDebugPath);
 
+  // Add checked op list
+  m.def("set_checked_op_list",
+        [](const std::string &op_list) { egr::SetCheckOpList(op_list); });
+
+  // Add skipped op list
+  m.def("set_skipped_op_list",
+        [](const std::string &op_list) { egr::SetSkipOpList(op_list); });
+
   m.def("check_numerics",
         [](const std::string &op_name, const paddle::Tensor &tensor) {
           VLOG(4) << "Check tensor whether has nan or inf.";
diff --git a/python/paddle/amp/debugging.py b/python/paddle/amp/debugging.py
index d1036221984..69ee21fe4c3 100644
--- a/python/paddle/amp/debugging.py
+++ b/python/paddle/amp/debugging.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import contextlib
-import os
 import random
 from enum import Enum
 
@@ -24,82 +23,119 @@ from paddle.fluid import core
 from paddle.fluid.framework import dygraph_only
 
 __all__ = [
+    "DebugMode",
+    "TensorCheckerConfig",
     "enable_operator_stats_collection",
     "disable_operator_stats_collection",
     "collect_operator_stats",
+    "enable_tensor_checker",
+    "disable_tensor_checker",
 ]
 
 
 class DebugMode(Enum):
+    """
+    The DebugMode is a feature that helps to present the state of the TensorCheckerConfig. Each DebugMode has a specific meaning, which is explained below:
+
+    - DebugMode.CHECK_NAN_INF_AND_ABORT: This mode prints or saves information about Tensors that contain NaN/Inf and interrupts the program.
+
+    - DebugMode.CHECK_NAN_INF: This mode prints or saves critical information about Tensors that contain NaN/Inf but allows the program to continue running.
+
+    - DebugMode.CHECK_ALL_FOR_OVERFLOW: This mode checks the output of the FP32 operator and prints or saves information about key Tensors that exceed the FP16 representation range, such as overflow or underflow.
+
+    - DebugMode.CHECK_ALL: This mode prints or saves output Tensor key information for all operators.
+
+    """
+
     CHECK_NAN_INF_AND_ABORT = 0
     CHECK_NAN_INF = 1
     CHECK_ALL_FOR_OVERFLOW = 2
     CHECK_ALL = 3
-    CHECK_ALL_AND_ABORT = 4
-    DUMP_ALL = 5
+    # CHECK_ALL_AND_ABORT = 4
+    # DUMP_ALL = 5
+
+
+def set_checked_op_list(checked_op_list):
+    # check checked_op_list
+    if checked_op_list is not None:
+        if isinstance(checked_op_list, (list, tuple)):
+            check_op_list = ",".join(value for value in checked_op_list)
+            paddle.fluid.core.set_checked_op_list(check_op_list)
+        else:
+            raise ValueError("checked_op_list must be list or tuple")
+
+
+def set_skipped_op_list(skipped_op_list):
+    # check skipped_op_list
+    if skipped_op_list is not None:
+        if isinstance(skipped_op_list, (list, tuple)):
+            skip_op_list = ",".join(value for value in skipped_op_list)
+            paddle.fluid.core.set_skipped_op_list(skip_op_list)
+        else:
+            raise ValueError("skipped_op_list must be list or tuple")
 
 
 class TensorCheckerConfig:
     """
-    Collect the config for checking nan and inf in module or op tensor.
+    The purpose of this class is to collect the configuration for checking NaN and Inf values in the tensors of a module or operator. It takes the following arguments:
 
     Args:
-    * enable: Whether to enable Tensor's value detection function. The default value is False, which means that these tools will never be used.
+        enable(bool): Indicating whether to enable the detection of NaN and Inf values in tensors. The default value is False, which means that these tools will not be used.
 
-    * debug_mode: Debug mode,There are 6 kinds of debug mode.
-        CHECK_NAN_INF_AND_ABORT(default): Print or save Tensor key information with NaN/Inf and interrupt the program
-        CHECK_NAN_INF: Print or save Tensor critical information with NaN/Inf, but continue to run
-        CHECK_ALL_AND_ABORT: Print or save the output Tensor key information of all operators, and interrupt the program if NaN/Inf occurs
-        CHECK_ALL_FOR_OVERFLOW: Check the output of the FP32 operator, print or save key Tensor information that exceeds the FP16 representation range (overflow, underflow)
-        CHECK_ALL: Print or save output Tensor key information for all operators
-        DUMP_ALL: Saves all Tensor data. This mode does not print on the terminal
+        debug_mode(DebugMode, optional): A parameter that determines the type of debugging to be used. Default is DebugMode.CHECK_NAN_INF_AND_ABORT.
 
-    * dump_dir: The collection data storage path. If it is None, it will be directly printed to the terminal
+        output_dir(string, optional): The path to store collected data. If this parameter is set to None, the data will be printed to the terminal. Default is None.
 
-    * checked_op_list: A list of operators you want to check
+        checked_op_list(list|tuple, optional): Specifies a list of operators that need to be checked during program execution, for example, checked_op_list=['elementwise_add', 'conv2d'], indicating that the output results of elementwise_add and conv2d should be checked for nan/inf during program execution. Default is None.
 
-    * skipped_op_list: A list of operators to skip checking
+        skipped_op_list(list|tuple, optional): Specifies a list of operators that do not need to be checked during program execution, for example, skipped_op_list=['elementwise_add', 'conv2d'], indicating that the output results of elementwise_add and conv2d should not be checked for nan/inf during program execution. Default is None.
 
-    * debug_step: The iteration scope of debugging
+        debug_step(list|tuple, optional): A list or tuple used primarily for nan/inf checking during model training. For example, debug_step=[1,5] indicates that nan/inf checking should only be performed on model training iterations 1 to 5. Default is None.
 
-    * stack_height_limit: The maximum depth of the call stack, and supports printing the call stack at the error location. The specific scheme needs to be investigated
-
-    * enable_traceback_filtering: Whether to filter the traceback. The main purpose is to filter out the internal code call stack of the framework and only display the user code call stack
+        stack_height_limit(int, optional): An integer value specifying the maximum depth of the call stack. This feature supports printing the call stack at the error location. Currently, only enabling or disabling call stack printing is supported. If you want to print the corresponding C++ call stack when NaN is detected in GPU Kernel, set stack_height_limit to 1, otherwise set it to 0. Default is 1.
 
     Examples:
-       .. code-block:: python
-          import paddle
 
-          checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
-          paddle.amp.debugging.enable_tensor_checker(checker_config)
+        ..  code-block:: python
+
+            import paddle
+
+            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
+            paddle.amp.debugging.enable_tensor_checker(checker_config)
 
-          x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
-          y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
-          res = paddle.pow(x, y)
+            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
+            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
+            res = paddle.pow(x, y)
+            paddle.autograd.backward(res, retain_graph=True)
+            paddle.amp.debugging.disable_tensor_checker()
 
-          paddle.autograd.backward(res, retain_graph=True)
-          paddle.amp.debugging.disable_tensor_checker()
+            #[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
+
+            # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
+            #Traceback (most recent call last):
+            #    res = paddle.pow(x, y)
+            #  File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
+            #    return _C_ops.elementwise_pow(x, y)
 
     """
 
     # For module debugging
-    Current_step_id = 0
+    current_step_id = 0
 
     def __init__(
         self,
         enable,
         debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT,
-        dump_dir=None,
+        output_dir=None,
         checked_op_list=None,
         skipped_op_list=None,
         debug_step=None,
-        stack_height_limit=3,
-        enable_traceback_filtering=False,
+        stack_height_limit=1,
     ):
 
         self.enable = enable
         self.debug_mode = debug_mode
-        self.dump_dir = dump_dir
+        self.output_dir = output_dir
 
         self.checked_op_list = checked_op_list
         self.skipped_op_list = skipped_op_list
@@ -107,8 +143,6 @@ class TensorCheckerConfig:
         self.debug_step = debug_step
         self.stack_height_limit = stack_height_limit
 
-        self.enable_traceback_filtering = enable_traceback_filtering
-
         self.start_step = None
         self.end_step = None
 
@@ -146,60 +180,43 @@ class TensorCheckerConfig:
                 DebugMode.__members__,
             )
 
-        # check checked_op_list
-        if self.checked_op_list is not None:
-            if isinstance(self.checked_op_list, (list, tuple)):
-                check_op_list = ",".join(
-                    value for value in self.checked_op_list
-                )
-                os.environ["Paddle_check_nan_inf_op_list"] = str(check_op_list)
-            else:
-                raise ValueError("checked_op_list must be list or tuple")
+        set_checked_op_list(self.checked_op_list)
 
-        # check skipped_op_list
-        if self.skipped_op_list is not None:
-            if isinstance(self.skipped_op_list, (list, tuple)):
-                skipped_op_list = ",".join(
-                    value for value in self.skipped_op_list
-                )
-                os.environ["Paddle_skip_nan_inf_op_list"] = str(skipped_op_list)
-            else:
-                raise ValueError("skipped_op_list must be list or tuple")
+        set_skipped_op_list(self.skipped_op_list)
 
         if self.enable:
             self._set_seed(self.enable)
 
-    def keep_random(self, seed, flag):
+    def _set_seed(self, flag):
+        if self.initial_seed != self.seed:
+            self.seed = self.initial_seed
+
+        if self.seed > np.iinfo(np.uint32).max or self.seed < 0:
+            print("[Warnning: Seed must be between 0 and 2**32 - 1")
+            self.seed = 123
+
         # get random seed
-        self.seed = seed
         paddle.seed(self.seed)
         np.random.seed(self.seed)
         random.seed(self.seed)
 
+        # info
+        print("AMP Debugging TensorCheckerConfig: seed ", self.seed)
+
         # set cudnn and cpu
         if core.is_compiled_with_cuda():
             paddle.set_flags({"FLAGS_cudnn_deterministic": flag})
-        paddle.set_flags({"FLAGS_cpu_deterministic": flag})
+            print(
+                "AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ",
+                flag,
+            )
 
-        # info
-        print("AMP Debugging TensorCheckerConfig: seed ", self.seed)
-        print(
-            "AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ",
-            flag,
-        )
+        paddle.set_flags({"FLAGS_cpu_deterministic": flag})
         print(
             "AMP Debugging TensorCheckerConfig: FLAGS_cpu_deterministic is ",
             flag,
         )
 
-    def _set_seed(self, enable):
-        if self.initial_seed != self.seed:
-            self.seed = self.initial_seed
-        if self.seed > 4294967295 or self.seed < 0:
-            print("[Warnning: Seed must be between 0 and 2**32 - 1")
-            self.seed = 123
-            self.keep_random(self.seed, True)
-
     def _set_env(self, check_flag):
         paddle.set_flags({"FLAGS_check_nan_inf": check_flag})
         if check_flag:
@@ -209,35 +226,35 @@ class TensorCheckerConfig:
             )
 
             # set output_dir
-            if self.dump_dir is not None:
-                paddle.fluid.core.set_nan_inf_debug_path(self.dump_dir)
+            if self.output_dir is not None:
+                paddle.fluid.core.set_nan_inf_debug_path(self.output_dir)
 
             # set stack_height_limit
             if isinstance(self.stack_height_limit, (int)):
-                paddle.set_flags(
-                    {"FLAGS_call_stack_level": self.stack_height_limit}
+                paddle.fluid.core.set_nan_inf_stack_limit(
+                    self.stack_height_limit
                 )
             else:
                 raise ValueError("stack_height_limit must be int")
 
-    def check(self):
+    def update_and_check_step_id(self):
         if self.enable:
             if self.start_step is not None and self.end_step is not None:
                 if (
-                    self.start_step > TensorCheckerConfig.Current_step_id
-                    or TensorCheckerConfig.Current_step_id >= self.end_step
+                    self.start_step > TensorCheckerConfig.current_step_id
+                    or TensorCheckerConfig.current_step_id >= self.end_step
                 ):
                     return False
                 else:
-                    TensorCheckerConfig.Current_step_id += 1
+                    TensorCheckerConfig.current_step_id += 1
             return True
         return False
 
-    def run(self):
+    def start_check_nan_inf(self):
         if self.enable:
             self._set_env(self.enable)
 
-    def end(self):
+    def stop_check_nan_inf(self):
         self._set_env(False)
 
 
@@ -302,26 +319,26 @@ def enable_operator_stats_collection():
 
     Examples:
 
-     .. code-block:: python
+        ..  code-block:: python
 
-        import paddle
+            import paddle
 
-        conv = paddle.nn.Conv2D(3, 2, 3)
-        x = paddle.rand([10, 3, 32, 32])
+            conv = paddle.nn.Conv2D(3, 2, 3)
+            x = paddle.rand([10, 3, 32, 32])
 
-        paddle.amp.debugging.enable_operator_stats_collection()
-        # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
-        with paddle.amp.auto_cast(enable=True, level='O2'):
-            out = conv(x)
-        # Print to the standard output.
-        paddle.amp.debugging.disable_operator_stats_collection()
-        # <------------------------------------------------------- op list -------------------------------------------------------->
-        # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
-        #   conv2d                                  |  1                |  0                |  0                |  0
-        #   elementwise_add                         |  1                |  0                |  0                |  0
-        #   reshape2                                |  1                |  0                |  0                |  0
-        #   transfer_dtype                          |  0                |  0                |  3                |  0
-        # <----------------------------------------------------- op count: 4 ------------------------------------------------------>
+            paddle.amp.debugging.enable_operator_stats_collection()
+            # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
+            with paddle.amp.auto_cast(enable=True, level='O2'):
+                out = conv(x)
+            # Print to the standard output.
+            paddle.amp.debugging.disable_operator_stats_collection()
+            # <------------------------------------------------------- op list -------------------------------------------------------->
+            # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
+            #   conv2d                                  |  1                |  0                |  0                |  0
+            #   elementwise_add                         |  1                |  0                |  0                |  0
+            #   reshape2                                |  1                |  0                |  0                |  0
+            #   transfer_dtype                          |  0                |  0                |  3                |  0
+            # <----------------------------------------------------- op count: 4 ------------------------------------------------------>
 
     """
     # Clear the previous stats.
@@ -340,26 +357,26 @@ def disable_operator_stats_collection():
 
     Examples:
 
-     .. code-block:: python
+        ..  code-block:: python
 
-        import paddle
+            import paddle
 
-        conv = paddle.nn.Conv2D(3, 2, 3)
-        x = paddle.rand([10, 3, 32, 32])
+            conv = paddle.nn.Conv2D(3, 2, 3)
+            x = paddle.rand([10, 3, 32, 32])
 
-        paddle.amp.debugging.enable_operator_stats_collection()
-        # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
-        with paddle.amp.auto_cast(enable=True, level='O2'):
-            out = conv(x)
-        # Print to the standard output.
-        paddle.amp.debugging.disable_operator_stats_collection()
-        # <------------------------------------------------------- op list -------------------------------------------------------->
-        # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
-        #   conv2d                                  |  1                |  0                |  0                |  0
-        #   elementwise_add                         |  1                |  0                |  0                |  0
-        #   reshape2                                |  1                |  0                |  0                |  0
-        #   transfer_dtype                          |  0                |  0                |  3                |  0
-        # <----------------------------------------------------- op count: 4 ------------------------------------------------------>
+            paddle.amp.debugging.enable_operator_stats_collection()
+            # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
+            with paddle.amp.auto_cast(enable=True, level='O2'):
+                out = conv(x)
+            # Print to the standard output.
+            paddle.amp.debugging.disable_operator_stats_collection()
+            # <------------------------------------------------------- op list -------------------------------------------------------->
+            # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
+            #   conv2d                                  |  1                |  0                |  0                |  0
+            #   elementwise_add                         |  1                |  0                |  0                |  0
+            #   reshape2                                |  1                |  0                |  0                |  0
+            #   transfer_dtype                          |  0                |  0                |  3                |  0
+            # <----------------------------------------------------- op count: 4 ------------------------------------------------------>
 
     """
     if not _get_operator_stats_flag():
@@ -381,25 +398,25 @@ def collect_operator_stats():
 
     Examples:
 
-     .. code-block:: python
+        ..  code-block:: python
 
-        import paddle
+            import paddle
 
-        conv = paddle.nn.Conv2D(3, 2, 3)
-        x = paddle.rand([10, 3, 32, 32])
+            conv = paddle.nn.Conv2D(3, 2, 3)
+            x = paddle.rand([10, 3, 32, 32])
 
-        with paddle.amp.debugging.collect_operator_stats():
-            # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
-            with paddle.amp.auto_cast(enable=True, level='O2'):
-                out = conv(x)
-        # Print to the standard output.
-        # <------------------------------------------------------- op list -------------------------------------------------------->
-        # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
-        #   conv2d                                  |  1                |  0                |  0                |  0
-        #   elementwise_add                         |  1                |  0                |  0                |  0
-        #   reshape2                                |  1                |  0                |  0                |  0
-        #   transfer_dtype                          |  0                |  0                |  3                |  0
-        # <----------------------------------------------------- op count: 4 ------------------------------------------------------>
+            with paddle.amp.debugging.collect_operator_stats():
+                # AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
+                with paddle.amp.auto_cast(enable=True, level='O2'):
+                    out = conv(x)
+            # Print to the standard output.
+            # <------------------------------------------------------- op list -------------------------------------------------------->
+            # <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
+            #   conv2d                                  |  1                |  0                |  0                |  0
+            #   elementwise_add                         |  1                |  0                |  0                |  0
+            #   reshape2                                |  1                |  0                |  0                |  0
+            #   transfer_dtype                          |  0                |  0                |  3                |  0
+            # <----------------------------------------------------- op count: 4 ------------------------------------------------------>
 
     """
     enable_operator_stats_collection()
@@ -409,57 +426,74 @@ def collect_operator_stats():
 
 def enable_tensor_checker(checker_config):
     """
-    enable_tensor_checker(checker_config) is enables model level accuracy checking, which is used together with disables_tensor_checker() to achieve model level precision checking through the combination of these two APIs, checking the output Tensors of all operators within the specified range.
+    The enable_tensor_checker(checker_config) function enables model-level accuracy checking and is used in combination with disable_tensor_checker() to achieve model-level precision checking by checking the output Tensors of all operators within the specified range.
 
-    Attention:
-
-    * If disable is called before loss. backward()_tensor_checker(), the gradient operator is not checked;
+    Args:
+        checker_config(TensorCheckerConfig): The configuration used for checking NaN and Inf values in the tensors of a module or operator.
 
-    * If disable is called before optimizer.step() tensor_checker(), the optimizer and other weight update related operators will not be checked
+    Note:
+        If disable_tensor_checker() is called before backward(), the gradient operator will not be checked.
+        If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked.
 
     Examples:
-       .. code-block:: python
-           import paddle
 
-           checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
-           paddle.amp.debugging.enable_tensor_checker(checker_config)
+        ..  code-block:: python
+
+            import paddle
+
+            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
+            paddle.amp.debugging.enable_tensor_checker(checker_config)
 
-           x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
-           y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
-           res = paddle.pow(x, y)
-           paddle.autograd.backward(res, retain_graph=True)
+            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
+            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
+            res = paddle.pow(x, y)
+            paddle.autograd.backward(res, retain_graph=True)
+            paddle.amp.debugging.disable_tensor_checker()
+            #[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
+
+            # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
+            # Traceback (most recent call last):
+            #   File "tp.py", line 8, in <module>
+            #     res = paddle.pow(x, y)
+            #   File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
+            #     return _C_ops.elementwise_pow(x, y)
 
-           paddle.amp.debugging.disable_tensor_checker()
     """
-    if checker_config.check():
-        checker_config.run()
+    if checker_config.update_and_check_step_id():
+        checker_config.start_check_nan_inf()
     else:
-        checker_config.end()
+        checker_config.stop_check_nan_inf()
 
 
 def disable_tensor_checker():
     """
-    disable_tensor_checker() to disables the accuracy checking, which is used together with enables_tensor_checker(config) to achieve model level precision checking through the combination of these two APIs, checking the output Tensors of all operators within the specified range.
+    disable_tensor_checker() is used to disable accuracy checking, and is used together with enable_tensor_checker(config) to achieve model-level precision checking by checking the output Tensors of all operators within the specified range.
 
-    Attention:
+    Note:
+        If disable_tensor_checker() is called before backward(), the gradient operator will not be checked.
+        If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked.
 
-    * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked;
+    Examples:
 
-    * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked
+        ..  code-block:: python
 
-    Examples:
-       .. code-block:: python
-           import paddle
+            import paddle
 
-           checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
-           paddle.amp.debugging.enable_tensor_checker(checker_config)
+            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
+            paddle.amp.debugging.enable_tensor_checker(checker_config)
 
-           x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
-           y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
-           res = paddle.pow(x, y)
-           paddle.autograd.backward(res, retain_graph=True)
+            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
+            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
+            res = paddle.pow(x, y)
+            paddle.autograd.backward(res, retain_graph=True)
+            paddle.amp.debugging.disable_tensor_checker()
+            #[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
 
-           paddle.amp.debugging.disable_tensor_checker()
+            # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
+            # Traceback (most recent call last):
+            #     res = paddle.pow(x, y)
+            #   File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
+            #     return _C_ops.elementwise_pow(x, y)
 
     """
     paddle.set_flags({"FLAGS_check_nan_inf": 0})
diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py
index 425dc9a7e99..06695c56f24 100644
--- a/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py
+++ b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py
@@ -78,7 +78,14 @@ class TestNanInfDirCheckResult(unittest.TestCase):
 
     def test_num_nan_inf(self):
         path = "nan_inf_log_dir"
-        paddle.fluid.core.set_nan_inf_debug_path(path)
+
+        checker_config = paddle.amp.debugging.TensorCheckerConfig(
+            enable=True,
+            debug_mode=paddle.amp.debugging.DebugMode.CHECK_ALL,
+            output_dir=path,
+        )
+
+        paddle.amp.debugging.enable_tensor_checker(checker_config)
 
         def _check_num_nan_inf(use_cuda):
             shape = [32, 32]
@@ -86,145 +93,25 @@ class TestNanInfDirCheckResult(unittest.TestCase):
             num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np)
             add_assert = (num_nan_np + num_inf_np) > 0
             num_nan, num_inf = self.get_num_nan_inf(
-                x_np, use_cuda, add_assert, path
+                x_np,
+                use_cuda,
+                add_assert,
+                path,
             )
             if not use_cuda:
                 assert num_nan == num_nan_np and num_inf == num_inf_np
 
-        paddle.set_flags(
-            {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3}
-        )
-        _check_num_nan_inf(use_cuda=False)
         if paddle.fluid.core.is_compiled_with_cuda():
             _check_num_nan_inf(use_cuda=True)
+        else:
+            _check_num_nan_inf(use_cuda=False)
+
         x = paddle.to_tensor([2, 3, 4], 'float32')
         y = paddle.to_tensor([1, 5, 2], 'float32')
         z = paddle.add(x, y)
         path = ""
         paddle.fluid.core.set_nan_inf_debug_path(path)
-
-    def test_nan_inf_op(self):
-        import paddle
-
-        num_nan = 0
-        num_inf = 0
-        # check op list
-        x = paddle.to_tensor(
-            [1, 0, 1],
-            place=paddle.CPUPlace(),
-            dtype='float32',
-            stop_gradient=False,
-        )
-        y = paddle.to_tensor(
-            [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
-        )
-        try:
-            res = paddle.pow(x, y)
-        except Exception as e:
-            # Cannot catch the log in CUDA kernel.
-            err_str_list = (
-                str(e)
-                .replace("(", " ")
-                .replace(")", " ")
-                .replace(",", " ")
-                .split(" ")
-            )
-            for err_str in err_str_list:
-                if "num_nan" in err_str:
-                    num_nan = int(err_str.split("=")[1])
-                elif "num_inf" in err_str:
-                    num_inf = int(err_str.split("=")[1])
-            print(
-                "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
-                    num_nan, num_inf
-                )
-            )
-        return num_inf
-
-    def test_check_op_list(self):
-        import paddle
-
-        num_nan = 0
-        num_inf = 0
-
-        checker_config = paddle.amp.debugging.TensorCheckerConfig(
-            enable=True,
-            debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
-            skipped_op_list=["elementwise_div"],
-        )
-
-        x = paddle.to_tensor(
-            [0, 0, 0],
-            place=paddle.CPUPlace(),
-            dtype='float32',
-            stop_gradient=False,
-        )
-        y = paddle.to_tensor(
-            [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
-        )
-        paddle.amp.debugging.enable_tensor_checker(checker_config)
-        try:
-            res = paddle.divide(y, x)
-        except Exception as e:
-            # Cannot catch the log in CUDA kernel.
-            err_str_list = (
-                str(e)
-                .replace("(", " ")
-                .replace(")", " ")
-                .replace(",", " ")
-                .split(" ")
-            )
-            for err_str in err_str_list:
-                if "num_nan" in err_str:
-                    num_nan = int(err_str.split("=")[1])
-                elif "num_inf" in err_str:
-                    num_inf = int(err_str.split("=")[1])
-            print(
-                "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
-                    num_nan, num_inf
-                )
-            )
-        paddle.amp.debugging.enable_tensor_checker(checker_config)
-
-    def test_tensor_checker(self):
-        import paddle
-
-        def _assert_flag(value):
-            flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
-            res = paddle.get_flags(flags)
-            assert res["FLAGS_check_nan_inf"] == value
-
-        paddle.set_flags({"FLAGS_check_nan_inf": 0})
-        paddle.seed(102)
-        checker_config = paddle.amp.debugging.TensorCheckerConfig(
-            enable=True,
-            debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
-            checked_op_list=["elementwise_pow"],
-            skipped_op_list=["elementwise_add"],
-            debug_step=[0, 3],
-        )
-        # check seed
-        assert checker_config.initial_seed == 102
-        assert checker_config.seed == 102
-        _assert_flag(False)
-        for index in range(5):
-            paddle.amp.debugging.enable_tensor_checker(checker_config)
-            if index <= 2:
-                _assert_flag(True)
-                assert (
-                    index + 1
-                    == paddle.amp.debugging.TensorCheckerConfig.Current_step_id
-                )
-                assert 1 == self.test_nan_inf_op()
-            else:
-                assert (
-                    3
-                    == paddle.amp.debugging.TensorCheckerConfig.Current_step_id
-                )
-                _assert_flag(False)
-                assert 0 == self.test_nan_inf_op()
-            paddle.amp.debugging.disable_tensor_checker()
-            _assert_flag(False)
+        paddle.amp.debugging.disable_tensor_checker()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_checker.py b/python/paddle/fluid/tests/unittests/test_tensor_checker.py
new file mode 100644
index 00000000000..a5b5e82034f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tensor_checker.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+
+
+class TestTensorChecker(unittest.TestCase):
+    def get_num_inf(self, e):
+        num_nan = 0
+        num_inf = 0
+        # Cannot catch the log in CUDA kernel.
+        err_str_list = (
+            str(e)
+            .replace("(", " ")
+            .replace(")", " ")
+            .replace(",", " ")
+            .split(" ")
+        )
+        for err_str in err_str_list:
+            if "num_nan" in err_str:
+                num_nan = int(err_str.split("=")[1])
+            elif "num_inf" in err_str:
+                num_inf = int(err_str.split("=")[1])
+        print(
+            "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
+                num_nan, num_inf
+            )
+        )
+        return num_nan
+
+    def generate_num_inf(self, place):
+        num_inf = 0
+        num_nan = 0
+        paddle.set_device(place)
+        # check op list
+        x = paddle.to_tensor(
+            [1, 0, 0],
+            dtype='float32',
+            stop_gradient=False,
+        )
+        y = paddle.to_tensor([0, 0, 1], dtype='float32')
+        try:
+            res = paddle.pow(x, y)
+            # test backward
+            paddle.autograd.backward([res])
+            res = paddle.divide(y, x)
+        except Exception as e:
+            num_inf = self.get_num_inf(e)
+        return num_inf
+
+    def test_tensor_checker(self):
+        def _assert_flag(value):
+            flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
+            res = paddle.get_flags(flags)
+            assert res["FLAGS_check_nan_inf"] == value
+
+        paddle.set_flags({"FLAGS_check_nan_inf": 0})
+        paddle.seed(102)
+        checker_config = paddle.amp.debugging.TensorCheckerConfig(
+            enable=True,
+            debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
+            checked_op_list=["elementwise_pow_grad"],
+            skipped_op_list=["elementwise_div"],
+            debug_step=[0, 3],
+        )
+        places = ['cpu']
+        if paddle.is_compiled_with_cuda():
+            places.append('gpu')
+        # check seed
+        self.assertEqual(checker_config.initial_seed, 102)
+        self.assertEqual(checker_config.seed, 102)
+        _assert_flag(False)
+
+        for place in places:
+            paddle.amp.debugging.TensorCheckerConfig.current_step_id = 0
+            for index in range(5):
+                paddle.amp.debugging.enable_tensor_checker(checker_config)
+                if index <= 2:
+                    _assert_flag(True)
+                    self.assertEqual(
+                        index + 1,
+                        paddle.amp.debugging.TensorCheckerConfig.current_step_id,
+                    )
+                    self.assertEqual(1, self.generate_num_inf(place))
+                else:
+                    self.assertEqual(
+                        3,
+                        paddle.amp.debugging.TensorCheckerConfig.current_step_id,
+                    )
+                    _assert_flag(False)
+                    self.assertEqual(0, self.generate_num_inf(place))
+
+                paddle.amp.debugging.disable_tensor_checker()
+                _assert_flag(False)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab