From 7067763eefbd6e0747cb98eadbed3abf9d8c6762 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Wed, 29 Mar 2023 11:05:20 +0800 Subject: [PATCH] Support op check list and op skip in check_nan_inf_tools (#51998) --- paddle/fluid/eager/nan_inf_utils.cc | 70 ++++++++++++++++++- paddle/fluid/pybind/pybind.cc | 7 ++ .../fluid/tests/unittests/test_nan_inf.py | 23 ++++++ 3 files changed, 99 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index cc09ebb6c5d..17cf8825d5c 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -20,10 +20,78 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/compat/convert_utils.h" +DECLARE_int32(check_nan_inf_level); namespace egr { +static std::once_flag dump_list_init_flag; + +static std::unordered_set<std::string>& nan_inf_check_op_list() { + static std::unordered_set<std::string> _check_op_list = {}; + return _check_op_list; +} + +static std::unordered_set<std::string>& nan_inf_skip_op_list() { + static std::unordered_set<std::string> _skip_op_list = {}; + return _skip_op_list; +} + +static void InitDumpListFormEnv() { + nan_inf_check_op_list(); + nan_inf_skip_op_list(); + const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list"); + const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list"); + + if (check_op_list) { + std::stringstream ss(check_op_list); + std::string op_type; + LOG(INFO) << "Please set op's name according to the " "paddle.amp.low_precision_op_list()"; + while (std::getline(ss, op_type, ',')) { + nan_inf_check_op_list().emplace(op_type); + } + } + + if (skip_op_list) { + std::stringstream ss(skip_op_list); + std::string op_type; + LOG(INFO) << "Please set op's name according to the " "paddle.amp.low_precision_op_list()"; + while (std::getline(ss, op_type, ',')) { + nan_inf_skip_op_list().emplace(op_type); + } + } + + for (auto 
const& key : nan_inf_check_op_list()) { + LOG(INFO) << "Check nan inf op list: " << key; + } + + for (auto const& key : nan_inf_skip_op_list()) { + LOG(INFO) << "Skip nan inf op list: " << key; + } +} + +bool CheckOp(const std::string& api_name) { + if (nan_inf_skip_op_list().count("all") || + nan_inf_skip_op_list().count(api_name)) { + VLOG(4) << "Current op is in skipped_op_list : " << api_name; + return false; + } + + if (nan_inf_check_op_list().size() != 0 && + (!nan_inf_check_op_list().count(api_name))) { + VLOG(4) << "Current op isn't in checked_op_list : " << api_name; + return false; + } + + VLOG(6) << "Current check nan inf Op is : " << api_name; + return true; +} + void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { - if (tensor.initialized()) { + std::call_once(dump_list_init_flag, InitDumpListFormEnv); + auto op_name = phi::TransToFluidOpName(api_name); + if (tensor.initialized() && CheckOp(op_name)) { auto& tensor_name = tensor.name(); const phi::DenseTensor* dense_tensor{nullptr}; if (tensor.is_dense_tensor()) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ec7d3c710c6..27b5ec65167 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -198,6 +198,7 @@ limitations under the License. */ #endif #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/prim/utils/eager/eager_tensor_operants.h" #include "paddle/fluid/prim/utils/static/static_tensor_operants.h" @@ -2859,6 +2860,12 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("set_nan_inf_debug_path", &paddle::framework::details::SetNanInfDebugPath); + m.def("check_numerics", + [](const std::string &op_name, const paddle::Tensor &tensor) { + VLOG(4) << "Check tensor whether has nan or inf."; + egr::CheckTensorHasNanOrInf(op_name, tensor); + }); + BindFleetWrapper(&m); BindIO(&m); BindParallelExecutor(m); diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 4f6c48fa47e..139bd8e9d8e 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -68,6 +68,15 @@ class TestNanInfEnv(TestNanInf): self.env["PADDLE_INF_NAN_SKIP_VAR"] = "elementwise_add:fc_0.tmp_1" +class TestCheckSkipEnv(TestNanInf): + def setUp(self): + super().setUp() + # windows python have some bug with env, so need use str to pass ci + # otherwise, "TypeError: environment can only contain strings" + self.env["Paddle_check_nan_inf_op_list"] = "mean" + self.env["Paddle_skip_nan_inf_op_list"] = "elementwise_add" + + class TestNanInfCheckResult(unittest.TestCase): def generate_inputs(self, shape, dtype="float32"): data = np.random.random(size=shape).astype(dtype) @@ -159,6 +168,20 @@ class TestNanInfCheckResult(unittest.TestCase): if paddle.fluid.core.is_compiled_with_cuda(): self.check_nan_inf_level(use_cuda=True, dtype="float16") + def test_check_numerics(self): + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3} + ) + if paddle.fluid.core.is_compiled_with_cuda(): + self.check_nan_inf_level(use_cuda=True, dtype="float16") + + shape = [8, 8] + x_np, y_np = self.generate_inputs(shape, "float16") + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + paddle.fluid.core.check_numerics("check_numerics", x) + paddle.fluid.core.check_numerics("check_numerics", y) + if __name__ == '__main__': unittest.main() -- GitLab