Support op check list and op skip in check_nan_inf_tools (#51998)

7067763e · niuliling123 · GitHub · 99c1f54b · 7067763e · 7067763e
3 changed files
--- a/paddle/fluid/eager/nan_inf_utils.cc
+++ b/paddle/fluid/eager/nan_inf_utils.cc
@@ -20,10 +20,78 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/selected_rows.h"
+#include "paddle/phi/core/compat/convert_utils.h"
+DECLARE_int32(check_nan_inf_level);
 namespace egr {
+static std::once_flag dump_list_init_flag;  // handed to std::call_once so InitDumpListFormEnv runs a single time
+static std::unordered_set<std::string>& nan_inf_check_op_list() {  // accessor for the op names CheckOp restricts checking to
+  static std::unordered_set<std::string> _check_op_list = {};  // function-local static; starts empty
+  return _check_op_list;  // returned by mutable reference so InitDumpListFormEnv can fill it
+}
+static std::unordered_set<std::string>& nan_inf_skip_op_list() {  // accessor for the op names CheckOp excludes from checking
+  static std::unordered_set<std::string> _skip_op_list = {};  // function-local static; starts empty
+  return _skip_op_list;  // returned by mutable reference so InitDumpListFormEnv can fill it
+}
+static void InitDumpListFormEnv() {  // fills the check/skip op-name sets from two environment variables, then logs them
+  nan_inf_check_op_list();  // return value unused; the call constructs the function-local static
+  nan_inf_skip_op_list();  // same, for the skip set
+  const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list");  // nullptr when the variable is unset
+  const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list");  // nullptr when the variable is unset
+  if (check_op_list) {
+    std::stringstream ss(check_op_list);
+    std::string op_type;
+    LOG(INFO) << "Please set op's name according to the "
+                 "paddle.amp.low_precision_op_list()";
+    while (std::getline(ss, op_type, ',')) {  // comma-separated; each token is stored verbatim (no trimming, empty tokens kept)
+      nan_inf_check_op_list().emplace(op_type);
+    }
+  }
+  if (skip_op_list) {  // parsed exactly like the check list above
+    std::stringstream ss(skip_op_list);
+    std::string op_type;
+    LOG(INFO) << "Please set op's name according to the "
+                 "paddle.amp.low_precision_op_list()";
+    while (std::getline(ss, op_type, ',')) {
+      nan_inf_skip_op_list().emplace(op_type);
+    }
+  }
+  for (auto const& key : nan_inf_check_op_list()) {  // one INFO line per configured entry
+    LOG(INFO) << "Check nan inf op list: " << key;
+  }
+  for (auto const& key : nan_inf_skip_op_list()) {
+    LOG(INFO) << "Skip nan inf op list: " << key;
+  }
+}
+bool CheckOp(const std::string& api_name) {  // returns true when api_name should be checked, false when it is filtered out
+  if (nan_inf_skip_op_list().count("all") ||  // the literal entry "all" in the skip set filters out every op
+      nan_inf_skip_op_list().count(api_name)) {  // the skip set is consulted first, so it wins over the check set
+    VLOG(4) << "Current op is in skipped_op_list : " << api_name;
+    return false;
+  }
+  if (nan_inf_check_op_list().size() != 0 &&  // an empty check set places no restriction
+      (!nan_inf_check_op_list().count(api_name))) {
+    VLOG(4) << "Current op isn't in checked_op_list : " << api_name;
+    return false;
+  }
+  VLOG(6) << "Current check nan inf Op is : " << api_name;
+  return true;
+}
 void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
-  if (tensor.initialized()) {
+  std::call_once(dump_list_init_flag, InitDumpListFormEnv);
+  auto op_name = phi::TransToFluidOpName(api_name);
+  if (tensor.initialized() && CheckOp(op_name)) {
    auto& tensor_name = tensor.name();
    const phi::DenseTensor* dense_tensor{nullptr};
    if (tensor.is_dense_tensor()) {

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -198,6 +198,7 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/eager/nan_inf_utils.h"
 #include "paddle/fluid/imperative/layout_autotune.h"
 #include "paddle/fluid/prim/utils/eager/eager_tensor_operants.h"
 #include "paddle/fluid/prim/utils/static/static_tensor_operants.h"
@@ -2859,6 +2860,12 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("set_nan_inf_debug_path",
        &paddle::framework::details::SetNanInfDebugPath);
+  m.def("check_numerics",  // registers check_numerics(op_name, tensor) on module m
+        [](const std::string &op_name, const paddle::Tensor &tensor) {
+          VLOG(4) << "Check tensor whether has nan or inf.";
+          egr::CheckTensorHasNanOrInf(op_name, tensor);  // forwards both arguments unchanged; the lambda returns nothing
+        });
  BindFleetWrapper(&m);
  BindIO(&m);
  BindParallelExecutor(m);

--- a/python/paddle/fluid/tests/unittests/test_nan_inf.py
+++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py
@@ -68,6 +68,15 @@ class TestNanInfEnv(TestNanInf):
        self.env["PADDLE_INF_NAN_SKIP_VAR"] = "elementwise_add:fc_0.tmp_1"
+class TestCheckSkipEnv(TestNanInf):  # TestNanInf variant that also sets the op check-list and skip-list variables
+    def setUp(self):
+        super().setUp()
+        # Windows Python has a bug with env values, so they must be str to pass CI;
+        # otherwise: "TypeError: environment can only contain strings"
+        self.env["Paddle_check_nan_inf_op_list"] = "mean"  # NOTE(review): presumably self.env is the environment of the launched test process - confirm in TestNanInf
+        self.env["Paddle_skip_nan_inf_op_list"] = "elementwise_add"  # same variable names that InitDumpListFormEnv reads via std::getenv
 class TestNanInfCheckResult(unittest.TestCase):
    def generate_inputs(self, shape, dtype="float32"):
        data = np.random.random(size=shape).astype(dtype)
@@ -159,6 +168,20 @@ class TestNanInfCheckResult(unittest.TestCase):
        if paddle.fluid.core.is_compiled_with_cuda():
            self.check_nan_inf_level(use_cuda=True, dtype="float16")
+    def test_check_numerics(self):  # calls core.check_numerics directly; this method itself contains no assert statements
+        paddle.set_flags(
+            {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3}
+        )
+        if paddle.fluid.core.is_compiled_with_cuda():  # the float16 level check only runs on CUDA builds
+            self.check_nan_inf_level(use_cuda=True, dtype="float16")
+        shape = [8, 8]
+        x_np, y_np = self.generate_inputs(shape, "float16")
+        x = paddle.to_tensor(x_np)
+        y = paddle.to_tensor(y_np)
+        paddle.fluid.core.check_numerics("check_numerics", x)  # the first argument is received as op_name by the binding
+        paddle.fluid.core.check_numerics("check_numerics", y)
 if __name__ == '__main__':
    unittest.main()