From 7067763eefbd6e0747cb98eadbed3abf9d8c6762 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Wed, 29 Mar 2023 11:05:20 +0800 Subject: [PATCH] Support op check list and op skip in check_nan_inf_tools (#51998) --- paddle/fluid/eager/nan_inf_utils.cc | 70 ++++++++++++++++++- paddle/fluid/pybind/pybind.cc | 7 ++ .../fluid/tests/unittests/test_nan_inf.py | 23 ++++++ 3 files changed, 99 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index cc09ebb6c5d..17cf8825d5c 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -20,10 +20,78 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/compat/convert_utils.h" +DECLARE_int32(check_nan_inf_level); namespace egr { +static std::once_flag dump_list_init_flag; + +static std::unordered_set<std::string>& nan_inf_check_op_list() { + static std::unordered_set<std::string> _check_op_list = {}; + return _check_op_list; +} + +static std::unordered_set<std::string>& nan_inf_skip_op_list() { + static std::unordered_set<std::string> _skip_op_list = {}; + return _skip_op_list; +} + +static void InitDumpListFormEnv() { + nan_inf_check_op_list(); + nan_inf_skip_op_list(); + const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list"); + const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list"); + + if (check_op_list) { + std::stringstream ss(check_op_list); + std::string op_type; + LOG(INFO) << "Please set op's name according to the " "paddle.amp.low_precision_op_list()"; + while (std::getline(ss, op_type, ',')) { + nan_inf_check_op_list().emplace(op_type); + } + } + + if (skip_op_list) { + std::stringstream ss(skip_op_list); + std::string op_type; + LOG(INFO) << "Please set op's name according to the " "paddle.amp.low_precision_op_list()"; + while (std::getline(ss, op_type, ',')) { + nan_inf_skip_op_list().emplace(op_type); + } + } + + for (auto 
const& key : nan_inf_check_op_list()) { + LOG(INFO) << "Check nan inf op list: " << key; + } + + for (auto const& key : nan_inf_skip_op_list()) { + LOG(INFO) << "Skip nan inf op list: " << key; + } +} + +bool CheckOp(const std::string& api_name) { + if (nan_inf_skip_op_list().count("all") || + nan_inf_skip_op_list().count(api_name)) { + VLOG(4) << "Current op is in skipped_op_list : " << api_name; + return false; + } + + if (nan_inf_check_op_list().size() != 0 && + (!nan_inf_check_op_list().count(api_name))) { + VLOG(4) << "Current op isn't in checked_op_list : " << api_name; + return false; + } + + VLOG(6) << "Current check nan inf Op is : " << api_name; + return true; +} + void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { - if (tensor.initialized()) { + std::call_once(dump_list_init_flag, InitDumpListFormEnv); + auto op_name = phi::TransToFluidOpName(api_name); + if (tensor.initialized() && CheckOp(op_name)) { auto& tensor_name = tensor.name(); const phi::DenseTensor* dense_tensor{nullptr}; if (tensor.is_dense_tensor()) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ec7d3c710c6..27b5ec65167 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -198,6 +198,7 @@ limitations under the License. */ #endif #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/prim/utils/eager/eager_tensor_operants.h" #include "paddle/fluid/prim/utils/static/static_tensor_operants.h" @@ -2859,6 +2860,12 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("set_nan_inf_debug_path", &paddle::framework::details::SetNanInfDebugPath); + m.def("check_numerics", + [](const std::string &op_name, const paddle::Tensor &tensor) { + VLOG(4) << "Check tensor whether has nan or inf."; + egr::CheckTensorHasNanOrInf(op_name, tensor); + }); + BindFleetWrapper(&m); BindIO(&m); BindParallelExecutor(m); diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 4f6c48fa47e..139bd8e9d8e 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -68,6 +68,15 @@ class TestNanInfEnv(TestNanInf): self.env["PADDLE_INF_NAN_SKIP_VAR"] = "elementwise_add:fc_0.tmp_1" +class TestCheckSkipEnv(TestNanInf): + def setUp(self): + super().setUp() + # windows python have some bug with env, so need use str to pass ci + # otherwise, "TypeError: environment can only contain strings" + self.env["Paddle_check_nan_inf_op_list"] = "mean" + self.env["Paddle_skip_nan_inf_op_list"] = "elementwise_add" + + class TestNanInfCheckResult(unittest.TestCase): def generate_inputs(self, shape, dtype="float32"): data = np.random.random(size=shape).astype(dtype) @@ -159,6 +168,20 @@ class TestNanInfCheckResult(unittest.TestCase): if paddle.fluid.core.is_compiled_with_cuda(): self.check_nan_inf_level(use_cuda=True, dtype="float16") + def test_check_numerics(self): + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3} + ) + if paddle.fluid.core.is_compiled_with_cuda(): + self.check_nan_inf_level(use_cuda=True, dtype="float16") + + shape = [8, 8] + x_np, y_np = self.generate_inputs(shape, "float16") + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + paddle.fluid.core.check_numerics("check_numerics", x) + paddle.fluid.core.check_numerics("check_numerics", y) + if __name__ == '__main__': unittest.main() -- GitLab