未验证 提交 7067763e 编写于 作者: N niuliling123 提交者: GitHub

Support op check list and op skip in check_nan_inf_tools (#51998)

上级 99c1f54b
......@@ -20,10 +20,78 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/compat/convert_utils.h"
DECLARE_int32(check_nan_inf_level);
namespace egr {
// Passed to std::call_once in CheckTensorHasNanOrInf so that
// InitDumpListFormEnv is executed only once.
static std::once_flag dump_list_init_flag;
// Returns the mutable set of op names that CheckOp treats as the check list.
// The set is a function-local static: it starts out empty and the same
// instance is handed back on every call.
static std::unordered_set<std::string>& nan_inf_check_op_list() {
  static std::unordered_set<std::string> check_ops;
  return check_ops;
}
// Returns the mutable set of op names that CheckOp treats as the skip list.
// The set is a function-local static: it starts out empty and the same
// instance is handed back on every call.
static std::unordered_set<std::string>& nan_inf_skip_op_list() {
  static std::unordered_set<std::string> skip_ops;
  return skip_ops;
}
// Splits the comma-separated `raw_list` and inserts each entry into `op_set`.
// Leading/trailing whitespace is stripped from every entry, and entries that
// are empty after stripping are dropped, so a value such as
// "mean, elementwise_add" yields "mean" and "elementwise_add" rather than
// "mean" and " elementwise_add" (which could never match an op name).
static void ParseNanInfOpList(const char* raw_list,
                              std::unordered_set<std::string>* op_set) {
  static const char kBlank[] = " \t\r\n";
  std::stringstream ss(raw_list);
  std::string op_type;
  while (std::getline(ss, op_type, ',')) {
    const auto first = op_type.find_first_not_of(kBlank);
    if (first == std::string::npos) {
      continue;  // Empty or whitespace-only entry, nothing to register.
    }
    const auto last = op_type.find_last_not_of(kBlank);
    op_set->emplace(op_type.substr(first, last - first + 1));
  }
}

// Fills the check list and the skip list from the environment variables
// "Paddle_check_nan_inf_op_list" and "Paddle_skip_nan_inf_op_list".
// Each variable, when set, is read as a comma-separated list of op names.
// The resulting content of both lists is logged at INFO level.
static void InitDumpListFormEnv() {
  auto& check_ops = nan_inf_check_op_list();
  auto& skip_ops = nan_inf_skip_op_list();
  const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list");
  const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list");

  if (check_op_list) {
    LOG(INFO) << "Please set op's name according to the "
                 "paddle.amp.low_precision_op_list()";
    ParseNanInfOpList(check_op_list, &check_ops);
  }

  if (skip_op_list) {
    LOG(INFO) << "Please set op's name according to the "
                 "paddle.amp.low_precision_op_list()";
    ParseNanInfOpList(skip_op_list, &skip_ops);
  }

  for (auto const& key : check_ops) {
    LOG(INFO) << "Check nan inf op list: " << key;
  }

  for (auto const& key : skip_ops) {
    LOG(INFO) << "Skip nan inf op list: " << key;
  }
}
// Decides whether the op called `api_name` should be checked.
// Returns false when the skip list contains "all" or `api_name`, or when the
// check list is non-empty and does not contain `api_name`; returns true in
// every other case.
bool CheckOp(const std::string& api_name) {
  const auto& skipped_ops = nan_inf_skip_op_list();
  const bool is_skipped = skipped_ops.find("all") != skipped_ops.end() ||
                          skipped_ops.find(api_name) != skipped_ops.end();
  if (is_skipped) {
    VLOG(4) << "Current op is in skipped_op_list : " << api_name;
    return false;
  }

  // An empty check list places no restriction on which ops are checked.
  const auto& checked_ops = nan_inf_check_op_list();
  const bool is_filtered_out =
      !checked_ops.empty() && checked_ops.find(api_name) == checked_ops.end();
  if (is_filtered_out) {
    VLOG(4) << "Current op isn't in checked_op_list : " << api_name;
    return false;
  }

  VLOG(6) << "Current check nan inf Op is : " << api_name;
  return true;
}
void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
if (tensor.initialized()) {
std::call_once(dump_list_init_flag, InitDumpListFormEnv);
auto op_name = phi::TransToFluidOpName(api_name);
if (tensor.initialized() && CheckOp(op_name)) {
auto& tensor_name = tensor.name();
const phi::DenseTensor* dense_tensor{nullptr};
if (tensor.is_dense_tensor()) {
......
......@@ -198,6 +198,7 @@ limitations under the License. */
#endif
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/nan_inf_utils.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/fluid/prim/utils/eager/eager_tensor_operants.h"
#include "paddle/fluid/prim/utils/static/static_tensor_operants.h"
......@@ -2859,6 +2860,12 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_nan_inf_debug_path",
&paddle::framework::details::SetNanInfDebugPath);
// Registers "check_numerics" on the module `m`: the bound callable takes an
// op name and a tensor and forwards both to egr::CheckTensorHasNanOrInf.
m.def("check_numerics",
[](const std::string &op_name, const paddle::Tensor &tensor) {
VLOG(4) << "Check tensor whether has nan or inf.";
egr::CheckTensorHasNanOrInf(op_name, tensor);
});
BindFleetWrapper(&m);
BindIO(&m);
BindParallelExecutor(m);
......
......@@ -68,6 +68,15 @@ class TestNanInfEnv(TestNanInf):
self.env["PADDLE_INF_NAN_SKIP_VAR"] = "elementwise_add:fc_0.tmp_1"
# Variant of TestNanInf whose setUp additionally puts the op check-list and
# op skip-list variables into self.env.
class TestCheckSkipEnv(TestNanInf):
def setUp(self):
super().setUp()
# Python on Windows has a problem with the env mapping: the values must be
# str to pass CI, otherwise it raises
# "TypeError: environment can only contain strings".
self.env["Paddle_check_nan_inf_op_list"] = "mean"
self.env["Paddle_skip_nan_inf_op_list"] = "elementwise_add"
class TestNanInfCheckResult(unittest.TestCase):
def generate_inputs(self, shape, dtype="float32"):
data = np.random.random(size=shape).astype(dtype)
......@@ -159,6 +168,20 @@ class TestNanInfCheckResult(unittest.TestCase):
if paddle.fluid.core.is_compiled_with_cuda():
self.check_nan_inf_level(use_cuda=True, dtype="float16")
def test_check_numerics(self):
# Exercises paddle.fluid.core.check_numerics with FLAGS_check_nan_inf set to 1
# and FLAGS_check_nan_inf_level set to 3. No return value is asserted here;
# the calls are only expected to complete.
paddle.set_flags(
{"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3}
)
# On CUDA builds, also run the float16 level check.
if paddle.fluid.core.is_compiled_with_cuda():
self.check_nan_inf_level(use_cuda=True, dtype="float16")
shape = [8, 8]
x_np, y_np = self.generate_inputs(shape, "float16")
x = paddle.to_tensor(x_np)
y = paddle.to_tensor(y_np)
# The first argument is the op name passed on to the checker.
paddle.fluid.core.check_numerics("check_numerics", x)
paddle.fluid.core.check_numerics("check_numerics", y)
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册