Unverified commit ec77defc authored by niuliling123, committed by GitHub

[Cherry-pick] Add enable_tensor_checker and disable_tensor_checker to api list (#52936) (#53287)

Add enable_tensor_checker and disable_tensor_checker APIs (#52936)
Parent ba415ee7
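
For orientation, a minimal usage sketch of the two new APIs. This is pieced together from the test_tensor_checker.py file added in this diff, not an authoritative reference; the config fields and op names are copied from that test.

```python
import paddle

# Mirrors the TensorCheckerConfig used in test_tensor_checker.py below.
checker_config = paddle.amp.debugging.TensorCheckerConfig(
    enable=True,
    debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
    checked_op_list=["elementwise_pow_grad"],  # only these ops are checked
    skipped_op_list=["elementwise_div"],       # these ops are never checked
    debug_step=[0, 3],                         # check steps in [0, 3) only
)

paddle.amp.debugging.enable_tensor_checker(checker_config)
try:
    x = paddle.to_tensor([1.0, 0.0, 0.0], stop_gradient=False)
    y = paddle.to_tensor([0.0, 0.0, 1.0])
    res = paddle.pow(x, y)           # forward output is finite,
    paddle.autograd.backward([res])  # but the pow grad evaluates 0**-1 and aborts
except Exception as e:
    print(e)  # the error message carries num_nan=... / num_inf=... counts
paddle.amp.debugging.disable_tensor_checker()
```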
@@ -24,8 +24,6 @@
DECLARE_int32(check_nan_inf_level);
namespace egr {
static std::once_flag dump_list_init_flag;
static std::unordered_set<std::string>& nan_inf_check_op_list() {
static std::unordered_set<std::string> _check_op_list = {};
return _check_op_list;
@@ -36,39 +34,32 @@ static std::unordered_set<std::string>& nan_inf_skip_op_list() {
return _skip_op_list;
}
static void InitDumpListFormEnv() {
void SetCheckOpList(const std::string& check_op_list = "") {
nan_inf_check_op_list();
nan_inf_skip_op_list();
const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list");
const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list");
if (check_op_list) {
if (check_op_list.size() != 0) {
std::stringstream ss(check_op_list);
std::string op_type;
LOG(INFO) << "Please set op's name according to the "
"paddle.amp.low_precision_op_list()";
while (std::getline(ss, op_type, ',')) {
nan_inf_check_op_list().emplace(op_type);
VLOG(4) << "Check nan inf op list: " << op_type;
}
}
}
if (skip_op_list) {
void SetSkipOpList(const std::string& skip_op_list = "") {
nan_inf_skip_op_list();
if (skip_op_list.size() != 0) {
std::stringstream ss(skip_op_list);
std::string op_type;
LOG(INFO) << "Please set op's name according to the "
"paddle.amp.low_precision_op_list()";
while (std::getline(ss, op_type, ',')) {
nan_inf_skip_op_list().emplace(op_type);
VLOG(4) << "Skip nan inf op list: " << op_type;
}
}
for (auto const& key : nan_inf_check_op_list()) {
LOG(INFO) << "Check nan inf op list: " << key;
}
for (auto const& key : nan_inf_skip_op_list()) {
LOG(INFO) << "Skip nan inf op list: " << key;
}
}
bool CheckOp(const std::string& api_name) {
@@ -89,7 +80,6 @@ bool CheckOp(const std::string& api_name) {
}
void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
std::call_once(dump_list_init_flag, InitDumpListFormEnv);
auto op_name = phi::TransToFluidOpName(api_name);
if (tensor.initialized() && CheckOp(op_name)) {
auto& tensor_name = tensor.name();
......
@@ -65,6 +65,10 @@ void CheckTensorHasNanOrInf(
void CheckTensorHasNanOrInf(const std::string& api_name,
const TupleOfTensorAndVector& tensors);
void SetCheckOpList(const std::string& check_op_list);
void SetSkipOpList(const std::string& skip_op_list);
void CheckTensorHasNanOrInf(
const std::string& api_name,
const paddle::small_vector<std::vector<paddle::Tensor>,
......
@@ -30,6 +30,7 @@ namespace details {
struct DebugTools {
DebugTools() {}
std::string path = "";
int stack_limit = 1;
};
static DebugTools debug_nan_inf;
@@ -45,6 +46,13 @@ std::string GetNanPath() {
return debug_nan_inf.path + "/";
}
void SetNanInfStackLimit(const int& stack_limit) {
debug_nan_inf.stack_limit = stack_limit;
VLOG(4) << "Set the stack limit of debug tools : " << stack_limit;
}
int GetNanInfStackLimit() { return debug_nan_inf.stack_limit; }
static std::once_flag white_list_init_flag;
static int op_role_nan_inf_white_list = 0;
......
@@ -516,7 +516,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
check_nan_inf_level,
nan_inf_zero_tensor.data<int64_t>());
if (check_nan_inf_level == 0) {
if (check_nan_inf_level == 0 && GetNanInfStackLimit() > 0) {
auto nan_cpu =
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 3);
int64_t* nan_cpu_ptr = reinterpret_cast<int64_t*>(nan_cpu->ptr());
......
@@ -40,6 +40,10 @@ void SetNanInfDebugPath(const std::string& nan_inf_path);
std::string GetNanPath();
void SetNanInfStackLimit(const int& stack_limit);
int GetNanInfStackLimit();
template <typename T,
typename MT,
std::enable_if_t<std::is_same<T, float>::value, bool> = true>
......
@@ -2670,9 +2670,21 @@ All parameter, weight, gradient are variables in Paddle.
m.def("use_layout_autotune",
[] { return egr::Controller::Instance().UseLayoutAutoTune(); });
// Add the api for nan op debug
m.def("set_nan_inf_stack_limit",
&paddle::framework::details::SetNanInfStackLimit);
// Add the api for nan op debug
m.def("set_nan_inf_debug_path",
&paddle::framework::details::SetNanInfDebugPath);
// Add checked op list
m.def("set_checked_op_list",
[](const std::string &op_list) { egr::SetCheckOpList(op_list); });
// Add skipped op list
m.def("set_skipped_op_list",
[](const std::string &op_list) { egr::SetSkipOpList(op_list); });
m.def("check_numerics",
[](const std::string &op_name, const paddle::Tensor &tensor) {
VLOG(4) << "Check tensor whether has nan or inf.";
......
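
The four bindings above surface on paddle.fluid.core, which is how the updated tests below reach them. A hedged sketch of driving them directly, without the TensorCheckerConfig wrapper; the comma-separated format follows the std::getline(..., ',') loops in SetCheckOpList/SetSkipOpList above, and the stack-limit semantics are assumed from the GetNanInfStackLimit() > 0 guard in the CUDA hunk:

```python
import paddle

core = paddle.fluid.core

# Op lists are comma-separated strings, split on ',' by SetCheckOpList/SetSkipOpList.
core.set_checked_op_list("elementwise_pow,elementwise_pow_grad")
core.set_skipped_op_list("elementwise_div")

core.set_nan_inf_debug_path("nan_inf_log_dir")  # write per-op NaN/Inf logs here
core.set_nan_inf_stack_limit(0)  # assumption: <= 0 suppresses the stack/count dump

# The bindings only configure the checker; checking is still switched on via
# the flags the tests below set.
paddle.set_flags({"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 0})
```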
This diff is collapsed.
@@ -78,7 +78,14 @@ class TestNanInfDirCheckResult(unittest.TestCase):
def test_num_nan_inf(self):
path = "nan_inf_log_dir"
paddle.fluid.core.set_nan_inf_debug_path(path)
checker_config = paddle.amp.debugging.TensorCheckerConfig(
enable=True,
debug_mode=paddle.amp.debugging.DebugMode.CHECK_ALL,
output_dir=path,
)
paddle.amp.debugging.enable_tensor_checker(checker_config)
def _check_num_nan_inf(use_cuda):
shape = [32, 32]
@@ -86,145 +93,25 @@
num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np)
add_assert = (num_nan_np + num_inf_np) > 0
num_nan, num_inf = self.get_num_nan_inf(
x_np, use_cuda, add_assert, path
x_np,
use_cuda,
add_assert,
path,
)
if not use_cuda:
assert num_nan == num_nan_np and num_inf == num_inf_np
paddle.set_flags(
{"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3}
)
_check_num_nan_inf(use_cuda=False)
if paddle.fluid.core.is_compiled_with_cuda():
_check_num_nan_inf(use_cuda=True)
else:
_check_num_nan_inf(use_cuda=False)
x = paddle.to_tensor([2, 3, 4], 'float32')
y = paddle.to_tensor([1, 5, 2], 'float32')
z = paddle.add(x, y)
path = ""
paddle.fluid.core.set_nan_inf_debug_path(path)
def test_nan_inf_op(self):
import paddle
num_nan = 0
num_inf = 0
# check op list
x = paddle.to_tensor(
[1, 0, 1],
place=paddle.CPUPlace(),
dtype='float32',
stop_gradient=False,
)
y = paddle.to_tensor(
[0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
)
try:
res = paddle.pow(x, y)
except Exception as e:
# Cannot catch the log in CUDA kernel.
err_str_list = (
str(e)
.replace("(", " ")
.replace(")", " ")
.replace(",", " ")
.split(" ")
)
for err_str in err_str_list:
if "num_nan" in err_str:
num_nan = int(err_str.split("=")[1])
elif "num_inf" in err_str:
num_inf = int(err_str.split("=")[1])
print(
"[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
num_nan, num_inf
)
)
return num_inf
def test_check_op_list(self):
import paddle
num_nan = 0
num_inf = 0
checker_config = paddle.amp.debugging.TensorCheckerConfig(
enable=True,
debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
skipped_op_list=["elementwise_div"],
)
x = paddle.to_tensor(
[0, 0, 0],
place=paddle.CPUPlace(),
dtype='float32',
stop_gradient=False,
)
y = paddle.to_tensor(
[0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
)
paddle.amp.debugging.enable_tensor_checker(checker_config)
try:
res = paddle.divide(y, x)
except Exception as e:
# Cannot catch the log in CUDA kernel.
err_str_list = (
str(e)
.replace("(", " ")
.replace(")", " ")
.replace(",", " ")
.split(" ")
)
for err_str in err_str_list:
if "num_nan" in err_str:
num_nan = int(err_str.split("=")[1])
elif "num_inf" in err_str:
num_inf = int(err_str.split("=")[1])
print(
"[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
num_nan, num_inf
)
)
paddle.amp.debugging.enable_tensor_checker(checker_config)
def test_tensor_checker(self):
import paddle
def _assert_flag(value):
flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
res = paddle.get_flags(flags)
assert res["FLAGS_check_nan_inf"] == value
paddle.set_flags({"FLAGS_check_nan_inf": 0})
paddle.seed(102)
checker_config = paddle.amp.debugging.TensorCheckerConfig(
enable=True,
debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
checked_op_list=["elementwise_pow"],
skipped_op_list=["elementwise_add"],
debug_step=[0, 3],
)
# check seed
assert checker_config.initial_seed == 102
assert checker_config.seed == 102
_assert_flag(False)
for index in range(5):
paddle.amp.debugging.enable_tensor_checker(checker_config)
if index <= 2:
_assert_flag(True)
assert (
index + 1
== paddle.amp.debugging.TensorCheckerConfig.Current_step_id
)
assert 1 == self.test_nan_inf_op()
else:
assert (
3
== paddle.amp.debugging.TensorCheckerConfig.Current_step_id
)
_assert_flag(False)
assert 0 == self.test_nan_inf_op()
paddle.amp.debugging.disable_tensor_checker()
_assert_flag(False)
paddle.amp.debugging.disable_tensor_checker()
if __name__ == '__main__':
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
class TestTensorChecker(unittest.TestCase):
def get_num_inf(self, e):
num_nan = 0
num_inf = 0
# Cannot catch the log in CUDA kernel.
err_str_list = (
str(e)
.replace("(", " ")
.replace(")", " ")
.replace(",", " ")
.split(" ")
)
for err_str in err_str_list:
if "num_nan" in err_str:
num_nan = int(err_str.split("=")[1])
elif "num_inf" in err_str:
num_inf = int(err_str.split("=")[1])
print(
"[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
num_nan, num_inf
)
)
return num_nan
def generate_num_inf(self, place):
num_inf = 0
num_nan = 0
paddle.set_device(place)
# check op list
x = paddle.to_tensor(
[1, 0, 0],
dtype='float32',
stop_gradient=False,
)
y = paddle.to_tensor([0, 0, 1], dtype='float32')
try:
res = paddle.pow(x, y)
# test backward
paddle.autograd.backward([res])
res = paddle.divide(y, x)
except Exception as e:
num_inf = self.get_num_inf(e)
return num_inf
def test_tensor_checker(self):
def _assert_flag(value):
flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
res = paddle.get_flags(flags)
assert res["FLAGS_check_nan_inf"] == value
paddle.set_flags({"FLAGS_check_nan_inf": 0})
paddle.seed(102)
checker_config = paddle.amp.debugging.TensorCheckerConfig(
enable=True,
debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
checked_op_list=["elementwise_pow_grad"],
skipped_op_list=["elementwise_div"],
debug_step=[0, 3],
)
places = ['cpu']
if paddle.is_compiled_with_cuda():
places.append('gpu')
# check seed
self.assertEqual(checker_config.initial_seed, 102)
self.assertEqual(checker_config.seed, 102)
_assert_flag(False)
for place in places:
paddle.amp.debugging.TensorCheckerConfig.current_step_id = 0
for index in range(5):
paddle.amp.debugging.enable_tensor_checker(checker_config)
if index <= 2:
_assert_flag(True)
self.assertEqual(
index + 1,
paddle.amp.debugging.TensorCheckerConfig.current_step_id,
)
self.assertEqual(1, self.generate_num_inf(place))
else:
self.assertEqual(
3,
paddle.amp.debugging.TensorCheckerConfig.current_step_id,
)
_assert_flag(False)
self.assertEqual(0, self.generate_num_inf(place))
paddle.amp.debugging.disable_tensor_checker()
_assert_flag(False)
if __name__ == '__main__':
unittest.main()