未验证 提交 28de4558 编写于 作者: N niuliling123 提交者: GitHub

Add TensorCheckerConfig for debugging tools (#51906)

上级 2aaed989
......@@ -13,8 +13,14 @@
# limitations under the License.
import contextlib
import os
import random
from enum import Enum
import numpy as np
import paddle
from paddle.fluid import core
from paddle.fluid.framework import dygraph_only
__all__ = [
......@@ -24,6 +30,217 @@ __all__ = [
]
class DebugMode(Enum):
    """Checking levels understood by the tensor checker.

    The integer value of the selected member is what
    ``TensorCheckerConfig`` writes into ``FLAGS_check_nan_inf_level``.
    """

    # Report tensors containing NaN/Inf and interrupt the program.
    CHECK_NAN_INF_AND_ABORT = 0

    # Report tensors containing NaN/Inf but keep running.
    CHECK_NAN_INF = 1

    # Check FP32 operator outputs for values outside the FP16 range.
    CHECK_ALL_FOR_OVERFLOW = 2

    # Report key information for the outputs of all operators.
    CHECK_ALL = 3

    # Like CHECK_ALL, and interrupt the program if NaN/Inf occurs.
    CHECK_ALL_AND_ABORT = 4

    # Save all tensor data; nothing is printed on the terminal.
    DUMP_ALL = 5
class TensorCheckerConfig:
    """
    Collect the config for checking nan and inf in module or op tensor.

    Args:
        enable (bool): Whether to enable the tensor value checker. This
            argument is required. When it is false, ``check()`` always
            returns False and ``run()`` does nothing.
        debug_mode (DebugMode): The debug mode; there are 6 kinds.
            CHECK_NAN_INF_AND_ABORT (default): Print or save Tensor key
            information with NaN/Inf and interrupt the program.
            CHECK_NAN_INF: Print or save Tensor critical information with
            NaN/Inf, but continue to run.
            CHECK_ALL_AND_ABORT: Print or save the output Tensor key
            information of all operators, and interrupt the program if
            NaN/Inf occurs.
            CHECK_ALL_FOR_OVERFLOW: Check the output of the FP32 operator,
            print or save key Tensor information that exceeds the FP16
            representation range (overflow, underflow).
            CHECK_ALL: Print or save output Tensor key information for all
            operators.
            DUMP_ALL: Save all Tensor data. This mode does not print on the
            terminal.
        dump_dir (str|None): The collection data storage path. If it is
            None, results are printed to the terminal.
        checked_op_list (list|tuple|None): Operators to check. The names are
            exported, comma separated, through the environment variable
            ``Paddle_check_nan_inf_op_list``.
        skipped_op_list (list|tuple|None): Operators to skip. The names are
            exported, comma separated, through the environment variable
            ``Paddle_skip_nan_inf_op_list``.
        debug_step (list|tuple|None): A ``[start, end)`` pair giving the
            range of steps to check, where one step is one call to
            ``check()``. ``end`` must be greater than ``start``; a negative
            ``start`` is clamped to 0. None means every step is checked.
        stack_height_limit (int): Value written to ``FLAGS_call_stack_level``
            when checking is switched on.
        enable_traceback_filtering (bool): Whether to filter the traceback.
            The value is only stored on the instance by this class.

    Raises:
        ValueError: If ``debug_step``, ``checked_op_list`` or
            ``skipped_op_list`` is neither a list nor a tuple, or if
            ``debug_mode`` is not a ``DebugMode`` member.
        AssertionError: If ``debug_step`` does not hold exactly two values
            with the second greater than the first.

    Examples:
        .. code-block:: python

            import paddle

            checker_config = paddle.amp.debugging.TensorCheckerConfig(
                enable=True,
                debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
            )
            paddle.amp.debugging.enable_tensor_checker(checker_config)
            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
            res = paddle.pow(x, y)
            paddle.autograd.backward(res, retain_graph=True)
            paddle.amp.debugging.disable_tensor_checker()
    """

    # For module debugging. This counter lives on the class, so it is shared
    # by every TensorCheckerConfig instance in the process.
    Current_step_id = 0

    def __init__(
        self,
        enable,
        debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT,
        dump_dir=None,
        checked_op_list=None,
        skipped_op_list=None,
        debug_step=None,
        stack_height_limit=3,
        enable_traceback_filtering=False,
    ):
        self.enable = enable
        self.debug_mode = debug_mode
        self.dump_dir = dump_dir

        self.checked_op_list = checked_op_list
        self.skipped_op_list = skipped_op_list

        self.debug_step = debug_step
        self.stack_height_limit = stack_height_limit

        self.enable_traceback_filtering = enable_traceback_filtering

        self.start_step = None
        self.end_step = None

        self.seed = 123
        self.initial_seed = 123

        # check debug_step
        if debug_step is not None:
            if not isinstance(debug_step, (tuple, list)):
                raise ValueError("debug_step must be list or tuple")
            # Kept as an assert so existing callers still see AssertionError.
            assert (
                len(self.debug_step) == 2
                and self.debug_step[1] > self.debug_step[0]
            ), "debug_step must be [start, end] with end > start"
            self.start_step, self.end_step = self.debug_step
            self.start_step = max(self.start_step, 0)

        if core.is_compiled_with_cuda():
            for i in range(core.get_cuda_device_count()):
                self.initial_seed = core.default_cuda_generator(
                    i
                ).initial_seed()
        elif core.is_compiled_with_xpu():
            for i in range(core.get_xpu_device_count()):
                self.initial_seed = core.default_xpu_generator(i).initial_seed()

        # NOTE(review): this unconditionally replaces whatever the device
        # loops above stored, so the CPU generator's seed always wins.
        # Behavior is preserved as-is; confirm whether that is intended.
        self.initial_seed = core.default_cpu_generator().initial_seed()

        # check debug_mode
        # An isinstance test reports bad input as ValueError; reading
        # ``.name`` on a non-enum value would fail with AttributeError first.
        if not isinstance(self.debug_mode, DebugMode):
            raise ValueError(
                "debug_mode must be a member of DebugMode {}, but got {!r}".format(
                    list(DebugMode.__members__), self.debug_mode
                )
            )

        # check checked_op_list
        self._export_op_list(
            self.checked_op_list,
            "Paddle_check_nan_inf_op_list",
            "checked_op_list",
        )

        # check skipped_op_list
        self._export_op_list(
            self.skipped_op_list,
            "Paddle_skip_nan_inf_op_list",
            "skipped_op_list",
        )

        if self.enable:
            self._set_seed(self.enable)

    @staticmethod
    def _export_op_list(op_list, env_name, arg_name):
        """Validate an operator list and publish it in ``os.environ``.

        Does nothing when ``op_list`` is None. Raises ValueError when it is
        neither a list nor a tuple.
        """
        if op_list is None:
            return
        if not isinstance(op_list, (list, tuple)):
            raise ValueError(arg_name + " must be list or tuple")
        os.environ[env_name] = ",".join(op_list)

    def keep_random(self, seed, flag):
        """Seed paddle, numpy and ``random`` and set the deterministic flags.

        Args:
            seed (int): Seed stored on the instance and applied to all three
                random number sources.
            flag (bool): Value written to ``FLAGS_cpu_deterministic`` and,
                on CUDA builds, to ``FLAGS_cudnn_deterministic``.
        """
        # get random seed
        self.seed = seed
        paddle.seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)

        # set cudnn and cpu
        if core.is_compiled_with_cuda():
            paddle.set_flags({"FLAGS_cudnn_deterministic": flag})
        paddle.set_flags({"FLAGS_cpu_deterministic": flag})

        # info
        print("AMP Debugging TensorCheckerConfig: seed ", self.seed)
        print(
            "AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ",
            flag,
        )
        print(
            "AMP Debugging TensorCheckerConfig: FLAGS_cpu_deterministic is ",
            flag,
        )

    def _set_seed(self, enable):
        """Adopt the generator's initial seed and apply it via keep_random.

        A seed outside ``[0, 2**32 - 1]`` is replaced by 123 after printing
        a warning. The ``enable`` argument is not used by this method.
        """
        self.seed = self.initial_seed

        if self.seed > 4294967295 or self.seed < 0:
            print("[Warnning: Seed must be between 0 and 2**32 - 1")
            self.seed = 123

        self.keep_random(self.seed, True)

    def _set_env(self, check_flag):
        """Write the checker settings into the paddle flags.

        ``FLAGS_check_nan_inf`` is always set to ``check_flag``. The debug
        level, dump path and call stack level are only written when
        ``check_flag`` is true.

        Raises:
            ValueError: If ``stack_height_limit`` is not an int.
        """
        paddle.set_flags({"FLAGS_check_nan_inf": check_flag})
        if not check_flag:
            return

        # set debug level
        paddle.set_flags({"FLAGS_check_nan_inf_level": self.debug_mode.value})

        # set output_dir
        if self.dump_dir is not None:
            paddle.fluid.core.set_nan_inf_debug_path(self.dump_dir)

        # set stack_height_limit
        if not isinstance(self.stack_height_limit, int):
            raise ValueError("stack_height_limit must be int")
        paddle.set_flags({"FLAGS_call_stack_level": self.stack_height_limit})

    def check(self):
        """Advance the step counter and report whether this step is checked.

        Returns:
            bool: False when the checker is disabled. True when no
            ``debug_step`` range was given. Otherwise True only while the
            current step lies in ``[start_step, end_step)``.
        """
        if not self.enable:
            return False
        if self.start_step is None or self.end_step is None:
            return True

        step_id = TensorCheckerConfig.Current_step_id
        if step_id >= self.end_step:
            # The window is over; the counter stays at end_step.
            return False

        # Count steps that come before the window as well. If the counter
        # only moved inside the window, a start_step greater than 0 could
        # never be reached.
        TensorCheckerConfig.Current_step_id = step_id + 1
        return step_id >= self.start_step

    def run(self):
        """Switch checking on through ``_set_env`` if the config is enabled."""
        if self.enable:
            self._set_env(self.enable)

    def end(self):
        """Switch checking off by setting ``FLAGS_check_nan_inf`` to False."""
        self._set_env(False)
def _get_operator_stats_flag():
    """Return the current value of ``FLAGS_low_precision_op_list``."""
    flag_name = "FLAGS_low_precision_op_list"
    return paddle.get_flags([flag_name])[flag_name]
......@@ -188,3 +405,61 @@ def collect_operator_stats():
enable_operator_stats_collection()
yield
disable_operator_stats_collection()
def enable_tensor_checker(checker_config):
    """
    Enable model level accuracy checking. It is used together with
    disable_tensor_checker(): the output Tensors of all operators executed
    between the two calls are checked.

    The function asks ``checker_config.check()`` whether checking applies
    now. If it does, ``checker_config.run()`` is called; otherwise
    ``checker_config.end()`` is called.

    Args:
        checker_config: The configuration object; it must provide
            ``check()``, ``run()`` and ``end()``.

    Attention:
        * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked;
        * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked

    Examples:
        .. code-block:: python

            import paddle

            checker_config = paddle.amp.debugging.TensorCheckerConfig(
                enable=True,
                debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
            )
            paddle.amp.debugging.enable_tensor_checker(checker_config)
            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
            res = paddle.pow(x, y)
            paddle.autograd.backward(res, retain_graph=True)
            paddle.amp.debugging.disable_tensor_checker()
    """
    if not checker_config.check():
        checker_config.end()
        return
    checker_config.run()
def disable_tensor_checker():
    """
    Disable accuracy checking by setting ``FLAGS_check_nan_inf`` to 0. It is
    used together with enable_tensor_checker(config): the output Tensors of
    all operators executed between the two calls are checked.

    Attention:
        * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked;
        * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked

    Examples:
        .. code-block:: python

            import paddle

            checker_config = paddle.amp.debugging.TensorCheckerConfig(
                enable=True,
                debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
            )
            paddle.amp.debugging.enable_tensor_checker(checker_config)
            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
            res = paddle.pow(x, y)
            paddle.autograd.backward(res, retain_graph=True)
            paddle.amp.debugging.disable_tensor_checker()
    """
    nan_inf_off = {"FLAGS_check_nan_inf": 0}
    paddle.set_flags(nan_inf_off)
......@@ -100,6 +100,131 @@ class TestNanInfDirCheckResult(unittest.TestCase):
x = paddle.to_tensor([2, 3, 4], 'float32')
y = paddle.to_tensor([1, 5, 2], 'float32')
z = paddle.add(x, y)
path = ""
paddle.fluid.core.set_nan_inf_debug_path(path)
def test_nan_inf_op(self):
    """Run ``paddle.pow`` on fixed CPU inputs and return the reported inf count.

    If the call raises, the ``num_nan=`` and ``num_inf=`` values are parsed
    out of the exception text and printed. If it does not raise, both
    counts stay 0. Only the inf count is returned.
    """
    import paddle

    counts = {"num_nan": 0, "num_inf": 0}

    # check op list
    base = paddle.to_tensor(
        [1, 0, 1],
        place=paddle.CPUPlace(),
        dtype='float32',
        stop_gradient=False,
    )
    exponent = paddle.to_tensor(
        [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
    )

    try:
        paddle.pow(base, exponent)
    except Exception as err:
        # Cannot catch the log in CUDA kernel.
        # Turn "(", ")" and "," into spaces so every "key=value" pair
        # becomes a token of its own after splitting on spaces.
        separators = str.maketrans("(),", "   ")
        for token in str(err).translate(separators).split(" "):
            if "num_nan" in token:
                counts["num_nan"] = int(token.split("=")[1])
            elif "num_inf" in token:
                counts["num_inf"] = int(token.split("=")[1])
        print(
            "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
                counts["num_nan"], counts["num_inf"]
            )
        )
    return counts["num_inf"]
def test_check_op_list(self):
    """Run a division by zero while ``elementwise_div`` is in skipped_op_list.

    If the division raises, the NaN/Inf counts found in the exception text
    are printed. The checker is switched off again when the test finishes,
    so ``FLAGS_check_nan_inf`` does not stay on for later tests.
    """
    import paddle

    num_nan = 0
    num_inf = 0

    checker_config = paddle.amp.debugging.TensorCheckerConfig(
        enable=True,
        debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
        skipped_op_list=["elementwise_div"],
    )

    x = paddle.to_tensor(
        [0, 0, 0],
        place=paddle.CPUPlace(),
        dtype='float32',
        stop_gradient=False,
    )
    y = paddle.to_tensor(
        [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
    )

    paddle.amp.debugging.enable_tensor_checker(checker_config)
    try:
        paddle.divide(y, x)
    except Exception as e:
        # Cannot catch the log in CUDA kernel.
        err_str_list = (
            str(e)
            .replace("(", " ")
            .replace(")", " ")
            .replace(",", " ")
            .split(" ")
        )
        for err_str in err_str_list:
            if "num_nan" in err_str:
                num_nan = int(err_str.split("=")[1])
            elif "num_inf" in err_str:
                num_inf = int(err_str.split("=")[1])
        print(
            "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
                num_nan, num_inf
            )
        )
    finally:
        # The test used to end with a second enable_tensor_checker() call,
        # which left the checker switched on. Pair the enable above with a
        # disable instead.
        paddle.amp.debugging.disable_tensor_checker()
def test_tensor_checker(self):
    """Check seeding, the debug_step window and flag toggling of the checker.

    With ``debug_step=[0, 3]`` the first three enable calls must switch
    ``FLAGS_check_nan_inf`` on and advance the step counter. Later calls
    must leave the flag off and the counter at 3.
    """
    import paddle

    debugging = paddle.amp.debugging

    def _assert_flag(value):
        flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
        res = paddle.get_flags(flags)
        assert res["FLAGS_check_nan_inf"] == value

    paddle.set_flags({"FLAGS_check_nan_inf": 0})
    paddle.seed(102)

    # The step counter is class-level state shared by every config. Reset
    # it so the absolute values asserted below do not depend on which
    # tests ran earlier.
    debugging.TensorCheckerConfig.Current_step_id = 0

    checker_config = debugging.TensorCheckerConfig(
        enable=True,
        debug_mode=debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
        checked_op_list=["elementwise_pow"],
        skipped_op_list=["elementwise_add"],
        debug_step=[0, 3],
    )

    # check seed
    assert checker_config.initial_seed == 102
    assert checker_config.seed == 102
    _assert_flag(False)

    for index in range(5):
        debugging.enable_tensor_checker(checker_config)
        if index <= 2:
            # Inside the window: checker is on and the counter advanced.
            _assert_flag(True)
            assert index + 1 == debugging.TensorCheckerConfig.Current_step_id
            assert 1 == self.test_nan_inf_op()
        else:
            # Past the window: counter stays at end_step, checker is off.
            assert 3 == debugging.TensorCheckerConfig.Current_step_id
            _assert_flag(False)
            assert 0 == self.test_nan_inf_op()

        debugging.disable_tensor_checker()
        _assert_flag(False)
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册