未验证 提交 28de4558 编写于 作者: N niuliling123 提交者: GitHub

Add TensorCheckerConfig for debugging tools (#51906)

上级 2aaed989
...@@ -13,8 +13,14 @@ ...@@ -13,8 +13,14 @@
# limitations under the License. # limitations under the License.
import contextlib import contextlib
import os
import random
from enum import Enum
import numpy as np
import paddle import paddle
from paddle.fluid import core
from paddle.fluid.framework import dygraph_only from paddle.fluid.framework import dygraph_only
__all__ = [ __all__ = [
...@@ -24,6 +30,217 @@ __all__ = [ ...@@ -24,6 +30,217 @@ __all__ = [
] ]
class DebugMode(Enum):
    """Debug levels understood by the tensor checker.

    The numeric value of a member is what ``TensorCheckerConfig`` writes to
    ``FLAGS_check_nan_inf_level``.
    """

    # Print or save key information of tensors with NaN/Inf, then abort.
    CHECK_NAN_INF_AND_ABORT = 0
    # Print or save key information of tensors with NaN/Inf, keep running.
    CHECK_NAN_INF = 1
    # Check FP32 operator outputs against the FP16 representable range.
    CHECK_ALL_FOR_OVERFLOW = 2
    # Print or save key information of the outputs of all operators.
    CHECK_ALL = 3
    # Like CHECK_ALL, but abort if NaN/Inf occurs.
    CHECK_ALL_AND_ABORT = 4
    # Save all tensor data; nothing is printed on the terminal.
    DUMP_ALL = 5


class TensorCheckerConfig:
    """
    Collect the config for checking nan and inf in module or op tensor.

    Args:
        enable: Whether to enable Tensor's value detection. When it is falsy
            these tools are never used.
        debug_mode: One of the ``DebugMode`` members.
            CHECK_NAN_INF_AND_ABORT (default): Print or save Tensor key
            information with NaN/Inf and interrupt the program.
            CHECK_NAN_INF: Print or save Tensor critical information with
            NaN/Inf, but continue to run.
            CHECK_ALL_AND_ABORT: Print or save the output Tensor key
            information of all operators, and interrupt the program if
            NaN/Inf occurs.
            CHECK_ALL_FOR_OVERFLOW: Check the output of the FP32 operator,
            print or save key Tensor information that exceeds the FP16
            representation range (overflow, underflow).
            CHECK_ALL: Print or save output Tensor key information for all
            operators.
            DUMP_ALL: Saves all Tensor data. This mode does not print on the
            terminal.
        dump_dir: The collection data storage path. If it is None, the data
            is printed to the terminal.
        checked_op_list: A list or tuple of operator names to check.
        skipped_op_list: A list or tuple of operator names to skip.
        debug_step: A list or tuple ``(start, end)`` with ``end > start``;
            checking is active for steps ``start <= step < end``. A negative
            ``start`` is clamped to 0.
        stack_height_limit: The maximum depth of the call stack printed at
            the error location. Must be an int.
        enable_traceback_filtering: Whether to filter the traceback so only
            the user code call stack is displayed. It is stored on the
            instance; no method of this class reads it.

    Raises:
        ValueError: If ``debug_step``, ``checked_op_list`` or
            ``skipped_op_list`` is not a list or tuple, or if ``debug_mode``
            is not a ``DebugMode`` member.
        AssertionError: If ``debug_step`` does not hold exactly two values
            with ``end > start``.

    Examples:
        .. code-block:: python

            import paddle

            checker_config = paddle.amp.debugging.TensorCheckerConfig(
                enable=True,
                debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
            )
            paddle.amp.debugging.enable_tensor_checker(checker_config)
            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
            res = paddle.pow(x, y)
            paddle.autograd.backward(res, retain_graph=True)
            paddle.amp.debugging.disable_tensor_checker()
    """

    # For module debugging. This counter lives on the class, so it is shared
    # by every config instance and is never reset by this class.
    Current_step_id = 0

    def __init__(
        self,
        enable,
        debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT,
        dump_dir=None,
        checked_op_list=None,
        skipped_op_list=None,
        debug_step=None,
        stack_height_limit=3,
        enable_traceback_filtering=False,
    ):
        self.enable = enable
        self.debug_mode = debug_mode
        self.dump_dir = dump_dir

        self.checked_op_list = checked_op_list
        self.skipped_op_list = skipped_op_list

        self.debug_step = debug_step
        self.stack_height_limit = stack_height_limit

        self.enable_traceback_filtering = enable_traceback_filtering

        self.start_step = None
        self.end_step = None

        self.seed = 123
        self.initial_seed = 123

        # check debug_step
        if debug_step is not None:
            if isinstance(debug_step, (tuple, list)):
                assert (
                    len(self.debug_step) == 2
                    and self.debug_step[1] > self.debug_step[0]
                ), "debug_step must be (start, end) with end > start"
                self.start_step, self.end_step = self.debug_step
                self.start_step = max(self.start_step, 0)
            else:
                raise ValueError("debug_step must be list or tuple")

        # NOTE(review): the CPU generator is read last, so its seed always
        # ends up in ``initial_seed``; the device loops are kept only to
        # preserve the existing call sequence -- confirm whether a device
        # seed was meant to take precedence.
        if core.is_compiled_with_cuda():
            for i in range(core.get_cuda_device_count()):
                self.initial_seed = core.default_cuda_generator(
                    i
                ).initial_seed()
        elif core.is_compiled_with_xpu():
            for i in range(core.get_xpu_device_count()):
                self.initial_seed = core.default_xpu_generator(i).initial_seed()

        self.initial_seed = core.default_cpu_generator().initial_seed()

        # check debug_mode; ``getattr`` turns a value without ``.name``
        # (e.g. a plain int) into the intended ValueError instead of an
        # AttributeError.
        if getattr(self.debug_mode, "name", None) not in DebugMode.__members__:
            raise ValueError(
                "debug_mode in DebugMode",
                self.debug_mode,
                DebugMode.__members__,
            )

        # check checked_op_list
        if self.checked_op_list is not None:
            self._export_op_list(
                self.checked_op_list,
                "Paddle_check_nan_inf_op_list",
                "checked_op_list",
            )

        # check skipped_op_list
        if self.skipped_op_list is not None:
            self._export_op_list(
                self.skipped_op_list,
                "Paddle_skip_nan_inf_op_list",
                "skipped_op_list",
            )

        if self.enable:
            self._set_seed(self.enable)

    @staticmethod
    def _export_op_list(op_list, env_name, arg_name):
        """Store ``op_list`` comma-joined in ``os.environ[env_name]``.

        Raises:
            ValueError: If ``op_list`` is neither a list nor a tuple.
        """
        if not isinstance(op_list, (list, tuple)):
            raise ValueError(arg_name + " must be list or tuple")
        os.environ[env_name] = ",".join(op_list)

    def keep_random(self, seed, flag):
        """Seed paddle, numpy and ``random`` with ``seed`` and set the
        deterministic flags to ``flag``."""
        # get random seed
        self.seed = seed
        paddle.seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)

        # set cudnn and cpu: the cuDNN flag is only touched on CUDA builds,
        # the CPU flag is applied on every build so that it agrees with the
        # message printed below.
        if core.is_compiled_with_cuda():
            paddle.set_flags({"FLAGS_cudnn_deterministic": flag})
        paddle.set_flags({"FLAGS_cpu_deterministic": flag})

        # info
        print("AMP Debugging TensorCheckerConfig: seed ", self.seed)
        print(
            "AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ",
            flag,
        )
        print(
            "AMP Debugging TensorCheckerConfig: FLAGS_cpu_deterministic is ",
            flag,
        )

    def _set_seed(self, enable):
        """Adopt ``initial_seed`` as the working seed and apply it.

        ``enable`` is accepted for interface compatibility and is not read.
        """
        if self.initial_seed != self.seed:
            self.seed = self.initial_seed

        # Fall back to the default when the seed is outside [0, 2**32 - 1].
        if self.seed > 4294967295 or self.seed < 0:
            print("[Warnning: Seed must be between 0 and 2**32 - 1")
            self.seed = 123

        self.keep_random(self.seed, True)

    def _set_env(self, check_flag):
        """Set ``FLAGS_check_nan_inf`` to ``check_flag`` and, when it is
        truthy, also apply debug level, dump path and call-stack level.

        Raises:
            ValueError: If checking is requested and ``stack_height_limit``
                is not an int.
        """
        # Validate before touching any flag, so a bad config does not leave
        # NaN/Inf checking switched on behind the raised error.
        if check_flag and not isinstance(self.stack_height_limit, int):
            raise ValueError("stack_height_limit must be int")

        paddle.set_flags({"FLAGS_check_nan_inf": check_flag})
        if check_flag:
            # set debug level
            paddle.set_flags(
                {"FLAGS_check_nan_inf_level": self.debug_mode.value}
            )

            # set output_dir
            if self.dump_dir is not None:
                paddle.fluid.core.set_nan_inf_debug_path(self.dump_dir)

            # set stack_height_limit
            paddle.set_flags(
                {"FLAGS_call_stack_level": self.stack_height_limit}
            )

    def check(self):
        """Return whether checking applies to the current step.

        Without a step window the answer is simply whether the config is
        enabled. With a window ``[start_step, end_step)`` every call made
        before ``end_step`` is reached advances the class-wide
        ``Current_step_id`` by one; the counter stops at ``end_step``.
        """
        if not self.enable:
            return False
        if self.start_step is None or self.end_step is None:
            return True

        step = TensorCheckerConfig.Current_step_id
        if step >= self.end_step:
            return False
        # Count steps that precede the window as well; otherwise a window
        # with start_step > 0 could never be reached.
        TensorCheckerConfig.Current_step_id += 1
        return step >= self.start_step

    def run(self):
        """Apply the checker flags if the config is enabled."""
        if self.enable:
            self._set_env(self.enable)

    def end(self):
        """Switch ``FLAGS_check_nan_inf`` off."""
        self._set_env(False)
def _get_operator_stats_flag():
    """Return the current value of ``FLAGS_low_precision_op_list``."""
    flag_name = "FLAGS_low_precision_op_list"
    return paddle.get_flags([flag_name])[flag_name]
...@@ -188,3 +405,61 @@ def collect_operator_stats(): ...@@ -188,3 +405,61 @@ def collect_operator_stats():
enable_operator_stats_collection() enable_operator_stats_collection()
yield yield
disable_operator_stats_collection() disable_operator_stats_collection()
def enable_tensor_checker(checker_config):
    """
    Enable model-level accuracy checking according to ``checker_config``.

    It is used together with ``disable_tensor_checker()``: the output Tensors
    of the operators executed between the two calls are checked. The config
    is first asked, through ``checker_config.check()``, whether checking
    applies; if so ``checker_config.run()`` is called, otherwise
    ``checker_config.end()``.

    Attention:
        * If ``disable_tensor_checker()`` is called before ``loss.backward()``, the gradient operator is not checked;
        * If ``disable_tensor_checker()`` is called before ``optimizer.step()``, the optimizer and other weight update related operators will not be checked

    Args:
        checker_config: The checker configuration, e.g. a ``TensorCheckerConfig``.

    Examples:
        .. code-block:: python

            import paddle

            checker_config = paddle.amp.debugging.TensorCheckerConfig(
                enable=True,
                debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
            )
            paddle.amp.debugging.enable_tensor_checker(checker_config)
            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
            res = paddle.pow(x, y)
            paddle.autograd.backward(res, retain_graph=True)
            paddle.amp.debugging.disable_tensor_checker()
    """
    # check() is evaluated first; only then is the matching action looked up.
    apply_state = (
        checker_config.run if checker_config.check() else checker_config.end
    )
    apply_state()
def disable_tensor_checker():
    """
    Disable accuracy checking by setting ``FLAGS_check_nan_inf`` to 0.

    It is used together with ``enable_tensor_checker(config)``: the output
    Tensors of the operators executed between the two calls are checked.

    Attention:
        * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked;
        * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked

    Examples:
        .. code-block:: python

            import paddle

            checker_config = paddle.amp.debugging.TensorCheckerConfig(
                enable=True,
                debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
            )
            paddle.amp.debugging.enable_tensor_checker(checker_config)
            x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
            y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
            res = paddle.pow(x, y)
            paddle.autograd.backward(res, retain_graph=True)
            paddle.amp.debugging.disable_tensor_checker()
    """
    checker_off = {"FLAGS_check_nan_inf": 0}
    paddle.set_flags(checker_off)
...@@ -100,6 +100,131 @@ class TestNanInfDirCheckResult(unittest.TestCase): ...@@ -100,6 +100,131 @@ class TestNanInfDirCheckResult(unittest.TestCase):
x = paddle.to_tensor([2, 3, 4], 'float32') x = paddle.to_tensor([2, 3, 4], 'float32')
y = paddle.to_tensor([1, 5, 2], 'float32') y = paddle.to_tensor([1, 5, 2], 'float32')
z = paddle.add(x, y) z = paddle.add(x, y)
path = ""
paddle.fluid.core.set_nan_inf_debug_path(path)
def test_nan_inf_op(self):
    """Run ``paddle.pow`` on inputs chosen to yield an Inf and return the
    ``num_inf`` count parsed from the raised error (0 if nothing is raised).
    """
    import paddle

    counts = {"num_nan": 0, "num_inf": 0}
    # check op list
    base = paddle.to_tensor(
        [1, 0, 1],
        place=paddle.CPUPlace(),
        dtype='float32',
        stop_gradient=False,
    )
    exponent = paddle.to_tensor(
        [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
    )

    try:
        paddle.pow(base, exponent)
    except Exception as e:
        # Cannot catch the log in CUDA kernel.
        # Turn "(", ")" and "," into spaces so every "name=value" pair
        # becomes its own token.
        tokens = str(e).translate(str.maketrans("(),", "   ")).split(" ")
        for token in tokens:
            for key in ("num_nan", "num_inf"):
                if key in token:
                    counts[key] = int(token.split("=")[1])
                    break
        print(
            "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
                counts["num_nan"], counts["num_inf"]
            )
        )
    return counts["num_inf"]
def test_check_op_list(self):
    """Divide by zero with ``elementwise_div`` on the skipped-op list while
    the tensor checker is enabled, and report any NaN/Inf counts found in a
    raised error.
    """
    import paddle

    num_nan = 0
    num_inf = 0

    checker_config = paddle.amp.debugging.TensorCheckerConfig(
        enable=True,
        debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
        skipped_op_list=["elementwise_div"],
    )

    x = paddle.to_tensor(
        [0, 0, 0],
        place=paddle.CPUPlace(),
        dtype='float32',
        stop_gradient=False,
    )
    y = paddle.to_tensor(
        [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
    )

    paddle.amp.debugging.enable_tensor_checker(checker_config)
    try:
        paddle.divide(y, x)
    except Exception as e:
        # Cannot catch the log in CUDA kernel.
        err_str_list = (
            str(e)
            .replace("(", " ")
            .replace(")", " ")
            .replace(",", " ")
            .split(" ")
        )
        for err_str in err_str_list:
            if "num_nan" in err_str:
                num_nan = int(err_str.split("=")[1])
            elif "num_inf" in err_str:
                num_inf = int(err_str.split("=")[1])
        print(
            "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
                num_nan, num_inf
            )
        )
    finally:
        # Pair the enable call above with a disable (the original enabled
        # the checker a second time), so FLAGS_check_nan_inf does not stay
        # switched on for whatever runs after this test.
        paddle.amp.debugging.disable_tensor_checker()
def test_tensor_checker(self):
    # Exercises the debug_step window of TensorCheckerConfig: with
    # debug_step=[0, 3] the checker flag must be on for the first three
    # enable calls and off afterwards.
    # NOTE(review): nesting was reconstructed from a flattened diff; the
    # disable call is assumed to run once per loop iteration -- confirm.
    import paddle

    def _assert_flag(value):
        # Only FLAGS_check_nan_inf is compared; the level flag is fetched
        # but not asserted.
        flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
        res = paddle.get_flags(flags)
        assert res["FLAGS_check_nan_inf"] == value

    # Start from a known state: checker off, global seed 102.
    paddle.set_flags({"FLAGS_check_nan_inf": 0})
    paddle.seed(102)
    checker_config = paddle.amp.debugging.TensorCheckerConfig(
        enable=True,
        debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
        checked_op_list=["elementwise_pow"],
        skipped_op_list=["elementwise_add"],
        debug_step=[0, 3],
    )
    # check seed
    assert checker_config.initial_seed == 102
    assert checker_config.seed == 102
    # Constructing the config alone must not switch the checker on.
    _assert_flag(False)
    for index in range(5):
        paddle.amp.debugging.enable_tensor_checker(checker_config)
        if index <= 2:
            # Inside the window: flag on, step counter advanced by one.
            _assert_flag(True)
            assert (
                index + 1
                == paddle.amp.debugging.TensorCheckerConfig.Current_step_id
            )
            assert 1 == self.test_nan_inf_op()
        else:
            # Past the window: the counter stays at 3 and the flag is off,
            # so the helper reports no Inf.
            assert (
                3
                == paddle.amp.debugging.TensorCheckerConfig.Current_step_id
            )
            _assert_flag(False)
            assert 0 == self.test_nan_inf_op()
        paddle.amp.debugging.disable_tensor_checker()
        _assert_flag(False)
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册