Unverified commit ec77defc authored by niuliling123, committed by GitHub

[Cherry-pick] Add enable_tensor_checker and disable_tensor_checker to api list (#52936) (#53287)

Add enable_tensor_checker and disable_tensor_checker APIs (#52936)
Parent ba415ee7
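For context, a minimal usage sketch of the two new APIs, assembled from the docstring examples added in this commit (illustrative only, not part of the diff):

    import paddle

    checker_config = paddle.amp.debugging.TensorCheckerConfig(
        enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF
    )
    paddle.amp.debugging.enable_tensor_checker(checker_config)

    x = paddle.to_tensor([1, 0, 3], dtype='float32', stop_gradient=False)
    y = paddle.to_tensor([0.2, 0, 0.5], dtype='float32')
    res = paddle.pow(x, y)
    # The backward pass of elementwise_pow produces a NaN here, which the
    # checker reports (or aborts on, depending on debug_mode).
    paddle.autograd.backward(res, retain_graph=True)

    paddle.amp.debugging.disable_tensor_checker()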
......@@ -24,8 +24,6 @@
DECLARE_int32(check_nan_inf_level);
namespace egr {
static std::once_flag dump_list_init_flag;
static std::unordered_set<std::string>& nan_inf_check_op_list() {
static std::unordered_set<std::string> _check_op_list = {};
return _check_op_list;
......@@ -36,39 +34,32 @@ static std::unordered_set<std::string>& nan_inf_skip_op_list() {
return _skip_op_list;
}
static void InitDumpListFormEnv() {
void SetCheckOpList(const std::string& check_op_list = "") {
nan_inf_check_op_list();
nan_inf_skip_op_list();
const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list");
const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list");
if (check_op_list) {
if (check_op_list.size() != 0) {
std::stringstream ss(check_op_list);
std::string op_type;
LOG(INFO) << "Please set op's name according to the "
"paddle.amp.low_precision_op_list()";
while (std::getline(ss, op_type, ',')) {
nan_inf_check_op_list().emplace(op_type);
VLOG(4) << "Check nan inf op list: " << op_type;
}
}
}
if (skip_op_list) {
void SetSkipOpList(const std::string& skip_op_list = "") {
nan_inf_skip_op_list();
if (skip_op_list.size() != 0) {
std::stringstream ss(skip_op_list);
std::string op_type;
LOG(INFO) << "Please set op's name according to the "
"paddle.amp.low_precision_op_list()";
while (std::getline(ss, op_type, ',')) {
nan_inf_skip_op_list().emplace(op_type);
VLOG(4) << "Skip nan inf op list: " << op_type;
}
}
for (auto const& key : nan_inf_check_op_list()) {
LOG(INFO) << "Check nan inf op list: " << key;
}
for (auto const& key : nan_inf_skip_op_list()) {
LOG(INFO) << "Skip nan inf op list: " << key;
}
}
bool CheckOp(const std::string& api_name) {
......@@ -89,7 +80,6 @@ bool CheckOp(const std::string& api_name) {
}
void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
std::call_once(dump_list_init_flag, InitDumpListFormEnv);
auto op_name = phi::TransToFluidOpName(api_name);
if (tensor.initialized() && CheckOp(op_name)) {
auto& tensor_name = tensor.name();
......
......@@ -65,6 +65,10 @@ void CheckTensorHasNanOrInf(
void CheckTensorHasNanOrInf(const std::string& api_name,
const TupleOfTensorAndVector& tensors);
void SetCheckOpList(const std::string& check_op_list);
void SetSkipOpList(const std::string& skip_op_list);
void CheckTensorHasNanOrInf(
const std::string& api_name,
const paddle::small_vector<std::vector<paddle::Tensor>,
......
......@@ -30,6 +30,7 @@ namespace details {
struct DebugTools {
DebugTools() {}
std::string path = "";
int stack_limit = 1;
};
static DebugTools debug_nan_inf;
......@@ -45,6 +46,13 @@ std::string GetNanPath() {
return debug_nan_inf.path + "/";
}
void SetNanInfStackLimit(const int& stack_limit) {
debug_nan_inf.stack_limit = stack_limit;
VLOG(4) << "Set the stack limit of debug tools : " << stack_limit;
}
int GetNanInfStackLimit() { return debug_nan_inf.stack_limit; }
static std::once_flag white_list_init_flag;
static int op_role_nan_inf_white_list = 0;
......
......@@ -516,7 +516,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
check_nan_inf_level,
nan_inf_zero_tensor.data<int64_t>());
if (check_nan_inf_level == 0) {
if (check_nan_inf_level == 0 && GetNanInfStackLimit() > 0) {
auto nan_cpu =
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 3);
int64_t* nan_cpu_ptr = reinterpret_cast<int64_t*>(nan_cpu->ptr());
......
......@@ -40,6 +40,10 @@ void SetNanInfDebugPath(const std::string& nan_inf_path);
std::string GetNanPath();
void SetNanInfStackLimit(const int& stack_limit);
int GetNanInfStackLimit();
template <typename T,
typename MT,
std::enable_if_t<std::is_same<T, float>::value, bool> = true>
......
......@@ -2670,9 +2670,21 @@ All parameter, weight, gradient are variables in Paddle.
m.def("use_layout_autotune",
[] { return egr::Controller::Instance().UseLayoutAutoTune(); });
// Add the api for nan op debug
m.def("set_nan_inf_stack_limit",
&paddle::framework::details::SetNanInfStackLimit);
// Add the api for nan op debug
m.def("set_nan_inf_debug_path",
&paddle::framework::details::SetNanInfDebugPath);
// Add checked op list
m.def("set_checked_op_list",
[](const std::string &op_list) { egr::SetCheckOpList(op_list); });
// Add skipped op list
m.def("set_skipped_op_list",
[](const std::string &op_list) { egr::SetSkipOpList(op_list); });
m.def("check_numerics",
[](const std::string &op_name, const paddle::Tensor &tensor) {
VLOG(4) << "Check tensor whether has nan or inf.";
......
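As a reference (illustrative, not part of the diff), the raw core bindings registered above are what the Python helpers later in this change call into:

    import paddle

    # Comma-separated op names, parsed by egr::SetCheckOpList / SetSkipOpList.
    paddle.fluid.core.set_checked_op_list("elementwise_add,conv2d")
    paddle.fluid.core.set_skipped_op_list("elementwise_div")
    # Stack-limit and dump-path knobs from paddle::framework::details.
    paddle.fluid.core.set_nan_inf_stack_limit(1)
    paddle.fluid.core.set_nan_inf_debug_path("nan_inf_log_dir")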
......@@ -13,7 +13,6 @@
# limitations under the License.
import contextlib
import os
import random
from enum import Enum
......@@ -24,82 +23,119 @@ from paddle.fluid import core
from paddle.fluid.framework import dygraph_only
__all__ = [
"DebugMode",
"TensorCheckerConfig",
"enable_operator_stats_collection",
"disable_operator_stats_collection",
"collect_operator_stats",
"enable_tensor_checker",
"disable_tensor_checker",
]
class DebugMode(Enum):
"""
DebugMode describes the checking behavior configured through TensorCheckerConfig. Each mode has a specific meaning, explained below:
- DebugMode.CHECK_NAN_INF_AND_ABORT: This mode prints or saves information about Tensors that contain NaN/Inf and interrupts the program.
- DebugMode.CHECK_NAN_INF: This mode prints or saves critical information about Tensors that contain NaN/Inf but allows the program to continue running.
- DebugMode.CHECK_ALL_FOR_OVERFLOW: This mode checks the output of the FP32 operator and prints or saves information about key Tensors that exceed the FP16 representation range, such as overflow or underflow.
- DebugMode.CHECK_ALL: This mode prints or saves output Tensor key information for all operators.
"""
CHECK_NAN_INF_AND_ABORT = 0
CHECK_NAN_INF = 1
CHECK_ALL_FOR_OVERFLOW = 2
CHECK_ALL = 3
CHECK_ALL_AND_ABORT = 4
DUMP_ALL = 5
# CHECK_ALL_AND_ABORT = 4
# DUMP_ALL = 5
def set_checked_op_list(checked_op_list):
# check checked_op_list
if checked_op_list is not None:
if isinstance(checked_op_list, (list, tuple)):
check_op_list = ",".join(value for value in checked_op_list)
paddle.fluid.core.set_checked_op_list(check_op_list)
else:
raise ValueError("checked_op_list must be list or tuple")
def set_skipped_op_list(skipped_op_list):
# check skipped_op_list
if skipped_op_list is not None:
if isinstance(skipped_op_list, (list, tuple)):
skip_op_list = ",".join(value for value in skipped_op_list)
paddle.fluid.core.set_skipped_op_list(skip_op_list)
else:
raise ValueError("skipped_op_list must be list or tuple")
class TensorCheckerConfig:
"""
Collect the config for checking nan and inf in module or op tensor.
The purpose of this class is to collect the configuration for checking NaN and Inf values in the tensors of a module or operator. It takes the following arguments:
Args:
* enable: Whether to enable Tensor's value detection function. The default value is False, which means that these tools will never be used.
enable(bool): Indicating whether to enable the detection of NaN and Inf values in tensors. The default value is False, which means that these tools will not be used.
* debug_mode: Debug mode. There are 6 kinds of debug mode.
CHECK_NAN_INF_AND_ABORT(default): Print or save Tensor key information with NaN/Inf and interrupt the program
CHECK_NAN_INF: Print or save Tensor critical information with NaN/Inf, but continue to run
CHECK_ALL_AND_ABORT: Print or save the output Tensor key information of all operators, and interrupt the program if NaN/Inf occurs
CHECK_ALL_FOR_OVERFLOW: Check the output of the FP32 operator, print or save key Tensor information that exceeds the FP16 representation range (overflow, underflow)
CHECK_ALL: Print or save output Tensor key information for all operators
DUMP_ALL: Saves all Tensor data. This mode does not print on the terminal
debug_mode(DebugMode, optional): A parameter that determines the type of debugging to be used. Default is DebugMode.CHECK_NAN_INF_AND_ABORT.
* dump_dir: The collection data storage path. If it is None, it will be directly printed to the terminal
output_dir(string, optional): The path to store collected data. If this parameter is set to None, the data will be printed to the terminal. Default is None.
* checked_op_list: A list of operators you want to check
checked_op_list(list|tuple, optional): Specifies a list of operators that need to be checked during program execution, for example, checked_op_list=['elementwise_add', 'conv2d'], indicating that the output results of elementwise_add and conv2d should be checked for nan/inf during program execution. Default is None.
* skipped_op_list: A list of operators to skip checking
skipped_op_list(list|tuple, optional): Specifies a list of operators that do not need to be checked during program execution, for example, skipped_op_list=['elementwise_add', 'conv2d'], indicating that the output results of elementwise_add and conv2d should not be checked for nan/inf during program execution. Default is None.
* debug_step: The iteration scope of debugging
debug_step(list|tuple, optional): A list or tuple used primarily for nan/inf checking during model training. For example, debug_step=[1,5] indicates that nan/inf checking should only be performed on model training iterations 1 to 5. Default is None.
* stack_height_limit: The maximum depth of the call stack, and supports printing the call stack at the error location. The specific scheme needs to be investigated
* enable_traceback_filtering: Whether to filter the traceback. The main purpose is to filter out the internal code call stack of the framework and only display the user code call stack
stack_height_limit(int, optional): An integer value specifying the maximum depth of the call stack. This feature supports printing the call stack at the error location. Currently, only enabling or disabling call stack printing is supported. If you want to print the corresponding C++ call stack when NaN is detected in GPU Kernel, set stack_height_limit to 1, otherwise set it to 0. Default is 1.
Examples:
.. code-block:: python
import paddle
checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
paddle.amp.debugging.enable_tensor_checker(checker_config)
.. code-block:: python
import paddle
checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
paddle.amp.debugging.enable_tensor_checker(checker_config)
x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
res = paddle.pow(x, y)
x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
res = paddle.pow(x, y)
paddle.autograd.backward(res, retain_graph=True)
paddle.amp.debugging.disable_tensor_checker()
paddle.autograd.backward(res, retain_graph=True)
paddle.amp.debugging.disable_tensor_checker()
#[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
# when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
#Traceback (most recent call last):
# res = paddle.pow(x, y)
# File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
# return _C_ops.elementwise_pow(x, y)
"""
# For module debugging
Current_step_id = 0
current_step_id = 0
def __init__(
self,
enable,
debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT,
dump_dir=None,
output_dir=None,
checked_op_list=None,
skipped_op_list=None,
debug_step=None,
stack_height_limit=3,
enable_traceback_filtering=False,
stack_height_limit=1,
):
self.enable = enable
self.debug_mode = debug_mode
self.dump_dir = dump_dir
self.output_dir = output_dir
self.checked_op_list = checked_op_list
self.skipped_op_list = skipped_op_list
......@@ -107,8 +143,6 @@ class TensorCheckerConfig:
self.debug_step = debug_step
self.stack_height_limit = stack_height_limit
self.enable_traceback_filtering = enable_traceback_filtering
self.start_step = None
self.end_step = None
......@@ -146,60 +180,43 @@ class TensorCheckerConfig:
DebugMode.__members__,
)
# check checked_op_list
if self.checked_op_list is not None:
if isinstance(self.checked_op_list, (list, tuple)):
check_op_list = ",".join(
value for value in self.checked_op_list
)
os.environ["Paddle_check_nan_inf_op_list"] = str(check_op_list)
else:
raise ValueError("checked_op_list must be list or tuple")
set_checked_op_list(self.checked_op_list)
# check skipped_op_list
if self.skipped_op_list is not None:
if isinstance(self.skipped_op_list, (list, tuple)):
skipped_op_list = ",".join(
value for value in self.skipped_op_list
)
os.environ["Paddle_skip_nan_inf_op_list"] = str(skipped_op_list)
else:
raise ValueError("skipped_op_list must be list or tuple")
set_skipped_op_list(self.skipped_op_list)
if self.enable:
self._set_seed(self.enable)
def keep_random(self, seed, flag):
def _set_seed(self, flag):
if self.initial_seed != self.seed:
self.seed = self.initial_seed
if self.seed > np.iinfo(np.uint32).max or self.seed < 0:
print("[Warnning: Seed must be between 0 and 2**32 - 1")
self.seed = 123
# get random seed
self.seed = seed
paddle.seed(self.seed)
np.random.seed(self.seed)
random.seed(self.seed)
# info
print("AMP Debugging TensorCheckerConfig: seed ", self.seed)
# set cudnn and cpu
if core.is_compiled_with_cuda():
paddle.set_flags({"FLAGS_cudnn_deterministic": flag})
paddle.set_flags({"FLAGS_cpu_deterministic": flag})
print(
"AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ",
flag,
)
# info
print("AMP Debugging TensorCheckerConfig: seed ", self.seed)
print(
"AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ",
flag,
)
paddle.set_flags({"FLAGS_cpu_deterministic": flag})
print(
"AMP Debugging TensorCheckerConfig: FLAGS_cpu_deterministic is ",
flag,
)
def _set_seed(self, enable):
if self.initial_seed != self.seed:
self.seed = self.initial_seed
if self.seed > 4294967295 or self.seed < 0:
print("[Warnning: Seed must be between 0 and 2**32 - 1")
self.seed = 123
self.keep_random(self.seed, True)
def _set_env(self, check_flag):
paddle.set_flags({"FLAGS_check_nan_inf": check_flag})
if check_flag:
......@@ -209,35 +226,35 @@ class TensorCheckerConfig:
)
# set output_dir
if self.dump_dir is not None:
paddle.fluid.core.set_nan_inf_debug_path(self.dump_dir)
if self.output_dir is not None:
paddle.fluid.core.set_nan_inf_debug_path(self.output_dir)
# set stack_height_limit
if isinstance(self.stack_height_limit, (int)):
paddle.set_flags(
{"FLAGS_call_stack_level": self.stack_height_limit}
paddle.fluid.core.set_nan_inf_stack_limit(
self.stack_height_limit
)
else:
raise ValueError("stack_height_limit must be int")
def check(self):
def update_and_check_step_id(self):
if self.enable:
if self.start_step is not None and self.end_step is not None:
if (
self.start_step > TensorCheckerConfig.Current_step_id
or TensorCheckerConfig.Current_step_id >= self.end_step
self.start_step > TensorCheckerConfig.current_step_id
or TensorCheckerConfig.current_step_id >= self.end_step
):
return False
else:
TensorCheckerConfig.Current_step_id += 1
TensorCheckerConfig.current_step_id += 1
return True
return False
def run(self):
def start_check_nan_inf(self):
if self.enable:
self._set_env(self.enable)
def end(self):
def stop_check_nan_inf(self):
self._set_env(False)
......@@ -302,26 +319,26 @@ def enable_operator_stats_collection():
Examples:
.. code-block:: python
.. code-block:: python
import paddle
import paddle
conv = paddle.nn.Conv2D(3, 2, 3)
x = paddle.rand([10, 3, 32, 32])
conv = paddle.nn.Conv2D(3, 2, 3)
x = paddle.rand([10, 3, 32, 32])
paddle.amp.debugging.enable_operator_stats_collection()
# AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
with paddle.amp.auto_cast(enable=True, level='O2'):
out = conv(x)
# Print to the standard output.
paddle.amp.debugging.disable_operator_stats_collection()
# <------------------------------------------------------- op list -------------------------------------------------------->
# <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
# conv2d | 1 | 0 | 0 | 0
# elementwise_add | 1 | 0 | 0 | 0
# reshape2 | 1 | 0 | 0 | 0
# transfer_dtype | 0 | 0 | 3 | 0
# <----------------------------------------------------- op count: 4 ------------------------------------------------------>
paddle.amp.debugging.enable_operator_stats_collection()
# AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
with paddle.amp.auto_cast(enable=True, level='O2'):
out = conv(x)
# Print to the standard output.
paddle.amp.debugging.disable_operator_stats_collection()
# <------------------------------------------------------- op list -------------------------------------------------------->
# <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
# conv2d | 1 | 0 | 0 | 0
# elementwise_add | 1 | 0 | 0 | 0
# reshape2 | 1 | 0 | 0 | 0
# transfer_dtype | 0 | 0 | 3 | 0
# <----------------------------------------------------- op count: 4 ------------------------------------------------------>
"""
# Clear the previous stats.
......@@ -340,26 +357,26 @@ def disable_operator_stats_collection():
Examples:
.. code-block:: python
.. code-block:: python
import paddle
import paddle
conv = paddle.nn.Conv2D(3, 2, 3)
x = paddle.rand([10, 3, 32, 32])
conv = paddle.nn.Conv2D(3, 2, 3)
x = paddle.rand([10, 3, 32, 32])
paddle.amp.debugging.enable_operator_stats_collection()
# AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
with paddle.amp.auto_cast(enable=True, level='O2'):
out = conv(x)
# Print to the standard output.
paddle.amp.debugging.disable_operator_stats_collection()
# <------------------------------------------------------- op list -------------------------------------------------------->
# <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
# conv2d | 1 | 0 | 0 | 0
# elementwise_add | 1 | 0 | 0 | 0
# reshape2 | 1 | 0 | 0 | 0
# transfer_dtype | 0 | 0 | 3 | 0
# <----------------------------------------------------- op count: 4 ------------------------------------------------------>
paddle.amp.debugging.enable_operator_stats_collection()
# AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
with paddle.amp.auto_cast(enable=True, level='O2'):
out = conv(x)
# Print to the standard output.
paddle.amp.debugging.disable_operator_stats_collection()
# <------------------------------------------------------- op list -------------------------------------------------------->
# <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
# conv2d | 1 | 0 | 0 | 0
# elementwise_add | 1 | 0 | 0 | 0
# reshape2 | 1 | 0 | 0 | 0
# transfer_dtype | 0 | 0 | 3 | 0
# <----------------------------------------------------- op count: 4 ------------------------------------------------------>
"""
if not _get_operator_stats_flag():
......@@ -381,25 +398,25 @@ def collect_operator_stats():
Examples:
.. code-block:: python
.. code-block:: python
import paddle
import paddle
conv = paddle.nn.Conv2D(3, 2, 3)
x = paddle.rand([10, 3, 32, 32])
conv = paddle.nn.Conv2D(3, 2, 3)
x = paddle.rand([10, 3, 32, 32])
with paddle.amp.debugging.collect_operator_stats():
# AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
with paddle.amp.auto_cast(enable=True, level='O2'):
out = conv(x)
# Print to the standard output.
# <------------------------------------------------------- op list -------------------------------------------------------->
# <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
# conv2d | 1 | 0 | 0 | 0
# elementwise_add | 1 | 0 | 0 | 0
# reshape2 | 1 | 0 | 0 | 0
# transfer_dtype | 0 | 0 | 3 | 0
# <----------------------------------------------------- op count: 4 ------------------------------------------------------>
with paddle.amp.debugging.collect_operator_stats():
# AMP list including conv2d, elementwise_add, reshape2, cast (transfer_dtype)
with paddle.amp.auto_cast(enable=True, level='O2'):
out = conv(x)
# Print to the standard output.
# <------------------------------------------------------- op list -------------------------------------------------------->
# <--------------- Op Name ---------------- | -- FP16 Calls --- | -- BF16 Calls --- | --- FP32 Calls--- | -- Other Calls -->
# conv2d | 1 | 0 | 0 | 0
# elementwise_add | 1 | 0 | 0 | 0
# reshape2 | 1 | 0 | 0 | 0
# transfer_dtype | 0 | 0 | 3 | 0
# <----------------------------------------------------- op count: 4 ------------------------------------------------------>
"""
enable_operator_stats_collection()
......@@ -409,57 +426,74 @@ def collect_operator_stats():
def enable_tensor_checker(checker_config):
"""
enable_tensor_checker(checker_config) enables model-level accuracy checking; used together with disable_tensor_checker(), it checks the output Tensors of all operators within the specified range.
The enable_tensor_checker(checker_config) function enables model-level accuracy checking and is used in combination with disable_tensor_checker() to achieve model-level precision checking by checking the output Tensors of all operators within the specified range.
Attention:
* If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked;
Args:
checker_config(TensorCheckerConfig): Checker_config is to collect the configuration for checking NaN and Inf values in the tensors of a module or operator.
* If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight-update related operators are not checked
Note:
If disable_tensor_checker() is called before backward(), the gradient operator will not be checked.
If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked.
Examples:
.. code-block:: python
import paddle
checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
paddle.amp.debugging.enable_tensor_checker(checker_config)
.. code-block:: python
import paddle
checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
paddle.amp.debugging.enable_tensor_checker(checker_config)
x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
res = paddle.pow(x, y)
paddle.autograd.backward(res, retain_graph=True)
x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
res = paddle.pow(x, y)
paddle.autograd.backward(res, retain_graph=True)
paddle.amp.debugging.disable_tensor_checker()
#[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
# when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
# Traceback (most recent call last):
# File "tp.py", line 8, in <module>
# res = paddle.pow(x, y)
# File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
# return _C_ops.elementwise_pow(x, y)
paddle.amp.debugging.disable_tensor_checker()
"""
if checker_config.check():
checker_config.run()
if checker_config.update_and_check_step_id():
checker_config.start_check_nan_inf()
else:
checker_config.end()
checker_config.stop_check_nan_inf()
def disable_tensor_checker():
"""
disable_tensor_checker() disables accuracy checking; used together with enable_tensor_checker(config), it checks the output Tensors of all operators within the specified range.
disable_tensor_checker() is used to disable accuracy checking, and is used together with enable_tensor_checker(config) to achieve model-level precision checking by checking the output Tensors of all operators within the specified range.
Attention:
Note:
If disable_tensor_checker() is called before backward(), the gradient operator will not be checked;
If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked.
* If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked;
Examples:
* If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked
.. code-block:: python
Examples:
.. code-block:: python
import paddle
import paddle
checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
paddle.amp.debugging.enable_tensor_checker(checker_config)
checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
paddle.amp.debugging.enable_tensor_checker(checker_config)
x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
res = paddle.pow(x, y)
paddle.autograd.backward(res, retain_graph=True)
x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
res = paddle.pow(x, y)
paddle.autograd.backward(res, retain_graph=True)
paddle.amp.debugging.disable_tensor_checker()
#[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
paddle.amp.debugging.disable_tensor_checker()
# when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
# Traceback (most recent call last):
# res = paddle.pow(x, y)
# File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
# return _C_ops.elementwise_pow(x, y)
"""
paddle.set_flags({"FLAGS_check_nan_inf": 0})
......@@ -78,7 +78,14 @@ class TestNanInfDirCheckResult(unittest.TestCase):
def test_num_nan_inf(self):
path = "nan_inf_log_dir"
paddle.fluid.core.set_nan_inf_debug_path(path)
checker_config = paddle.amp.debugging.TensorCheckerConfig(
enable=True,
debug_mode=paddle.amp.debugging.DebugMode.CHECK_ALL,
output_dir=path,
)
paddle.amp.debugging.enable_tensor_checker(checker_config)
def _check_num_nan_inf(use_cuda):
shape = [32, 32]
......@@ -86,145 +93,25 @@ class TestNanInfDirCheckResult(unittest.TestCase):
num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np)
add_assert = (num_nan_np + num_inf_np) > 0
num_nan, num_inf = self.get_num_nan_inf(
x_np, use_cuda, add_assert, path
x_np,
use_cuda,
add_assert,
path,
)
if not use_cuda:
assert num_nan == num_nan_np and num_inf == num_inf_np
paddle.set_flags(
{"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3}
)
_check_num_nan_inf(use_cuda=False)
if paddle.fluid.core.is_compiled_with_cuda():
_check_num_nan_inf(use_cuda=True)
else:
_check_num_nan_inf(use_cuda=False)
x = paddle.to_tensor([2, 3, 4], 'float32')
y = paddle.to_tensor([1, 5, 2], 'float32')
z = paddle.add(x, y)
path = ""
paddle.fluid.core.set_nan_inf_debug_path(path)
def test_nan_inf_op(self):
import paddle
num_nan = 0
num_inf = 0
# check op list
x = paddle.to_tensor(
[1, 0, 1],
place=paddle.CPUPlace(),
dtype='float32',
stop_gradient=False,
)
y = paddle.to_tensor(
[0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
)
try:
res = paddle.pow(x, y)
except Exception as e:
# Cannot catch the log in CUDA kernel.
err_str_list = (
str(e)
.replace("(", " ")
.replace(")", " ")
.replace(",", " ")
.split(" ")
)
for err_str in err_str_list:
if "num_nan" in err_str:
num_nan = int(err_str.split("=")[1])
elif "num_inf" in err_str:
num_inf = int(err_str.split("=")[1])
print(
"[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
num_nan, num_inf
)
)
return num_inf
def test_check_op_list(self):
import paddle
num_nan = 0
num_inf = 0
checker_config = paddle.amp.debugging.TensorCheckerConfig(
enable=True,
debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
skipped_op_list=["elementwise_div"],
)
x = paddle.to_tensor(
[0, 0, 0],
place=paddle.CPUPlace(),
dtype='float32',
stop_gradient=False,
)
y = paddle.to_tensor(
[0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
)
paddle.amp.debugging.enable_tensor_checker(checker_config)
try:
res = paddle.divide(y, x)
except Exception as e:
# Cannot catch the log in CUDA kernel.
err_str_list = (
str(e)
.replace("(", " ")
.replace(")", " ")
.replace(",", " ")
.split(" ")
)
for err_str in err_str_list:
if "num_nan" in err_str:
num_nan = int(err_str.split("=")[1])
elif "num_inf" in err_str:
num_inf = int(err_str.split("=")[1])
print(
"[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
num_nan, num_inf
)
)
paddle.amp.debugging.enable_tensor_checker(checker_config)
def test_tensor_checker(self):
import paddle
def _assert_flag(value):
flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
res = paddle.get_flags(flags)
assert res["FLAGS_check_nan_inf"] == value
paddle.set_flags({"FLAGS_check_nan_inf": 0})
paddle.seed(102)
checker_config = paddle.amp.debugging.TensorCheckerConfig(
enable=True,
debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
checked_op_list=["elementwise_pow"],
skipped_op_list=["elementwise_add"],
debug_step=[0, 3],
)
# check seed
assert checker_config.initial_seed == 102
assert checker_config.seed == 102
_assert_flag(False)
for index in range(5):
paddle.amp.debugging.enable_tensor_checker(checker_config)
if index <= 2:
_assert_flag(True)
assert (
index + 1
== paddle.amp.debugging.TensorCheckerConfig.Current_step_id
)
assert 1 == self.test_nan_inf_op()
else:
assert (
3
== paddle.amp.debugging.TensorCheckerConfig.Current_step_id
)
_assert_flag(False)
assert 0 == self.test_nan_inf_op()
paddle.amp.debugging.disable_tensor_checker()
_assert_flag(False)
paddle.amp.debugging.disable_tensor_checker()
if __name__ == '__main__':
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
class TestTensorChecker(unittest.TestCase):
def get_num_inf(self, e):
num_nan = 0
num_inf = 0
# Cannot catch the log in CUDA kernel.
err_str_list = (
str(e)
.replace("(", " ")
.replace(")", " ")
.replace(",", " ")
.split(" ")
)
for err_str in err_str_list:
if "num_nan" in err_str:
num_nan = int(err_str.split("=")[1])
elif "num_inf" in err_str:
num_inf = int(err_str.split("=")[1])
print(
"[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
num_nan, num_inf
)
)
return num_nan
def generate_num_inf(self, place):
num_inf = 0
num_nan = 0
paddle.set_device(place)
# check op list
x = paddle.to_tensor(
[1, 0, 0],
dtype='float32',
stop_gradient=False,
)
y = paddle.to_tensor([0, 0, 1], dtype='float32')
try:
res = paddle.pow(x, y)
# test backward
paddle.autograd.backward([res])
res = paddle.divide(y, x)
except Exception as e:
num_inf = self.get_num_inf(e)
return num_inf
def test_tensor_checker(self):
def _assert_flag(value):
flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
res = paddle.get_flags(flags)
assert res["FLAGS_check_nan_inf"] == value
paddle.set_flags({"FLAGS_check_nan_inf": 0})
paddle.seed(102)
checker_config = paddle.amp.debugging.TensorCheckerConfig(
enable=True,
debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
checked_op_list=["elementwise_pow_grad"],
skipped_op_list=["elementwise_div"],
debug_step=[0, 3],
)
places = ['cpu']
if paddle.is_compiled_with_cuda():
places.append('gpu')
# check seed
self.assertEqual(checker_config.initial_seed, 102)
self.assertEqual(checker_config.seed, 102)
_assert_flag(False)
for place in places:
paddle.amp.debugging.TensorCheckerConfig.current_step_id = 0
for index in range(5):
paddle.amp.debugging.enable_tensor_checker(checker_config)
if index <= 2:
_assert_flag(True)
self.assertEqual(
index + 1,
paddle.amp.debugging.TensorCheckerConfig.current_step_id,
)
self.assertEqual(1, self.generate_num_inf(place))
else:
self.assertEqual(
3,
paddle.amp.debugging.TensorCheckerConfig.current_step_id,
)
_assert_flag(False)
self.assertEqual(0, self.generate_num_inf(place))
paddle.amp.debugging.disable_tensor_checker()
_assert_flag(False)
if __name__ == '__main__':
unittest.main()