Unverified · commit ec77defc · authored by niuliling123 · committed by GitHub

[Cherry-pick] Add enable_tensor_checker and disable_tensor_checker to api list (#52936) (#53287)

Add enable_tensor_checker and disable_tensor_checker APIs (#52936)
Parent: ba415ee7
@@ -24,8 +24,6 @@
 DECLARE_int32(check_nan_inf_level);
 namespace egr {
 
-static std::once_flag dump_list_init_flag;
-
 static std::unordered_set<std::string>& nan_inf_check_op_list() {
   static std::unordered_set<std::string> _check_op_list = {};
   return _check_op_list;
@@ -36,39 +34,32 @@ static std::unordered_set<std::string>& nan_inf_skip_op_list() {
   return _skip_op_list;
 }
 
-static void InitDumpListFormEnv() {
+void SetCheckOpList(const std::string& check_op_list = "") {
   nan_inf_check_op_list();
-  nan_inf_skip_op_list();
-  const char* check_op_list = std::getenv("Paddle_check_nan_inf_op_list");
-  const char* skip_op_list = std::getenv("Paddle_skip_nan_inf_op_list");
-  if (check_op_list) {
+  if (check_op_list.size() != 0) {
     std::stringstream ss(check_op_list);
     std::string op_type;
     LOG(INFO) << "Please set op's name according to the "
                  "paddle.amp.low_precision_op_list()";
     while (std::getline(ss, op_type, ',')) {
       nan_inf_check_op_list().emplace(op_type);
+      VLOG(4) << "Check nan inf op list: " << op_type;
     }
   }
+}
 
-  if (skip_op_list) {
+void SetSkipOpList(const std::string& skip_op_list = "") {
+  nan_inf_skip_op_list();
+  if (skip_op_list.size() != 0) {
     std::stringstream ss(skip_op_list);
     std::string op_type;
     LOG(INFO) << "Please set op's name according to the "
                  "paddle.amp.low_precision_op_list()";
     while (std::getline(ss, op_type, ',')) {
       nan_inf_skip_op_list().emplace(op_type);
+      VLOG(4) << "Skip nan inf op list: " << op_type;
     }
   }
-
-  for (auto const& key : nan_inf_check_op_list()) {
-    LOG(INFO) << "Check nan inf op list: " << key;
-  }
-  for (auto const& key : nan_inf_skip_op_list()) {
-    LOG(INFO) << "Skip nan inf op list: " << key;
-  }
 }
 
 bool CheckOp(const std::string& api_name) {
@@ -89,7 +80,6 @@ bool CheckOp(const std::string& api_name) {
 }
 
 void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
-  std::call_once(dump_list_init_flag, InitDumpListFormEnv);
   auto op_name = phi::TransToFluidOpName(api_name);
   if (tensor.initialized() && CheckOp(op_name)) {
     auto& tensor_name = tensor.name();
......
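The two setters above replace the removed InitDumpListFormEnv: instead of reading the Paddle_check_nan_inf_op_list / Paddle_skip_nan_inf_op_list environment variables on first use, callers now push the op lists in explicitly as comma-separated strings. A minimal Python sketch of the same parsing, for illustration only (not Paddle code):

def parse_op_list(op_list=""):
    # "" yields an empty set, mirroring the size() != 0 guard above
    return set(op_list.split(",")) if op_list else set()

assert parse_op_list("elementwise_add,conv2d") == {"elementwise_add", "conv2d"}
assert parse_op_list() == set()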
@@ -65,6 +65,10 @@ void CheckTensorHasNanOrInf(
 void CheckTensorHasNanOrInf(const std::string& api_name,
                             const TupleOfTensorAndVector& tensors);
 
+void SetCheckOpList(const std::string& check_op_list);
+
+void SetSkipOpList(const std::string& skip_op_list);
+
 void CheckTensorHasNanOrInf(
     const std::string& api_name,
     const paddle::small_vector<std::vector<paddle::Tensor>,
......
@@ -30,6 +30,7 @@ namespace details {
 struct DebugTools {
   DebugTools() {}
   std::string path = "";
+  int stack_limit = 1;
 };
 static DebugTools debug_nan_inf;
@@ -45,6 +46,13 @@ std::string GetNanPath() {
   return debug_nan_inf.path + "/";
 }
 
+void SetNanInfStackLimit(const int& stack_limit) {
+  debug_nan_inf.stack_limit = stack_limit;
+  VLOG(4) << "Set the stack limit of debug tools : " << stack_limit;
+}
+
+int GetNanInfStackLimit() { return debug_nan_inf.stack_limit; }
+
 static std::once_flag white_list_init_flag;
 static int op_role_nan_inf_white_list = 0;
......
@@ -516,7 +516,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
       check_nan_inf_level,
       nan_inf_zero_tensor.data<int64_t>());
 
-  if (check_nan_inf_level == 0) {
+  if (check_nan_inf_level == 0 && GetNanInfStackLimit() > 0) {
     auto nan_cpu =
         phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 3);
     int64_t* nan_cpu_ptr = reinterpret_cast<int64_t*>(nan_cpu->ptr());
......
@@ -40,6 +40,10 @@ void SetNanInfDebugPath(const std::string& nan_inf_path);
 std::string GetNanPath();
 
+void SetNanInfStackLimit(const int& stack_limit);
+
+int GetNanInfStackLimit();
+
 template <typename T,
           typename MT,
           std::enable_if_t<std::is_same<T, float>::value, bool> = true>
......
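Together, these hunks thread a configurable stack limit through the nan/inf debug tools: DebugTools gains a stack_limit field defaulting to 1, SetNanInfStackLimit/GetNanInfStackLimit expose it, and the GPU checker's abort path is now additionally gated on GetNanInfStackLimit() > 0. A minimal sketch of that gate, assuming the defaults shown above (illustrative, not Paddle code):

def should_abort_with_stack(check_nan_inf_level, stack_limit=1):
    # The guarded branch only runs at level 0; a stack_limit of 0 now skips
    # copying the nan/inf counters back to the host and skips the abort.
    return check_nan_inf_level == 0 and stack_limit > 0

assert should_abort_with_stack(0)
assert not should_abort_with_stack(0, stack_limit=0)
assert not should_abort_with_stack(3)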
@@ -2670,9 +2670,21 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("use_layout_autotune",
         [] { return egr::Controller::Instance().UseLayoutAutoTune(); });
   // Add the api for nan op debug
+  m.def("set_nan_inf_stack_limit",
+        &paddle::framework::details::SetNanInfStackLimit);
+  // Add the api for nan op debug
   m.def("set_nan_inf_debug_path",
         &paddle::framework::details::SetNanInfDebugPath);
+  // Add the checked op list
+  m.def("set_checked_op_list",
+        [](const std::string &op_list) { egr::SetCheckOpList(op_list); });
+  // Add the skipped op list
+  m.def("set_skipped_op_list",
+        [](const std::string &op_list) { egr::SetSkipOpList(op_list); });
   m.def("check_numerics",
         [](const std::string &op_name, const paddle::Tensor &tensor) {
           VLOG(4) << "Check tensor whether has nan or inf.";
......
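The m.def bindings above surface the new C++ entry points on paddle.fluid.core. They are normally driven by TensorCheckerConfig (see the Python changes below), but a direct-use sketch looks like this:

import paddle

paddle.fluid.core.set_nan_inf_debug_path("nan_inf_log_dir")      # dump findings to files
paddle.fluid.core.set_nan_inf_stack_limit(1)                     # print the C++ stack on abort
paddle.fluid.core.set_checked_op_list("elementwise_add,conv2d")  # comma-separated, as SetCheckOpList expects
paddle.fluid.core.set_skipped_op_list("elementwise_div")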
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import contextlib
-import os
 import random
 from enum import Enum
@@ -24,82 +23,119 @@ from paddle.fluid import core
 from paddle.fluid.framework import dygraph_only
 
 __all__ = [
+    "DebugMode",
+    "TensorCheckerConfig",
     "enable_operator_stats_collection",
     "disable_operator_stats_collection",
     "collect_operator_stats",
+    "enable_tensor_checker",
+    "disable_tensor_checker",
 ]
 
 
 class DebugMode(Enum):
+    """
+    The DebugMode is a feature that helps to present the state of the TensorCheckerConfig. Each DebugMode has a specific meaning, which is explained below:
+
+    - DebugMode.CHECK_NAN_INF_AND_ABORT: This mode prints or saves information about Tensors that contain NaN/Inf and interrupts the program.
+
+    - DebugMode.CHECK_NAN_INF: This mode prints or saves critical information about Tensors that contain NaN/Inf but allows the program to continue running.
+
+    - DebugMode.CHECK_ALL_FOR_OVERFLOW: This mode checks the output of the FP32 operator and prints or saves information about key Tensors that exceed the FP16 representation range, such as overflow or underflow.
+
+    - DebugMode.CHECK_ALL: This mode prints or saves output Tensor key information for all operators.
+    """
+
     CHECK_NAN_INF_AND_ABORT = 0
     CHECK_NAN_INF = 1
     CHECK_ALL_FOR_OVERFLOW = 2
     CHECK_ALL = 3
-    CHECK_ALL_AND_ABORT = 4
-    DUMP_ALL = 5
+    # CHECK_ALL_AND_ABORT = 4
+    # DUMP_ALL = 5
+
+
+def set_checked_op_list(checked_op_list):
+    # check checked_op_list
+    if checked_op_list is not None:
+        if isinstance(checked_op_list, (list, tuple)):
+            check_op_list = ",".join(value for value in checked_op_list)
+            paddle.fluid.core.set_checked_op_list(check_op_list)
+        else:
+            raise ValueError("checked_op_list must be list or tuple")
+
+
+def set_skipped_op_list(skipped_op_list):
+    # check skipped_op_list
+    if skipped_op_list is not None:
+        if isinstance(skipped_op_list, (list, tuple)):
+            skip_op_list = ",".join(value for value in skipped_op_list)
+            paddle.fluid.core.set_skipped_op_list(skip_op_list)
+        else:
+            raise ValueError("skipped_op_list must be list or tuple")
+
 class TensorCheckerConfig:
     """
-    Collect the config for checking nan and inf in module or op tensor.
+    The purpose of this class is to collect the configuration for checking NaN and Inf values in the tensors of a module or operator. It takes the following arguments:
 
     Args:
-        * enable: Whether to enable Tensor's value detection function. The default value is False, which means that these tools will never be used.
-        * debug_mode: Debug mode, there are 6 kinds of debug mode.
-            CHECK_NAN_INF_AND_ABORT(default): Print or save Tensor key information with NaN/Inf and interrupt the program
-            CHECK_NAN_INF: Print or save Tensor critical information with NaN/Inf, but continue to run
-            CHECK_ALL_AND_ABORT: Print or save the output Tensor key information of all operators, and interrupt the program if NaN/Inf occurs
-            CHECK_ALL_FOR_OVERFLOW: Check the output of the FP32 operator, print or save key Tensor information that exceeds the FP16 representation range (overflow, underflow)
-            CHECK_ALL: Print or save output Tensor key information for all operators
-            DUMP_ALL: Saves all Tensor data. This mode does not print on the terminal
-        * dump_dir: The collection data storage path. If it is None, it will be directly printed to the terminal
-        * checked_op_list: A list of operators you want to check
-        * skipped_op_list: A list of operators to skip checking
-        * debug_step: The iteration scope of debugging
-        * stack_height_limit: The maximum depth of the call stack, and supports printing the call stack at the error location. The specific scheme needs to be investigated
-        * enable_traceback_filtering: Whether to filter the traceback. The main purpose is to filter out the internal code call stack of the framework and only display the user code call stack
+        enable(bool): Indicating whether to enable the detection of NaN and Inf values in tensors. The default value is False, which means that these tools will not be used.
+        debug_mode(DebugMode, optional): A parameter that determines the type of debugging to be used. Default is DebugMode.CHECK_NAN_INF_AND_ABORT.
+        output_dir(string, optional): The path to store collected data. If this parameter is set to None, the data will be printed to the terminal. Default is None.
+        checked_op_list(list|tuple, optional): Specifies a list of operators that need to be checked during program execution, for example, checked_op_list=['elementwise_add', 'conv2d'], indicating that the output results of elementwise_add and conv2d should be checked for nan/inf during program execution. Default is None.
+        skipped_op_list(list|tuple, optional): Specifies a list of operators that do not need to be checked during program execution, for example, skipped_op_list=['elementwise_add', 'conv2d'], indicating that the output results of elementwise_add and conv2d should not be checked for nan/inf during program execution. Default is None.
+        debug_step(list|tuple, optional): A list or tuple used primarily for nan/inf checking during model training. For example, debug_step=[1,5] indicates that nan/inf checking should only be performed on model training iterations 1 to 5. Default is None.
+        stack_height_limit(int, optional): An integer value specifying the maximum depth of the call stack. This feature supports printing the call stack at the error location. Currently, only enabling or disabling call stack printing is supported. If you want to print the corresponding C++ call stack when NaN is detected in the GPU kernel, set stack_height_limit to 1, otherwise set it to 0. Default is 1.
 
     Examples:
 
         .. code-block:: python
 
             import paddle
 
-            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
+            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
             paddle.amp.debugging.enable_tensor_checker(checker_config)
 
             x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
             y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
             res = paddle.pow(x, y)
             paddle.autograd.backward(res, retain_graph=True)
             paddle.amp.debugging.disable_tensor_checker()
+            # [PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
+
+            # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
+            # Traceback (most recent call last):
+            #     res = paddle.pow(x, y)
+            #   File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
+            #     return _C_ops.elementwise_pow(x, y)
     """
     # For module debugging
-    Current_step_id = 0
+    current_step_id = 0
 
     def __init__(
         self,
         enable,
         debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT,
-        dump_dir=None,
+        output_dir=None,
         checked_op_list=None,
         skipped_op_list=None,
         debug_step=None,
-        stack_height_limit=3,
-        enable_traceback_filtering=False,
+        stack_height_limit=1,
     ):
         self.enable = enable
         self.debug_mode = debug_mode
-        self.dump_dir = dump_dir
+        self.output_dir = output_dir
         self.checked_op_list = checked_op_list
         self.skipped_op_list = skipped_op_list
@@ -107,8 +143,6 @@ class TensorCheckerConfig:
         self.debug_step = debug_step
         self.stack_height_limit = stack_height_limit
-        self.enable_traceback_filtering = enable_traceback_filtering
 
         self.start_step = None
         self.end_step = None
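As a reminder of the contract enforced by the module-level helpers introduced earlier in this file, set_checked_op_list / set_skipped_op_list accept a list or tuple, join it into the comma-separated form the C++ side expects, treat None as a no-op, and reject anything else. A short sketch:

set_checked_op_list(["elementwise_pow", "elementwise_add"])  # forwarded as "elementwise_pow,elementwise_add"
set_skipped_op_list(("elementwise_div",))                    # a tuple works too
set_checked_op_list(None)                                    # deliberately does nothing

try:
    set_checked_op_list("elementwise_pow")  # a bare string is rejected
except ValueError as err:
    print(err)  # checked_op_list must be list or tuple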
@@ -146,60 +180,43 @@ class TensorCheckerConfig:
                 DebugMode.__members__,
             )
 
-        # check checked_op_list
-        if self.checked_op_list is not None:
-            if isinstance(self.checked_op_list, (list, tuple)):
-                check_op_list = ",".join(
-                    value for value in self.checked_op_list
-                )
-                os.environ["Paddle_check_nan_inf_op_list"] = str(check_op_list)
-            else:
-                raise ValueError("checked_op_list must be list or tuple")
+        set_checked_op_list(self.checked_op_list)
 
-        # check skipped_op_list
-        if self.skipped_op_list is not None:
-            if isinstance(self.skipped_op_list, (list, tuple)):
-                skipped_op_list = ",".join(
-                    value for value in self.skipped_op_list
-                )
-                os.environ["Paddle_skip_nan_inf_op_list"] = str(skipped_op_list)
-            else:
-                raise ValueError("skipped_op_list must be list or tuple")
+        set_skipped_op_list(self.skipped_op_list)
 
         if self.enable:
             self._set_seed(self.enable)
 
-    def keep_random(self, seed, flag):
+    def _set_seed(self, flag):
+        if self.initial_seed != self.seed:
+            self.seed = self.initial_seed
+
+        if self.seed > np.iinfo(np.uint32).max or self.seed < 0:
+            print("Warning: seed must be between 0 and 2**32 - 1")
+            self.seed = 123
+
         # get random seed
-        self.seed = seed
         paddle.seed(self.seed)
         np.random.seed(self.seed)
         random.seed(self.seed)
 
+        # info
+        print("AMP Debugging TensorCheckerConfig: seed ", self.seed)
+
         # set cudnn and cpu
         if core.is_compiled_with_cuda():
             paddle.set_flags({"FLAGS_cudnn_deterministic": flag})
-            paddle.set_flags({"FLAGS_cpu_deterministic": flag})
-            # info
-            print("AMP Debugging TensorCheckerConfig: seed ", self.seed)
             print(
                 "AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ",
                 flag,
             )
+
+        paddle.set_flags({"FLAGS_cpu_deterministic": flag})
         print(
             "AMP Debugging TensorCheckerConfig: FLAGS_cpu_deterministic is ",
             flag,
         )
 
-    def _set_seed(self, enable):
-        if self.initial_seed != self.seed:
-            self.seed = self.initial_seed
-        if self.seed > 4294967295 or self.seed < 0:
-            print("[Warnning: Seed must be between 0 and 2**32 - 1")
-            self.seed = 123
-        self.keep_random(self.seed, True)
-
     def _set_env(self, check_flag):
         paddle.set_flags({"FLAGS_check_nan_inf": check_flag})
         if check_flag:
@@ -209,35 +226,35 @@ class TensorCheckerConfig:
             )
 
         # set output_dir
-        if self.dump_dir is not None:
-            paddle.fluid.core.set_nan_inf_debug_path(self.dump_dir)
+        if self.output_dir is not None:
+            paddle.fluid.core.set_nan_inf_debug_path(self.output_dir)
 
         # set stack_height_limit
         if isinstance(self.stack_height_limit, (int)):
-            paddle.set_flags(
-                {"FLAGS_call_stack_level": self.stack_height_limit}
-            )
+            paddle.fluid.core.set_nan_inf_stack_limit(
+                self.stack_height_limit
+            )
         else:
             raise ValueError("stack_height_limit must be int")
 
-    def check(self):
+    def update_and_check_step_id(self):
         if self.enable:
             if self.start_step is not None and self.end_step is not None:
                 if (
-                    self.start_step > TensorCheckerConfig.Current_step_id
-                    or TensorCheckerConfig.Current_step_id >= self.end_step
+                    self.start_step > TensorCheckerConfig.current_step_id
+                    or TensorCheckerConfig.current_step_id >= self.end_step
                 ):
                     return False
                 else:
-                    TensorCheckerConfig.Current_step_id += 1
+                    TensorCheckerConfig.current_step_id += 1
             return True
         return False
 
-    def run(self):
+    def start_check_nan_inf(self):
         if self.enable:
             self._set_env(self.enable)
 
-    def end(self):
+    def stop_check_nan_inf(self):
         self._set_env(False)
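update_and_check_step_id() is the gate for debug_step: checking stays active only while start_step <= current_step_id < end_step, and each call that passes the gate advances the class-level counter. A sketch of the resulting behavior, mirroring the unit test at the bottom of this page:

import paddle

config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_step=[0, 3])
for index in range(5):
    paddle.amp.debugging.enable_tensor_checker(config)
    # index 0..2: FLAGS_check_nan_inf is on and current_step_id advances to index + 1
    # index 3..4: current_step_id stays at 3 and checking is switched off
    paddle.amp.debugging.disable_tensor_checker()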
@@ -409,57 +426,74 @@ def collect_operator_stats():
 def enable_tensor_checker(checker_config):
     """
-    enable_tensor_checker(checker_config) enables model level accuracy checking, which is used together with disable_tensor_checker() to achieve model level precision checking through the combination of these two APIs, checking the output Tensors of all operators within the specified range.
+    The enable_tensor_checker(checker_config) function enables model-level accuracy checking and is used in combination with disable_tensor_checker() to achieve model-level precision checking by checking the output Tensors of all operators within the specified range.
 
-    Attention:
-        * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked;
-        * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked
+    Args:
+        checker_config(TensorCheckerConfig): Checker_config is to collect the configuration for checking NaN and Inf values in the tensors of a module or operator.
+
+    Note:
+        If disable_tensor_checker() is called before backward(), the gradient operator will not be checked.
+        If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked.
 
     Examples:
 
         .. code-block:: python
 
             import paddle
 
-            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
+            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
             paddle.amp.debugging.enable_tensor_checker(checker_config)
 
             x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
             y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
             res = paddle.pow(x, y)
             paddle.autograd.backward(res, retain_graph=True)
             paddle.amp.debugging.disable_tensor_checker()
+            # [PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
+
+            # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
+            # Traceback (most recent call last):
+            #   File "tp.py", line 8, in <module>
+            #     res = paddle.pow(x, y)
+            #   File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
+            #     return _C_ops.elementwise_pow(x, y)
     """
-    if checker_config.check():
-        checker_config.run()
+    if checker_config.update_and_check_step_id():
+        checker_config.start_check_nan_inf()
     else:
-        checker_config.end()
+        checker_config.stop_check_nan_inf()
 def disable_tensor_checker():
     """
-    disable_tensor_checker() disables the accuracy checking, which is used together with enable_tensor_checker(config) to achieve model level precision checking through the combination of these two APIs, checking the output Tensors of all operators within the specified range.
+    disable_tensor_checker() is used to disable accuracy checking, and is used together with enable_tensor_checker(config) to achieve model-level precision checking by checking the output Tensors of all operators within the specified range.
 
-    Attention:
-        * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked;
-        * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked
+    Note:
+        If disable_tensor_checker() is called before backward(), the gradient operator will not be checked;
+        If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked.
 
     Examples:
 
         .. code-block:: python
 
             import paddle
 
-            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT)
+            checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF)
             paddle.amp.debugging.enable_tensor_checker(checker_config)
 
             x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False)
             y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32')
             res = paddle.pow(x, y)
             paddle.autograd.backward(res, retain_graph=True)
             paddle.amp.debugging.disable_tensor_checker()
+            # [PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], numel=3, num_nan=1, num_inf=0, num_zero=0, max=2.886751e-01, min=2.000000e-01, mean=-nan
+
+            # when DebugMode.CHECK_NAN_INF_AND_ABORT and stack_height_limit = 1
+            # Traceback (most recent call last):
+            #     res = paddle.pow(x, y)
+            #   File "/usr/local/lib/python3.8/dist-packages/paddle/tensor/math.py", line 447, in pow
+            #     return _C_ops.elementwise_pow(x, y)
     """
     paddle.set_flags({"FLAGS_check_nan_inf": 0})
@@ -78,7 +78,14 @@ class TestNanInfDirCheckResult(unittest.TestCase):
 
     def test_num_nan_inf(self):
         path = "nan_inf_log_dir"
-        paddle.fluid.core.set_nan_inf_debug_path(path)
+        checker_config = paddle.amp.debugging.TensorCheckerConfig(
+            enable=True,
+            debug_mode=paddle.amp.debugging.DebugMode.CHECK_ALL,
+            output_dir=path,
+        )
+        paddle.amp.debugging.enable_tensor_checker(checker_config)
 
         def _check_num_nan_inf(use_cuda):
             shape = [32, 32]
@@ -86,145 +93,25 @@ class TestNanInfDirCheckResult(unittest.TestCase):
             num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np)
             add_assert = (num_nan_np + num_inf_np) > 0
             num_nan, num_inf = self.get_num_nan_inf(
-                x_np, use_cuda, add_assert, path
+                x_np,
+                use_cuda,
+                add_assert,
+                path,
             )
             if not use_cuda:
                 assert num_nan == num_nan_np and num_inf == num_inf_np
 
-        paddle.set_flags(
-            {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3}
-        )
-        _check_num_nan_inf(use_cuda=False)
         if paddle.fluid.core.is_compiled_with_cuda():
             _check_num_nan_inf(use_cuda=True)
+        else:
+            _check_num_nan_inf(use_cuda=False)
 
         x = paddle.to_tensor([2, 3, 4], 'float32')
        y = paddle.to_tensor([1, 5, 2], 'float32')
         z = paddle.add(x, y)
         path = ""
         paddle.fluid.core.set_nan_inf_debug_path(path)
-    def test_nan_inf_op(self):
-        import paddle
-
-        num_nan = 0
-        num_inf = 0
-        # check op list
-        x = paddle.to_tensor(
-            [1, 0, 1],
-            place=paddle.CPUPlace(),
-            dtype='float32',
-            stop_gradient=False,
-        )
-        y = paddle.to_tensor(
-            [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
-        )
-        try:
-            res = paddle.pow(x, y)
-        except Exception as e:
-            # Cannot catch the log in CUDA kernel.
-            err_str_list = (
-                str(e)
-                .replace("(", " ")
-                .replace(")", " ")
-                .replace(",", " ")
-                .split(" ")
-            )
-            for err_str in err_str_list:
-                if "num_nan" in err_str:
-                    num_nan = int(err_str.split("=")[1])
-                elif "num_inf" in err_str:
-                    num_inf = int(err_str.split("=")[1])
-            print(
-                "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
-                    num_nan, num_inf
-                )
-            )
-        return num_inf
-
-    def test_check_op_list(self):
-        import paddle
-
-        num_nan = 0
-        num_inf = 0
-
-        checker_config = paddle.amp.debugging.TensorCheckerConfig(
-            enable=True,
-            debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
-            skipped_op_list=["elementwise_div"],
-        )
-        x = paddle.to_tensor(
-            [0, 0, 0],
-            place=paddle.CPUPlace(),
-            dtype='float32',
-            stop_gradient=False,
-        )
-        y = paddle.to_tensor(
-            [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32'
-        )
-        paddle.amp.debugging.enable_tensor_checker(checker_config)
-        try:
-            res = paddle.divide(y, x)
-        except Exception as e:
-            # Cannot catch the log in CUDA kernel.
-            err_str_list = (
-                str(e)
-                .replace("(", " ")
-                .replace(")", " ")
-                .replace(",", " ")
-                .split(" ")
-            )
-            for err_str in err_str_list:
-                if "num_nan" in err_str:
-                    num_nan = int(err_str.split("=")[1])
-                elif "num_inf" in err_str:
-                    num_inf = int(err_str.split("=")[1])
-            print(
-                "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
-                    num_nan, num_inf
-                )
-            )
-        paddle.amp.debugging.enable_tensor_checker(checker_config)
-
-    def test_tensor_checker(self):
-        import paddle
-
-        def _assert_flag(value):
-            flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
-            res = paddle.get_flags(flags)
-            assert res["FLAGS_check_nan_inf"] == value
-
-        paddle.set_flags({"FLAGS_check_nan_inf": 0})
-        paddle.seed(102)
-        checker_config = paddle.amp.debugging.TensorCheckerConfig(
-            enable=True,
-            debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
-            checked_op_list=["elementwise_pow"],
-            skipped_op_list=["elementwise_add"],
-            debug_step=[0, 3],
-        )
-        # check seed
-        assert checker_config.initial_seed == 102
-        assert checker_config.seed == 102
-        _assert_flag(False)
-        for index in range(5):
-            paddle.amp.debugging.enable_tensor_checker(checker_config)
-            if index <= 2:
-                _assert_flag(True)
-                assert (
-                    index + 1
-                    == paddle.amp.debugging.TensorCheckerConfig.Current_step_id
-                )
-                assert 1 == self.test_nan_inf_op()
-            else:
-                assert (
-                    3
-                    == paddle.amp.debugging.TensorCheckerConfig.Current_step_id
-                )
-                _assert_flag(False)
-                assert 0 == self.test_nan_inf_op()
         paddle.amp.debugging.disable_tensor_checker()
-            _assert_flag(False)
 
 if __name__ == '__main__':
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
class TestTensorChecker(unittest.TestCase):
    def get_num_inf(self, e):
        num_nan = 0
        num_inf = 0
        # Cannot catch the log in CUDA kernel.
        err_str_list = (
            str(e)
            .replace("(", " ")
            .replace(")", " ")
            .replace(",", " ")
            .split(" ")
        )
        for err_str in err_str_list:
            if "num_nan" in err_str:
                num_nan = int(err_str.split("=")[1])
            elif "num_inf" in err_str:
                num_inf = int(err_str.split("=")[1])
        print(
            "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format(
                num_nan, num_inf
            )
        )
        return num_nan

    def generate_num_inf(self, place):
        num_inf = 0
        num_nan = 0
        paddle.set_device(place)
        # check op list
        x = paddle.to_tensor(
            [1, 0, 0],
            dtype='float32',
            stop_gradient=False,
        )
        y = paddle.to_tensor([0, 0, 1], dtype='float32')
        try:
            res = paddle.pow(x, y)
            # test backward
            paddle.autograd.backward([res])
            res = paddle.divide(y, x)
        except Exception as e:
            num_inf = self.get_num_inf(e)
        return num_inf

    def test_tensor_checker(self):
        def _assert_flag(value):
            flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
            res = paddle.get_flags(flags)
            assert res["FLAGS_check_nan_inf"] == value

        paddle.set_flags({"FLAGS_check_nan_inf": 0})
        paddle.seed(102)
        checker_config = paddle.amp.debugging.TensorCheckerConfig(
            enable=True,
            debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
            checked_op_list=["elementwise_pow_grad"],
            skipped_op_list=["elementwise_div"],
            debug_step=[0, 3],
        )
        places = ['cpu']
        if paddle.is_compiled_with_cuda():
            places.append('gpu')
        # check seed
        self.assertEqual(checker_config.initial_seed, 102)
        self.assertEqual(checker_config.seed, 102)
        _assert_flag(False)
        for place in places:
            paddle.amp.debugging.TensorCheckerConfig.current_step_id = 0
            for index in range(5):
                paddle.amp.debugging.enable_tensor_checker(checker_config)
                if index <= 2:
                    _assert_flag(True)
                    self.assertEqual(
                        index + 1,
                        paddle.amp.debugging.TensorCheckerConfig.current_step_id,
                    )
                    self.assertEqual(1, self.generate_num_inf(place))
                else:
                    self.assertEqual(
                        3,
                        paddle.amp.debugging.TensorCheckerConfig.current_step_id,
                    )
                    _assert_flag(False)
                    self.assertEqual(0, self.generate_num_inf(place))
                paddle.amp.debugging.disable_tensor_checker()
                _assert_flag(False)


if __name__ == '__main__':
    unittest.main()
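The get_num_inf() helper above recovers the nan/inf counts by tokenizing the abort message raised under CHECK_NAN_INF_AND_ABORT. A standalone sketch of that scan; the sample message is adapted from the docstring output quoted earlier (field values assumed):

msg = (
    "[PRECISION] [ERROR] in [device=cpu, op=elementwise_pow_grad, tensor=, dtype=fp32], "
    "numel=3, num_nan=1, num_inf=0, num_zero=0"
)
tokens = msg.replace("(", " ").replace(")", " ").replace(",", " ").split(" ")
num_nan = next(int(t.split("=")[1]) for t in tokens if "num_nan" in t)
num_inf = next(int(t.split("=")[1]) for t in tokens if "num_inf" in t)
assert (num_nan, num_inf) == (1, 0)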