未验证 提交 54bc3b46 编写于 作者: Y Yiqun Liu 提交者: GitHub

Use an unified FLAGS_check_nan_inf_level to control the result of checking infinite. (#47672)

上级 99504cbb
......@@ -25,8 +25,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/math_cuda_utils.h"
DECLARE_bool(abort_on_nan_inf);
DECLARE_bool(check_tensor_max_min);
DECLARE_int32(check_nan_inf_level);
namespace paddle {
namespace framework {
......@@ -233,23 +232,46 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
tensor_block_mean_ptr);
}
template <typename T>
// Decides whether a float tensor's statistics should be printed for the
// given FLAGS_check_nan_inf_level. This overload is selected (via SFINAE)
// only when T is float:
//   - level >= 3: always print.
//   - level >= 2: print only when max/min exceed float16's representable
//     range (useful for spotting values that would overflow after an AMP
//     cast to fp16).
//   - otherwise: do not print.
template <typename T,
          typename MT,
          std::enable_if_t<std::is_same<T, float>::value, bool> = true>
__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) {
  if (check_nan_inf_level >= 3) {
    return true;
  }
  if (check_nan_inf_level >= 2) {
    // Largest finite value representable in float16, widened to MT for the
    // comparison below.
    const MT fp16_limit =
        static_cast<MT>(std::numeric_limits<phi::dtype::float16>::max());
    return min_value < -fp16_limit || max_value > fp16_limit;
  }
  return false;
}
// Fallback overload for all non-float element types: their statistics are
// printed only at the most verbose level (>= 3); the float16-overflow check
// of level 2 applies exclusively to float tensors (see the float overload).
template <typename T,
          typename MT,
          std::enable_if_t<!std::is_same<T, float>::value, bool> = true>
__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) {
  return check_nan_inf_level >= 3;
}
template <typename T, typename MT>
__global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
const T* tensor_block_max_ptr,
const T* tensor_block_min_ptr,
const T* tensor_block_mean_ptr,
const MT* tensor_block_max_ptr,
const MT* tensor_block_min_ptr,
const MT* tensor_block_mean_ptr,
const char* debug_info,
int64_t numel,
int64_t numel_max_min,
bool abort_on_nan_inf,
bool check_tensor_max_min) {
int check_nan_inf_level) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
int has_nan = found_nan_inf_ptr[0];
int has_inf = found_nan_inf_ptr[1];
T max_value = static_cast<T>(0);
T min_value = static_cast<T>(0);
T mean_value = static_cast<T>(0);
MT max_value = static_cast<MT>(0);
MT min_value = static_cast<MT>(0);
MT mean_value = static_cast<MT>(0);
if (tensor_block_max_ptr && tensor_block_min_ptr && tensor_block_mean_ptr) {
max_value = tensor_block_max_ptr[0];
min_value = tensor_block_min_ptr[0];
......@@ -257,9 +279,9 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
// numel_max_min <= 128
for (int64_t i = 1; i < numel_max_min; ++i) {
T tmp_max_value = tensor_block_max_ptr[i];
T tmp_min_value = tensor_block_min_ptr[i];
T tmp_mean_value = tensor_block_mean_ptr[i];
MT tmp_max_value = tensor_block_max_ptr[i];
MT tmp_min_value = tensor_block_min_ptr[i];
MT tmp_mean_value = tensor_block_mean_ptr[i];
max_value = tmp_max_value > max_value ? tmp_max_value : max_value;
min_value = tmp_min_value < min_value ? tmp_min_value : min_value;
......@@ -268,7 +290,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
}
if (has_nan || has_inf) {
if (abort_on_nan_inf) {
if (check_nan_inf_level == 0) {
PADDLE_ENFORCE(false,
"===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, "
"find_inf=%d, "
......@@ -280,7 +302,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
static_cast<float>(max_value),
static_cast<float>(min_value),
static_cast<float>(mean_value));
} else {
} else if (check_nan_inf_level >= 1) {
printf(
"===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, "
"find_inf=%d, "
......@@ -293,7 +315,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
static_cast<float>(min_value),
static_cast<float>(mean_value));
}
} else if (check_tensor_max_min) {
} else if (NeedPrint<T, MT>(max_value, min_value, check_nan_inf_level)) {
printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n",
debug_info,
numel,
......@@ -423,9 +445,8 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
tensor_block_min_ptr,
tensor_block_mean_ptr);
bool abort_on_nan_inf = FLAGS_abort_on_nan_inf;
bool check_tensor_max_min = FLAGS_check_tensor_max_min;
FindGlobalMaxMinAndPrint<MT>
int check_nan_inf_level = FLAGS_check_nan_inf_level;
FindGlobalMaxMinAndPrint<T, MT>
<<<1, 1, 0, dev_ctx->stream()>>>(found_nan_inf_ptr,
tensor_block_max_ptr,
tensor_block_min_ptr,
......@@ -433,8 +454,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
gpu_str_ptr,
tensor_.numel(),
numel_max_min,
abort_on_nan_inf,
check_tensor_max_min);
check_nan_inf_level);
#endif
}
......
......@@ -70,31 +70,24 @@ PADDLE_DEFINE_EXPORTED_bool(
/**
* Operator related FLAG
* Name: FLAGS_abort_on_nan_inf
* Name: FLAGS_check_nan_inf_level
* Since Version: 2.5.0
* Value Range: bool, default=true
* Example:
* Note: Used to debug. Whether abort the process when any operator produce
* NAN/INF. It only works when FLAGS_check_nan_inf is set.
*/
PADDLE_DEFINE_EXPORTED_bool(
abort_on_nan_inf,
true,
"Whether abort the process when any operator produce NAN/INF or not.");
/**
* Operator related FLAG
* Name: FLAGS_check_tensor_max_min
* Since Version: 2.5.0
* Value Range: bool, default=false
* Value Range: int32, default=0
* Example:
* Note: Used to debug. Enable to calculate and print the max and min value of
* each operator's output tensor. It only works when FLAGS_check_nan_inf is set.
 * Note: Used for debugging. Sets the check-and-print level that applies when
 * FLAGS_check_nan_inf is set.
 * - 0, abort the process when any operator produces NAN/INF and only print the
 * information of the tensor which holds NAN/INF.
 * - 1, continue the training or inference process and print the information of
 * all tensors which hold NAN/INF.
* - 2, print the information of float tensors when the max or min value
* overflowing float16's limit.
* - 3, print the information of all tensors.
*/
PADDLE_DEFINE_EXPORTED_bool(
check_tensor_max_min,
false,
"Whether to check all the output tensors's min and max value.");
PADDLE_DEFINE_EXPORTED_int32(
check_nan_inf_level,
0,
"Setting the check and print level when FLAGS_check_nan_inf is set.");
/**
* Operator related FLAG
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册