未验证 提交 54bc3b46 编写于 作者: Y Yiqun Liu 提交者: GitHub

Use an unified FLAGS_check_nan_inf_level to control the result of checking infinite. (#47672)

上级 99504cbb
...@@ -25,8 +25,7 @@ ...@@ -25,8 +25,7 @@
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/math_cuda_utils.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h"
// Post-commit declaration: the two bool flags (abort_on_nan_inf,
// check_tensor_max_min) were merged into a single int32 level flag.
DECLARE_int32(check_nan_inf_level);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -233,23 +232,46 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, ...@@ -233,23 +232,46 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
tensor_block_mean_ptr); tensor_block_mean_ptr);
} }
// Decides whether a float tensor's statistics should be printed for the
// given check level:
//   - level >= 3: always print;
//   - level >= 2: print only when max/min overflows float16's finite range
//     (useful for spotting values that would not survive an fp16 cast);
//   - otherwise: do not print.
// max_value/min_value are the tensor's global max/min accumulated in MT.
template <typename T,
          typename MT,
          std::enable_if_t<std::is_same<T, float>::value, bool> = true>
__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) {
  if (check_nan_inf_level >= 3) {
    return true;
  } else if (check_nan_inf_level >= 2) {
    // Largest finite float16 value; beyond it an fp16 cast would overflow.
    MT fp16_max =
        static_cast<MT>(std::numeric_limits<phi::dtype::float16>::max());
    return max_value > fp16_max || min_value < -fp16_max;
  }
  return false;
}
// Non-float overload: only the most verbose level (>= 3) triggers printing;
// the max/min values themselves are not inspected for this data type.
template <typename T,
          typename MT,
          std::enable_if_t<!std::is_same<T, float>::value, bool> = true>
__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) {
  return check_nan_inf_level >= 3;
}
template <typename T, typename MT>
__global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
const T* tensor_block_max_ptr, const MT* tensor_block_max_ptr,
const T* tensor_block_min_ptr, const MT* tensor_block_min_ptr,
const T* tensor_block_mean_ptr, const MT* tensor_block_mean_ptr,
const char* debug_info, const char* debug_info,
int64_t numel, int64_t numel,
int64_t numel_max_min, int64_t numel_max_min,
bool abort_on_nan_inf, int check_nan_inf_level) {
bool check_tensor_max_min) {
if (blockIdx.x == 0 && threadIdx.x == 0) { if (blockIdx.x == 0 && threadIdx.x == 0) {
int has_nan = found_nan_inf_ptr[0]; int has_nan = found_nan_inf_ptr[0];
int has_inf = found_nan_inf_ptr[1]; int has_inf = found_nan_inf_ptr[1];
T max_value = static_cast<T>(0); MT max_value = static_cast<MT>(0);
T min_value = static_cast<T>(0); MT min_value = static_cast<MT>(0);
T mean_value = static_cast<T>(0); MT mean_value = static_cast<MT>(0);
if (tensor_block_max_ptr && tensor_block_min_ptr && tensor_block_mean_ptr) { if (tensor_block_max_ptr && tensor_block_min_ptr && tensor_block_mean_ptr) {
max_value = tensor_block_max_ptr[0]; max_value = tensor_block_max_ptr[0];
min_value = tensor_block_min_ptr[0]; min_value = tensor_block_min_ptr[0];
...@@ -257,9 +279,9 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, ...@@ -257,9 +279,9 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
// numel_max_min <= 128 // numel_max_min <= 128
for (int64_t i = 1; i < numel_max_min; ++i) { for (int64_t i = 1; i < numel_max_min; ++i) {
T tmp_max_value = tensor_block_max_ptr[i]; MT tmp_max_value = tensor_block_max_ptr[i];
T tmp_min_value = tensor_block_min_ptr[i]; MT tmp_min_value = tensor_block_min_ptr[i];
T tmp_mean_value = tensor_block_mean_ptr[i]; MT tmp_mean_value = tensor_block_mean_ptr[i];
max_value = tmp_max_value > max_value ? tmp_max_value : max_value; max_value = tmp_max_value > max_value ? tmp_max_value : max_value;
min_value = tmp_min_value < min_value ? tmp_min_value : min_value; min_value = tmp_min_value < min_value ? tmp_min_value : min_value;
...@@ -268,7 +290,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, ...@@ -268,7 +290,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
} }
if (has_nan || has_inf) { if (has_nan || has_inf) {
if (abort_on_nan_inf) { if (check_nan_inf_level == 0) {
PADDLE_ENFORCE(false, PADDLE_ENFORCE(false,
"===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, "
"find_inf=%d, " "find_inf=%d, "
...@@ -280,7 +302,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, ...@@ -280,7 +302,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
static_cast<float>(max_value), static_cast<float>(max_value),
static_cast<float>(min_value), static_cast<float>(min_value),
static_cast<float>(mean_value)); static_cast<float>(mean_value));
} else { } else if (check_nan_inf_level >= 1) {
printf( printf(
"===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, "
"find_inf=%d, " "find_inf=%d, "
...@@ -293,7 +315,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, ...@@ -293,7 +315,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
static_cast<float>(min_value), static_cast<float>(min_value),
static_cast<float>(mean_value)); static_cast<float>(mean_value));
} }
} else if (check_tensor_max_min) { } else if (NeedPrint<T, MT>(max_value, min_value, check_nan_inf_level)) {
printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n", printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n",
debug_info, debug_info,
numel, numel,
...@@ -423,9 +445,8 @@ void TensorCheckerVisitor<phi::GPUContext>::apply( ...@@ -423,9 +445,8 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
tensor_block_min_ptr, tensor_block_min_ptr,
tensor_block_mean_ptr); tensor_block_mean_ptr);
bool abort_on_nan_inf = FLAGS_abort_on_nan_inf; int check_nan_inf_level = FLAGS_check_nan_inf_level;
bool check_tensor_max_min = FLAGS_check_tensor_max_min; FindGlobalMaxMinAndPrint<T, MT>
FindGlobalMaxMinAndPrint<MT>
<<<1, 1, 0, dev_ctx->stream()>>>(found_nan_inf_ptr, <<<1, 1, 0, dev_ctx->stream()>>>(found_nan_inf_ptr,
tensor_block_max_ptr, tensor_block_max_ptr,
tensor_block_min_ptr, tensor_block_min_ptr,
...@@ -433,8 +454,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply( ...@@ -433,8 +454,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
gpu_str_ptr, gpu_str_ptr,
tensor_.numel(), tensor_.numel(),
numel_max_min, numel_max_min,
abort_on_nan_inf, check_nan_inf_level);
check_tensor_max_min);
#endif #endif
} }
......
...@@ -70,31 +70,24 @@ PADDLE_DEFINE_EXPORTED_bool( ...@@ -70,31 +70,24 @@ PADDLE_DEFINE_EXPORTED_bool(
/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf_level
 * Since Version: 2.5.0
 * Value Range: int32, default=0
 * Example:
 * Note: Used to debug. Setting the check and print level when
 * FLAGS_check_nan_inf is set.
 * - 0, abort the process when any operator produce NAN/INF and only print the
 * information of tensor which holds NAN/INF.
 * - 1, continue the training or inference process and print the information of
 * all tensors which holds NAN/INF.
 * - 2, print the information of float tensors when the max or min value
 * overflowing float16's limit.
 * - 3, print the information of all tensors.
 */
PADDLE_DEFINE_EXPORTED_int32(
    check_nan_inf_level,
    0,
    "Setting the check and print level when FLAGS_check_nan_inf is set.");
/** /**
* Operator related FLAG * Operator related FLAG
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册