diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 163e5610030f6d35f5efe74e7dde2f8a5b19c316..abf575b4ca5453776f787a98ade6f4d2b1e1dde5 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -25,8 +25,7 @@ #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" -DECLARE_bool(abort_on_nan_inf); -DECLARE_bool(check_tensor_max_min); +DECLARE_int32(check_nan_inf_level); namespace paddle { namespace framework { @@ -233,23 +232,46 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, tensor_block_mean_ptr); } -template +template ::value, bool> = true> +__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { + if (check_nan_inf_level >= 3) { + return true; + } else if (check_nan_inf_level >= 2) { + MT fp16_max = + static_cast(std::numeric_limits::max()); + return max_value > fp16_max || min_value < -fp16_max; + } + return false; +} + +template ::value, bool> = true> +__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { + if (check_nan_inf_level >= 3) { + return true; + } + return false; +} + +template __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, - const T* tensor_block_max_ptr, - const T* tensor_block_min_ptr, - const T* tensor_block_mean_ptr, + const MT* tensor_block_max_ptr, + const MT* tensor_block_min_ptr, + const MT* tensor_block_mean_ptr, const char* debug_info, int64_t numel, int64_t numel_max_min, - bool abort_on_nan_inf, - bool check_tensor_max_min) { + int check_nan_inf_level) { if (blockIdx.x == 0 && threadIdx.x == 0) { int has_nan = found_nan_inf_ptr[0]; int has_inf = found_nan_inf_ptr[1]; - T max_value = static_cast(0); - T min_value = static_cast(0); - T mean_value = static_cast(0); + MT max_value = static_cast(0); + MT min_value = static_cast(0); + MT mean_value = 
static_cast(0); if (tensor_block_max_ptr && tensor_block_min_ptr && tensor_block_mean_ptr) { max_value = tensor_block_max_ptr[0]; min_value = tensor_block_min_ptr[0]; @@ -257,9 +279,9 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, // numel_max_min <= 128 for (int64_t i = 1; i < numel_max_min; ++i) { - T tmp_max_value = tensor_block_max_ptr[i]; - T tmp_min_value = tensor_block_min_ptr[i]; - T tmp_mean_value = tensor_block_mean_ptr[i]; + MT tmp_max_value = tensor_block_max_ptr[i]; + MT tmp_min_value = tensor_block_min_ptr[i]; + MT tmp_mean_value = tensor_block_mean_ptr[i]; max_value = tmp_max_value > max_value ? tmp_max_value : max_value; min_value = tmp_min_value < min_value ? tmp_min_value : min_value; @@ -268,7 +290,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, } if (has_nan || has_inf) { - if (abort_on_nan_inf) { + if (check_nan_inf_level == 0) { PADDLE_ENFORCE(false, "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " "find_inf=%d, " @@ -280,7 +302,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, static_cast(max_value), static_cast(min_value), static_cast(mean_value)); - } else { + } else if (check_nan_inf_level >= 1) { printf( "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " "find_inf=%d, " @@ -293,7 +315,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, static_cast(min_value), static_cast(mean_value)); } - } else if (check_tensor_max_min) { + } else if (NeedPrint(max_value, min_value, check_nan_inf_level)) { printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n", debug_info, numel, @@ -423,9 +445,8 @@ void TensorCheckerVisitor::apply( tensor_block_min_ptr, tensor_block_mean_ptr); - bool abort_on_nan_inf = FLAGS_abort_on_nan_inf; - bool check_tensor_max_min = FLAGS_check_tensor_max_min; - FindGlobalMaxMinAndPrint + int check_nan_inf_level = FLAGS_check_nan_inf_level; + FindGlobalMaxMinAndPrint <<<1, 1, 0, 
dev_ctx->stream()>>>(found_nan_inf_ptr, tensor_block_max_ptr, tensor_block_min_ptr, @@ -433,8 +454,7 @@ void TensorCheckerVisitor::apply( gpu_str_ptr, tensor_.numel(), numel_max_min, - abort_on_nan_inf, - check_tensor_max_min); + check_nan_inf_level); #endif } diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 1649c0c0c1404ed65147c45c424a9bbe019275a4..d2d2089cee69df0777fb020ef0591eb0753bcc61 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -70,31 +70,24 @@ PADDLE_DEFINE_EXPORTED_bool( /** * Operator related FLAG - * Name: FLAGS_abort_on_nan_inf + * Name: FLAGS_check_nan_inf_level * Since Version: 2.5.0 - * Value Range: bool, default=true - * Example: - * Note: Used to debug. Whether abort the process when any operator produce - * NAN/INF. It only works when FLAGS_check_nan_inf is set. - */ -PADDLE_DEFINE_EXPORTED_bool( - abort_on_nan_inf, - true, - "Whether abort the process when any operator produce NAN/INF or not."); - -/** - * Operator related FLAG - * Name: FLAGS_check_tensor_max_min - * Since Version: 2.5.0 - * Value Range: bool, default=false + * Value Range: int32, default=0 * Example: - * Note: Used to debug. Enable to calculate and print the max and min value of - * each operator's output tensor. It only works when FLAGS_check_nan_inf is set. + * Note: Used to debug. Setting the check and print level when + * FLAGS_check_nan_inf is set. + * - 0, abort the process when any operator produces NAN/INF and only print the + * information of the tensor which holds NAN/INF. + * - 1, continue the training or inference process and print the information of + * all tensors which hold NAN/INF. + * - 2, print the information of float tensors when the max or min value + * overflows float16's limit. + * - 3, print the information of all tensors. 
*/ -PADDLE_DEFINE_EXPORTED_bool( - check_tensor_max_min, - false, - "Whether to check all the output tensors's min and max value."); +PADDLE_DEFINE_EXPORTED_int32( + check_nan_inf_level, + 0, + "Setting the check and print level when FLAGS_check_nan_inf is set."); /** * Operator related FLAG