diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
index 57552a16cc5f4920ff0ff82aaf316fbfb22098f5..163e5610030f6d35f5efe74e7dde2f8a5b19c316 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -12,15 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
+
 #include <algorithm>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/kernels/funcs/math_cuda_utils.h"
+
+DECLARE_bool(abort_on_nan_inf);
+DECLARE_bool(check_tensor_max_min);
 
 namespace paddle {
 namespace framework {
@@ -133,6 +139,171 @@ __global__ void CheckNanInfKernel(const T* value,
   PrintNanInfKernel(value, numel, print_num, debug_info);
 }
 
+template <
+    typename T,
+    std::enable_if_t<std::is_same<T, phi::dtype::complex<float>>::value ||
+                         std::is_same<T, phi::dtype::complex<double>>::value,
+                     bool> = true>
+__device__ void BlockReduceMaxMinAndWrite(const T max_value,
+                                          const T min_value,
+                                          const T mean_value,
+                                          int64_t offset,
+                                          T* max_ptr,
+                                          T* min_ptr,
+                                          T* mean_ptr) {
+  // TODO(Xreki): support complex
+}
+
+template <
+    typename T,
+    std::enable_if_t<!std::is_same<T, phi::dtype::complex<float>>::value &&
+                         !std::is_same<T, phi::dtype::complex<double>>::value,
+                     bool> = true>
+__device__ void BlockReduceMaxMinAndWrite(const T max_value,
+                                          const T min_value,
+                                          const T mean_value,
+                                          int64_t offset,
+                                          T* max_ptr,
+                                          T* min_ptr,
+                                          T* mean_ptr) {
+  if (max_ptr && min_ptr && mean_ptr) {
+    __syncthreads();
+
+    T block_max_value = phi::funcs::blockReduceMax(max_value, FINAL_MASK);
+    T block_min_value = phi::funcs::blockReduceMin(min_value, FINAL_MASK);
+    T block_mean_value = phi::funcs::blockReduceSum(mean_value, FINAL_MASK);
+
+    if (threadIdx.x == 0) {
+      max_ptr[offset] = block_max_value;
+      min_ptr[offset] = block_min_value;
+      mean_ptr[offset] = block_mean_value;
+    }
+  }
+}
+
+template <typename T, typename MT>
+__global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
+                                         const int64_t numel,
+                                         int* found_nan_inf_ptr,
+                                         MT* tensor_block_max_ptr,
+                                         MT* tensor_block_min_ptr,
+                                         MT* tensor_block_mean_ptr) {
+  bool has_nan = false;
+  bool has_inf = false;
+
+  int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
+
+  MT max_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]);
+  MT min_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]);
+  MT mean_value = static_cast<MT>(0);
+  for (; i < numel; i += blockDim.x * gridDim.x) {
+    MT value = static_cast<MT>(value_ptr[i]);
+
+    max_value = value > max_value ? value : max_value;
+    min_value = value < min_value ? value : min_value;
+    mean_value += value / static_cast<MT>(numel);
+
+    if (isnan(value)) {
+      has_nan = true;
+    }
+    if (isinf(value)) {
+      has_inf = true;
+    }
+
+    if (has_nan || has_inf) {
+      if (!tensor_block_max_ptr && !tensor_block_min_ptr &&
+          !tensor_block_mean_ptr) {
+        break;
+      }
+    }
+  }
+  if (has_nan) {
+    found_nan_inf_ptr[0] = 1;
+  }
+  if (has_inf) {
+    found_nan_inf_ptr[1] = 1;
+  }
+
+  BlockReduceMaxMinAndWrite(max_value,
+                            min_value,
+                            mean_value,
+                            blockIdx.x,
+                            tensor_block_max_ptr,
+                            tensor_block_min_ptr,
+                            tensor_block_mean_ptr);
+}
+
+template <typename T>
+__global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
+                                         const T* tensor_block_max_ptr,
+                                         const T* tensor_block_min_ptr,
+                                         const T* tensor_block_mean_ptr,
+                                         const char* debug_info,
+                                         int64_t numel,
+                                         int64_t numel_max_min,
+                                         bool abort_on_nan_inf,
+                                         bool check_tensor_max_min) {
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    int has_nan = found_nan_inf_ptr[0];
+    int has_inf = found_nan_inf_ptr[1];
+
+    T max_value = static_cast<T>(0);
+    T min_value = static_cast<T>(0);
+    T mean_value = static_cast<T>(0);
+    if (tensor_block_max_ptr && tensor_block_min_ptr && tensor_block_mean_ptr) {
+      max_value = tensor_block_max_ptr[0];
+      min_value = tensor_block_min_ptr[0];
+      mean_value = tensor_block_mean_ptr[0];
+
+      // numel_max_min <= 128
+      for (int64_t i = 1; i < numel_max_min; ++i) {
+        T tmp_max_value = tensor_block_max_ptr[i];
+        T tmp_min_value = tensor_block_min_ptr[i];
+        T tmp_mean_value = tensor_block_mean_ptr[i];
+
+        max_value = tmp_max_value > max_value ? tmp_max_value : max_value;
+        min_value = tmp_min_value < min_value ? tmp_min_value : min_value;
+        mean_value += tmp_mean_value;
+      }
+    }
+
+    if (has_nan || has_inf) {
+      if (abort_on_nan_inf) {
+        PADDLE_ENFORCE(false,
+                       "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, "
+                       "find_inf=%d, "
+                       "max=%e, min=%e, mean=%e===\n",
+                       debug_info,
+                       numel,
+                       has_nan,
+                       has_inf,
+                       static_cast<float>(max_value),
+                       static_cast<float>(min_value),
+                       static_cast<float>(mean_value));
+      } else {
+        printf(
+            "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, "
+            "find_inf=%d, "
+            "max=%e, min=%e, mean=%e===\n",
+            debug_info,
+            numel,
+            has_nan,
+            has_inf,
+            static_cast<float>(max_value),
+            static_cast<float>(min_value),
+            static_cast<float>(mean_value));
+      }
+    } else if (check_tensor_max_min) {
+      printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n",
+             debug_info,
+             numel,
+             static_cast<float>(max_value),
+             static_cast<float>(min_value),
+             static_cast<float>(mean_value));
+    }
+  }
+}
+
 template <>
 template <typename T>
 void TensorCheckerVisitor<phi::GPUContext>::apply(
@@ -141,8 +312,6 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
         std::is_same<T, ::paddle::platform::complex<float>>::value ||
         std::is_same<T, ::paddle::platform::complex<double>>::value>::type*)
     const {
-  int print_num = 3;
-
   auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
       platform::DeviceContextPool::Instance().Get(tensor_.place()));
   int dev_id = tensor_.place().device;
@@ -152,7 +321,12 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
       platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d",
                                    multi_op_var2gpu_str_mutex().size()));
 
-  std::string op_var = "[op=" + op_type_ + "] [tensor=" + var_name_ + "]";
+  std::string dtype_str = DataTypeToString(DataTypeTrait<T>::DataType());
+  if (dtype_str == "::paddle::platform::float16") {
+    dtype_str = "float16";
+  }
+  std::string op_var = "[op=" + op_type_ + "] [tensor=" + var_name_ +
+                       "] [dtype=" + dtype_str + "]";
   char* gpu_str_ptr = NULL;
 
   {
@@ -212,6 +386,8 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
       std::min(static_cast<size_t>(128),
                static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
 #ifdef __HIPCC__
+  int print_num = 3;
+
   hipLaunchKernelGGL(CheckNanInfKernel,
                      dim3(blocks),
                     dim3(threads),
@@ -222,8 +398,43 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
                      print_num,
                      gpu_str_ptr);
 #else
-  CheckNanInfKernel<<<blocks, threads, 0, dev_ctx->stream()>>>(
-      tensor_.data<T>(), tensor_.numel(), print_num, gpu_str_ptr);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  phi::DenseTensor found_nan_inf;
+  found_nan_inf.Resize({2});
+  int* found_nan_inf_ptr = found_nan_inf.mutable_data<int>(tensor_.place());
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+      found_nan_inf_ptr, 0, 2 * sizeof(int), dev_ctx->stream()));
+
+  int64_t numel_max_min = blocks;
+
+  phi::DenseTensor tensor_block_max_min;
+  tensor_block_max_min.Resize({static_cast<int64_t>(3 * numel_max_min)});
+  MT* tensor_block_max_ptr =
+      tensor_block_max_min.mutable_data<MT>(tensor_.place());
+  MT* tensor_block_min_ptr = tensor_block_max_ptr + numel_max_min;
+  MT* tensor_block_mean_ptr = tensor_block_max_ptr + 2 * numel_max_min;
+
+  FindNanInfAndBlockMaxMin<T, MT>
+      <<<blocks, threads, 0, dev_ctx->stream()>>>(tensor_.data<T>(),
+                                                  tensor_.numel(),
+                                                  found_nan_inf_ptr,
+                                                  tensor_block_max_ptr,
+                                                  tensor_block_min_ptr,
+                                                  tensor_block_mean_ptr);
+
+  bool abort_on_nan_inf = FLAGS_abort_on_nan_inf;
+  bool check_tensor_max_min = FLAGS_check_tensor_max_min;
+  FindGlobalMaxMinAndPrint<MT>
+      <<<1, 1, 0, dev_ctx->stream()>>>(found_nan_inf_ptr,
+                                       tensor_block_max_ptr,
+                                       tensor_block_min_ptr,
+                                       tensor_block_mean_ptr,
+                                       gpu_str_ptr,
+                                       tensor_.numel(),
+                                       numel_max_min,
+                                       abort_on_nan_inf,
+                                       check_tensor_max_min);
 #endif
 }
 
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 23ecfecbbd2d931054fb75c870d00ea6fd04c2de..1649c0c0c1404ed65147c45c424a9bbe019275a4 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -68,6 +68,34 @@ PADDLE_DEFINE_EXPORTED_bool(
     "Checking whether operator produce NAN/INF or not. It will be "
     "extremely slow so please use this flag wisely.");
 
+/**
+ * Operator related FLAG
+ * Name: FLAGS_abort_on_nan_inf
+ * Since Version: 2.5.0
+ * Value Range: bool, default=true
+ * Example:
+ * Note: Used to debug. Whether to abort the process when any operator
+ * produces NAN/INF. It only works when FLAGS_check_nan_inf is set.
+ */
+PADDLE_DEFINE_EXPORTED_bool(
+    abort_on_nan_inf,
+    true,
+    "Whether to abort the process when any operator produces NAN/INF or not.");
+
+/**
+ * Operator related FLAG
+ * Name: FLAGS_check_tensor_max_min
+ * Since Version: 2.5.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: Used to debug. Enable to calculate and print the max and min values
+ * of each operator's output tensor. It only works when FLAGS_check_nan_inf is set.
+ */
+PADDLE_DEFINE_EXPORTED_bool(
+    check_tensor_max_min,
+    false,
+    "Whether to check all output tensors' min and max values.");
+
 /**
  * Operator related FLAG
  * Name: FLAGS_check_nan_inf
diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py
index f062adb6a7a43ea690625e36a99c161feddb14a9..39c01f053f536ee6f0c2192280cd77e873660f67 100644
--- a/python/paddle/fluid/tests/unittests/test_nan_inf.py
+++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py
@@ -47,7 +47,7 @@ class TestNanInf(unittest.TestCase):
 
         # in python3, type(out+err) is 'bytes', need use encode
         if paddle.fluid.core.is_compiled_with_cuda():
-            assert (out + err).find('find nan or inf==='.encode()) != -1
+            assert (out + err).find('find_nan=1, find_inf=1'.encode()) != -1
         else:
             assert (out + err).find(
                 'There are `nan` or `inf` in tensor'.encode()
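
Usage note (editor's addition, not part of the patch): a minimal sketch of how the two new flags are expected to be combined with the existing FLAGS_check_nan_inf check. It assumes the flags are visible to Python through paddle.set_flags / FLAGS_* environment variables in the same way as other PADDLE_DEFINE_EXPORTED_bool flags; the tensor values below are illustrative only.

    # Shell form (assumed):
    #   FLAGS_check_nan_inf=1 FLAGS_abort_on_nan_inf=0 FLAGS_check_tensor_max_min=1 python train.py
    import paddle

    paddle.set_flags({
        'FLAGS_check_nan_inf': True,         # existing flag: enable the per-op check
        'FLAGS_abort_on_nan_inf': False,     # new flag: print the report instead of aborting
        'FLAGS_check_tensor_max_min': True,  # new flag: also print max/min/mean per tensor
    })

    x = paddle.to_tensor([1.0, 0.0, -1.0])
    y = paddle.to_tensor([0.0, 0.0, 0.0])
    # 1/0 and 0/0 produce Inf and NaN; on a CUDA build the check reports
    # "[PRECISION] [ERROR] ... find_nan=1, find_inf=1 ..." for the divide op,
    # which is the substring test_nan_inf.py now searches for.
    z = x / y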