未验证 提交 54bc3b46 编写于 作者: Y Yiqun Liu 提交者: GitHub

Use an unified FLAGS_check_nan_inf_level to control the result of checking infinite. (#47672)

上级 99504cbb
......@@ -25,8 +25,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/math_cuda_utils.h"
DECLARE_bool(abort_on_nan_inf);
DECLARE_bool(check_tensor_max_min);
DECLARE_int32(check_nan_inf_level);
namespace paddle {
namespace framework {
......@@ -233,23 +232,46 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
tensor_block_mean_ptr);
}
template <typename T>
// Decides whether a float tensor's statistics should be printed for the
// given FLAGS_check_nan_inf_level. This overload is selected (via SFINAE)
// only when T is float:
//   - level >= 3: always print.
//   - level >= 2: print only when max/min exceed float16's representable
//     range (useful for spotting values that would overflow after an AMP
//     cast to fp16).
//   - otherwise: do not print.
template <typename T,
          typename MT,
          std::enable_if_t<std::is_same<T, float>::value, bool> = true>
__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) {
  if (check_nan_inf_level >= 3) {
    return true;
  }
  if (check_nan_inf_level >= 2) {
    // Largest finite value representable in float16, widened to MT for the
    // comparison below.
    const MT fp16_limit =
        static_cast<MT>(std::numeric_limits<phi::dtype::float16>::max());
    return min_value < -fp16_limit || max_value > fp16_limit;
  }
  return false;
}
// Fallback overload for all non-float element types: their statistics are
// printed only at the most verbose level (>= 3); the float16-overflow check
// of level 2 applies exclusively to float tensors (see the float overload).
template <typename T,
          typename MT,
          std::enable_if_t<!std::is_same<T, float>::value, bool> = true>
__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) {
  return check_nan_inf_level >= 3;
}
template <typename T, typename MT>
__global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
const T* tensor_block_max_ptr,
const T* tensor_block_min_ptr,
const T* tensor_block_mean_ptr,
const MT* tensor_block_max_ptr,
const MT* tensor_block_min_ptr,
const MT* tensor_block_mean_ptr,
const char* debug_info,
int64_t numel,
int64_t numel_max_min,
bool abort_on_nan_inf,
bool check_tensor_max_min) {
int check_nan_inf_level) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
int has_nan = found_nan_inf_ptr[0];
int has_inf = found_nan_inf_ptr[1];
T max_value = static_cast<T>(0);
T min_value = static_cast<T>(0);
T mean_value = static_cast<T>(0);
MT max_value = static_cast<MT>(0);
MT min_value = static_cast<MT>(0);
MT mean_value = static_cast<MT>(0);
if (tensor_block_max_ptr && tensor_block_min_ptr && tensor_block_mean_ptr) {
max_value = tensor_block_max_ptr[0];
min_value = tensor_block_min_ptr[0];
......@@ -257,9 +279,9 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
// numel_max_min <= 128
for (int64_t i = 1; i < numel_max_min; ++i) {
T tmp_max_value = tensor_block_max_ptr[i];
T tmp_min_value = tensor_block_min_ptr[i];
T tmp_mean_value = tensor_block_mean_ptr[i];
MT tmp_max_value = tensor_block_max_ptr[i];
MT tmp_min_value = tensor_block_min_ptr[i];
MT tmp_mean_value = tensor_block_mean_ptr[i];
max_value = tmp_max_value > max_value ? tmp_max_value : max_value;
min_value = tmp_min_value < min_value ? tmp_min_value : min_value;
......@@ -268,7 +290,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
}
if (has_nan || has_inf) {
if (abort_on_nan_inf) {
if (check_nan_inf_level == 0) {
PADDLE_ENFORCE(false,
"===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, "
"find_inf=%d, "
......@@ -280,7 +302,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
static_cast<float>(max_value),
static_cast<float>(min_value),
static_cast<float>(mean_value));
} else {
} else if (check_nan_inf_level >= 1) {
printf(
"===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, "
"find_inf=%d, "
......@@ -293,7 +315,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr,
static_cast<float>(min_value),
static_cast<float>(mean_value));
}
} else if (check_tensor_max_min) {
} else if (NeedPrint<T, MT>(max_value, min_value, check_nan_inf_level)) {
printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n",
debug_info,
numel,
......@@ -423,9 +445,8 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
tensor_block_min_ptr,
tensor_block_mean_ptr);
bool abort_on_nan_inf = FLAGS_abort_on_nan_inf;
bool check_tensor_max_min = FLAGS_check_tensor_max_min;
FindGlobalMaxMinAndPrint<MT>
int check_nan_inf_level = FLAGS_check_nan_inf_level;
FindGlobalMaxMinAndPrint<T, MT>
<<<1, 1, 0, dev_ctx->stream()>>>(found_nan_inf_ptr,
tensor_block_max_ptr,
tensor_block_min_ptr,
......@@ -433,8 +454,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
gpu_str_ptr,
tensor_.numel(),
numel_max_min,
abort_on_nan_inf,
check_tensor_max_min);
check_nan_inf_level);
#endif
}
......
......@@ -70,31 +70,24 @@ PADDLE_DEFINE_EXPORTED_bool(
/**
* Operator related FLAG
* Name: FLAGS_abort_on_nan_inf
* Name: FLAGS_check_nan_inf_level
* Since Version: 2.5.0
* Value Range: bool, default=true
* Example:
* Note: Used to debug. Whether abort the process when any operator produce
* NAN/INF. It only works when FLAGS_check_nan_inf is set.
*/
PADDLE_DEFINE_EXPORTED_bool(
abort_on_nan_inf,
true,
"Whether abort the process when any operator produce NAN/INF or not.");
/**
* Operator related FLAG
* Name: FLAGS_check_tensor_max_min
* Since Version: 2.5.0
* Value Range: bool, default=false
* Value Range: int32, default=0
* Example:
* Note: Used to debug. Enable to calculate and print the max and min value of
* each operator's output tensor. It only works when FLAGS_check_nan_inf is set.
 * Note: Used for debugging. Sets the check-and-print level that applies when
 * FLAGS_check_nan_inf is set.
 * - 0, abort the process when any operator produces NAN/INF and only print the
 * information of the tensor which holds NAN/INF.
 * - 1, continue the training or inference process and print the information of
 * all tensors which hold NAN/INF.
* - 2, print the information of float tensors when the max or min value
* overflowing float16's limit.
* - 3, print the information of all tensors.
*/
PADDLE_DEFINE_EXPORTED_bool(
check_tensor_max_min,
false,
"Whether to check all the output tensors's min and max value.");
PADDLE_DEFINE_EXPORTED_int32(
check_nan_inf_level,
0,
"Setting the check and print level when FLAGS_check_nan_inf is set.");
/**
* Operator related FLAG
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册