Add a flag to control whether to check for NaN values in hccl_allreduce_sum. (#35093) (#35298)

Co-authored-by: N gongweibao <weibao.gong@gmail.com>

Add a flag to control whether to check for NaN values in hccl_allreduce_sum. (#35093) (#35298)
Co-authored-by: N gongweibao <weibao.gong@gmail.com>
d4948bc1 · Roc · GitHub · b36fb036 · d4948bc1 · d4948bc1
3 changed files
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -45,6 +45,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/hccl_helper.h"
 #endif
+#if defined(PADDLE_WITH_ASCEND_CL)
+DECLARE_bool(hccl_check_nan);
+#endif
 namespace paddle {
 namespace operators {
@@ -140,6 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
  try {
    const auto& runner_mean = paddle::operators::NpuOpRunner(
        "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
+    runner_mean.Run(stream);
    TensorToVector(mean, dev_ctx, &vec);
  } catch (...) {
    LOG(WARNING) << "ContainsNan catch exception";
@@ -233,9 +238,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
        break;
      }
      case framework::proto::VarType::FP32: {
-        VLOG(4) << "prepare to FoundNanInf";
+        if (FLAGS_hccl_check_nan) {
-        found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
+          VLOG(3) << "prepare to FoundNanInf";
-        VLOG(4) << "check_numerics:" << found_nan;
+          found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
+          VLOG(3) << "check_numerics:" << found_nan;
+        }
        break;
      }
      default:

--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "",
              "This option is useful when doing multi process training and "
              "each process have only one device (NPU). If you want to use "
              "all visible devices, set this to empty string.");
+DEFINE_bool(hccl_check_nan, false,
+            "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
+            "core when meets Nan value");
 DEFINE_string(
    npu_config_path, "",
    "The absolute path of configuration json file, like: /tmp/config.json. "

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -248,6 +248,7 @@ def __bootstrap__():
            'gpu_memory_limit_mb',
            'npu_config_path',
            'get_host_by_name_time',
+            'hccl_check_nan',
        ]
    core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])