diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 1076e84e613f4ae9577a2ab9200e6821be847f0f..4eff406893757e9af0d2c4c987d989cee6fd2a0a 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -45,6 +45,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/hccl_helper.h"
 #endif
 
+#if defined(PADDLE_WITH_ASCEND_CL)
+DECLARE_bool(hccl_check_nan);
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -140,6 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
   try {
     const auto& runner_mean = paddle::operators::NpuOpRunner(
         "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
+    runner_mean.Run(stream);
     TensorToVector(mean, dev_ctx, &vec);
   } catch (...) {
     LOG(WARNING) << "ContainsNan catch exception";
@@ -233,9 +238,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel {
         break;
       }
       case framework::proto::VarType::FP32: {
-        VLOG(4) << "prepare to FoundNanInf";
-        found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
-        VLOG(4) << "check_numerics:" << found_nan;
+        if (FLAGS_hccl_check_nan) {
+          VLOG(3) << "prepare to FoundNanInf";
+          found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
+          VLOG(3) << "check_numerics:" << found_nan;
+        }
         break;
       }
       default:
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 33d9c6efef852d0298a27eca6dfacdd0df18f159..f18eab3246547b6661a46504b30ea2428ccc8997 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "",
               "This option is useful when doing multi process training and "
               "each process have only one device (NPU). If you want to use "
               "all visible devices, set this to empty string.");
+DEFINE_bool(hccl_check_nan, false,
+            "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
+            "core when meets Nan value");
 DEFINE_string(
     npu_config_path, "",
     "The absolute path of configuration json file, like: /tmp/config.json. "
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 5d1274a1f05324f0d3187a4802c564aad8ed2314..8bb4d82b72478525c22c0be2433a2ebed7f0a974 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -248,6 +248,7 @@ def __bootstrap__():
         'gpu_memory_limit_mb',
         'npu_config_path',
         'get_host_by_name_time',
+        'hccl_check_nan',
     ]
 
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])