diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 4eff406893757e9af0d2c4c987d989cee6fd2a0a..a19d603ada8257995592969769bfd6db7a50d5c0 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -144,7 +144,6 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
   try {
     const auto& runner_mean = paddle::operators::NpuOpRunner(
         "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
-    runner_mean.Run(stream);
     TensorToVector(mean, dev_ctx, &vec);
   } catch (...) {
     LOG(WARNING) << "ContainsNan catch exception";
@@ -240,8 +239,8 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
       case framework::proto::VarType::FP32: {
         if (FLAGS_hccl_check_nan) {
           VLOG(3) << "prepare to FoundNanInf";
-          found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
-          VLOG(3) << "check_numerics:" << found_nan;
+          // NOTE: performance relating, DO NOT REMOVE!
+          ContainsNan(*dev_ctx, dev_ctx->stream(), in);
         }
         break;
       }
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 0274a2cea8ef4eeb086c3bf3ca7be860c7b51ddc..135cf4e3997cf44c7d32dafbbe1ccde9425719e5 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -93,7 +93,7 @@ DEFINE_string(selected_npus, "",
               "This option is useful when doing multi process training and "
               "each process have only one device (NPU). If you want to use "
               "all visible devices, set this to empty string.");
-DEFINE_bool(hccl_check_nan, false,
+DEFINE_bool(hccl_check_nan, true,
             "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
             "core when meets Nan value");
 DEFINE_string(