diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index 3f210219608fb7efa740ce2d4a52c736acdfdcc9..bd88c8f9cd2b40d30a16b10e56aadb556fe91b06 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -59,6 +59,8 @@ if(WITH_ASCEND_CL)
         DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
     cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
         DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+    cc_test(checknumeric SRCS checknumeric_npu_test.cc
+        DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
     cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
         DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
     cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 3c51c65b073904aedaa8b4c6777aaecc7bc223c2..1076e84e613f4ae9577a2ab9200e6821be847f0f 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
 };
 
 #if defined(PADDLE_WITH_ASCEND_CL)
-// return true if found_inf_or_nan or return false;
-template <typename T>
-bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
-                   aclrtStream stream, const paddle::framework::Tensor* in) {
-  auto& dev_ctx =
-      exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
+// Returns true if the mean of all elements of `in` is NaN, or if computing the
+// mean throws. An Inf mean is only logged; the function still returns false.
+inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
+                        aclrtStream stream,
+                        const paddle::framework::Tensor* in) {
   using Tensor = paddle::framework::Tensor;
-  Tensor out(in->type());
-  out.Resize(in->dims());
-  out.mutable_data<T>(dev_ctx.GetPlace());
 
-  bool found_inf_data = false;
+  Tensor mean(in->type());
+  mean.Resize({1});
+  mean.mutable_data<float>(dev_ctx.GetPlace());  // NOTE(review): FP32 only?
+  std::vector<int> axes;
+  for (int i = 0; i < in->dims().size(); ++i) {
+    axes.push_back(i);
+  }
 
+  std::vector<float> vec;
   try {
-    const auto& runner =
-        NpuOpRunner("CheckNumerics", {*in}, {out},
-                    {{"message", std::string("check_numberics")}});
-    runner.Run(stream);
+    const auto& runner_mean = paddle::operators::NpuOpRunner(
+        "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
+    runner_mean.Run(stream);  // actually launch ReduceMeanD on `stream`
     dev_ctx.Wait();
-  } catch (platform::EnforceNotMet& exception) {
-    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
-    found_inf_data = true;
+    TensorToVector(mean, dev_ctx, &vec);
   } catch (...) {
-    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
-    found_inf_data = true;
+    LOG(WARNING) << "ContainsNan catch exception";
+    return true;
+  }
+  VLOG(4) << "reducemeand result:" << vec[0];
+  if (std::isnan(static_cast<float>(vec[0]))) {
+    LOG(WARNING) << "ContainsNan detects nan";
+    return true;
+  }
+  if (std::isinf(static_cast<float>(vec[0]))) {
+    LOG(WARNING) << "ContainsNan detects inf";
   }
 
-  return found_inf_data;
+  return false;
 }
+
 #endif
 
 template <ReduceType red_type, typename T>
@@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
     framework::Tensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
 
-    bool check_numerics = false;
+    bool found_nan = false;
 
     auto d_type = in->type();
     switch (d_type) {
-      case framework::proto::VarType::FP16:
+      case framework::proto::VarType::FP16: {
+        break;
+      }
       case framework::proto::VarType::FP32: {
         VLOG(4) << "prepare to FoundNanInf";
-        check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
-        VLOG(4) << "check_numerics:" << check_numerics;
+        found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
+        VLOG(4) << "check_numerics:" << found_nan;
         break;
       }
       default:
         break;
     }
 
-    if (check_numerics) {
+    if (found_nan) {
       T inf = static_cast<T>(std::numeric_limits<float>::infinity());
       VLOG(4) << "fill input data constant inf";
       auto dims = in->dims();
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
index f1bf9683e35593720e9db604142312a055356bb0..ecf9f18d40f86d98139b7dd9fca53a2dbfb2c416 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -38,6 +38,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/hccl_helper.h"
 #endif
 
+// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
+// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
+// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
+// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
+
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 namespace m = paddle::operators::math;
@@ -52,10 +57,11 @@ DECLARE_string(selected_npus);
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   std::string debugstring = "";
+  std::cout << preStr << ":" << std::endl << debugstring;
   for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
+    std::cout << ele << " ";
   }
-  VLOG(3) << preStr << ":" << std::endl << debugstring;
+  std::cout << std::endl;
 }
 
 void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
@@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
   ctx.Wait();
 }
 
+template <typename T>
 void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
                          int iter) {
   // init
@@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
   int num1 = 3;
   int num2 = 128;
 
-  std::vector<float> init;
+  std::vector<T> init;
   for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id);
+    init.push_back(static_cast<T>(1.0 + rank_id));
   }
+  init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
   PrintDebugInfo("input data", init);
 
   auto place = ctx.GetPlace();
@@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
   auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
+  tensor_out->mutable_data<T>(place);  // allocate
   ctx.Wait();
 
   // run
   f::AttributeMap attrs;
   attrs["tag"] = std::string("tagx_" + std::to_string(iter));
   attrs["ring_id"] = 0;
+  attrs["use_calc_stream"] = 1;
 
   auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
                                     {{"Out", {"OutData"}}}, attrs);
-
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 1; i++) {
     op->Run(*scope, place);
   }
   ctx.Wait();
 
-  std::vector<float> out_vec;
+  std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
   ctx.Wait();
 
   PrintDebugInfo("output data", out_vec);
 
+  float diff = static_cast<float>(out_vec[0]) - 65504;
+  EXPECT_TRUE(diff < 0.1 && diff > -0.1);
   EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 3.0);
+  for (uint32_t i = 1; i < 10; i++) {
+    EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
   }
 }
 
@@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) {
   // only support one device, if more than one device, use first default
   PrepareUniqueId(&scope, ctx, &hccl_id);
   Prepare(&scope, ctx, &hccl_id);
-  for (int i = 0; i < 1; i++) {
-    VLOG(2) << "iter num: " << i;
-    TestHCCLAllReduceOp(&scope, ctx, i);
-  }
+
+  TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
+  // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
 }
diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..804e8c2a2cbe0c084713d5d1cfce6c909571516e
--- /dev/null
+++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+#include <cmath>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(c_allreduce_sum);
+USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
+DECLARE_string(selected_npus);
+
+// Fills an NPU tensor with `size` copies of `value` and returns the verdict
+// of paddle::operators::ContainsNan for that tensor.
+template <typename T>
+bool Check(T value, int size = 2 * 512 * 8192) {
+  f::Scope scope;
+  auto* tensor_x = scope.Var("in")->GetMutable<f::LoDTensor>();
+  auto& ctx = *dynamic_cast<p::NPUDeviceContext*>(
+      p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
+
+  tensor_x->Resize({size});
+  tensor_x->mutable_data<T>(ctx.GetPlace());  // allocate
+
+  // Host-side staging buffer: `size` copies of `value`.
+  std::vector<T> init;
+  for (int i = 0; i < size; ++i) {
+    init.push_back(value);
+  }
+
+  TensorFromVector(init, ctx, tensor_x);
+  return paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
+}
+
+// ContainsNan must be false for finite and Inf data and true for NaN data.
+TEST(check_numeric, NPU) {
+  auto inf = std::numeric_limits<float>::infinity();
+  auto fp16_inf = static_cast<p::float16>(inf);
+  auto nan = NAN;
+  auto fp16_nan = static_cast<p::float16>(nan);
+
+  // Normal. NOTE(review): 65546 is above the fp16 max (65504) — confirm.
+  VLOG(0) << "start normal";
+  bool result = Check<p::float16>(static_cast<p::float16>(65546));
+  ASSERT_FALSE(result);
+  result = Check<float>(static_cast<float>(1.0));
+  ASSERT_FALSE(result);
+
+  // Inf: ContainsNan only logs Inf, so the result is expected to be false.
+  VLOG(0) << "start inf";
+  result = Check<p::float16>(fp16_inf);
+  ASSERT_FALSE(result);
+  result = Check<float>(inf);
+  ASSERT_FALSE(result);
+
+  // Nan: both element types must be reported.
+  VLOG(0) << "start nan";
+  result = Check<p::float16>(fp16_nan);
+  ASSERT_TRUE(result);
+  result = Check<float>(nan);
+  ASSERT_TRUE(result);
+}