diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 4eff406893757e9af0d2c4c987d989cee6fd2a0a..a19d603ada8257995592969769bfd6db7a50d5c0 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -144,7 +144,6 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
   try {
     const auto& runner_mean = paddle::operators::NpuOpRunner(
         "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
-    runner_mean.Run(stream);
     TensorToVector(mean, dev_ctx, &vec);
   } catch (...) {
     LOG(WARNING) << "ContainsNan catch exception";
@@ -240,8 +239,8 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
       case framework::proto::VarType::FP32: {
         if (FLAGS_hccl_check_nan) {
           VLOG(3) << "prepare to FoundNanInf";
-          found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
-          VLOG(3) << "check_numerics:" << found_nan;
+          // NOTE: performance relating, DO NOT REMOVE!
+          ContainsNan(*dev_ctx, dev_ctx->stream(), in);
         }
         break;
       }
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 0274a2cea8ef4eeb086c3bf3ca7be860c7b51ddc..135cf4e3997cf44c7d32dafbbe1ccde9425719e5 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -93,7 +93,7 @@ DEFINE_string(selected_npus, "",
               "This option is useful when doing multi process training and "
               "each process have only one device (NPU). If you want to use "
               "all visible devices, set this to empty string.");
-DEFINE_bool(hccl_check_nan, false,
+DEFINE_bool(hccl_check_nan, true,
             "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
             "core when meets Nan value");
 DEFINE_string(