diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 1076e84e613f4ae9577a2ab9200e6821be847f0f..4eff406893757e9af0d2c4c987d989cee6fd2a0a 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -45,6 +45,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/hccl_helper.h"
 #endif
 
+#if defined(PADDLE_WITH_ASCEND_CL)
+DECLARE_bool(hccl_check_nan);
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -140,6 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
   try {
     const auto& runner_mean = paddle::operators::NpuOpRunner(
         "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
+    runner_mean.Run(stream);
     TensorToVector(mean, dev_ctx, &vec);
   } catch (...) {
     LOG(WARNING) << "ContainsNan catch exception";
@@ -233,9 +238,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel {
         break;
       }
       case framework::proto::VarType::FP32: {
-        VLOG(4) << "prepare to FoundNanInf";
-        found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
-        VLOG(4) << "check_numerics:" << found_nan;
+        if (FLAGS_hccl_check_nan) {
+          VLOG(3) << "prepare to FoundNanInf";
+          found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
+          VLOG(3) << "check_numerics:" << found_nan;
+        }
         break;
       }
       default:
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 33d9c6efef852d0298a27eca6dfacdd0df18f159..f18eab3246547b6661a46504b30ea2428ccc8997 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "",
               "This option is useful when doing multi process training and "
               "each process have only one device (NPU). If you want to use "
               "all visible devices, set this to empty string.");
+DEFINE_bool(hccl_check_nan, false,
+            "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
+            "core when meets Nan value");
 DEFINE_string(
     npu_config_path, "",
     "The absolute path of configuration json file, like: /tmp/config.json. "
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 5d1274a1f05324f0d3187a4802c564aad8ed2314..8bb4d82b72478525c22c0be2433a2ebed7f0a974 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -248,6 +248,7 @@ def __bootstrap__():
         'gpu_memory_limit_mb',
         'npu_config_path',
         'get_host_by_name_time',
+        'hccl_check_nan',
     ]
 
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])