From 5b7378343cb549ff8d8873f69ce1849912d0b719 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 24 Aug 2021 20:44:16 +0800 Subject: [PATCH] Add flags to control whether to check Nan value of hccl_allreduce_sum. (#35093) --- paddle/fluid/operators/collective/c_allreduce_op.h | 13 ++++++++++--- paddle/fluid/platform/flags.cc | 3 +++ python/paddle/fluid/__init__.py | 1 + 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 1076e84e61..4eff406893 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -45,6 +45,10 @@ limitations under the License. */ #include "paddle/fluid/platform/hccl_helper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +DECLARE_bool(hccl_check_nan); +#endif + namespace paddle { namespace operators { @@ -140,6 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, try { const auto& runner_mean = paddle::operators::NpuOpRunner( "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}}); + runner_mean.Run(stream); TensorToVector(mean, dev_ctx, &vec); } catch (...) { LOG(WARNING) << "ContainsNan catch exception"; @@ -233,9 +238,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { break; } case framework::proto::VarType::FP32: { - VLOG(4) << "prepare to FoundNanInf"; - found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in); - VLOG(4) << "check_numerics:" << found_nan; + if (FLAGS_hccl_check_nan) { + VLOG(3) << "prepare to FoundNanInf"; + found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in); + VLOG(3) << "check_numerics:" << found_nan; + } break; } default: diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 33d9c6efef..f18eab3246 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "", "This option is useful when doing multi process training and " "each process have only one device (NPU). If you want to use " "all visible devices, set this to empty string."); +DEFINE_bool(hccl_check_nan, false, + "Check Nan in tensor before hccl_allreduce_sum otherwise it'll " + "core when meets Nan value"); DEFINE_string( npu_config_path, "", "The absolute path of configuration json file, like: /tmp/config.json. " diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 5d1274a1f0..8bb4d82b72 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -248,6 +248,7 @@ def __bootstrap__(): 'gpu_memory_limit_mb', 'npu_config_path', 'get_host_by_name_time', + 'hccl_check_nan', ] core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) -- GitLab