未验证 提交 d4948bc1 编写于 作者: R Roc 提交者: GitHub

Add a flag to control whether hccl_allreduce_sum checks its input for NaN values. (#35093) (#35298)

Co-authored-by: Ngongweibao <weibao.gong@gmail.com>
上级 b36fb036
...@@ -45,6 +45,10 @@ limitations under the License. */ ...@@ -45,6 +45,10 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h" #include "paddle/fluid/platform/hccl_helper.h"
#endif #endif
// Make the hccl_check_nan gflag visible in this file as FLAGS_hccl_check_nan.
// The declaration is only compiled for Ascend (PADDLE_WITH_ASCEND_CL) builds;
// the flag itself is defined in another translation unit.
#if defined(PADDLE_WITH_ASCEND_CL)
DECLARE_bool(hccl_check_nan);
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -140,6 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, ...@@ -140,6 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
try { try {
const auto& runner_mean = paddle::operators::NpuOpRunner( const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}}); "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
runner_mean.Run(stream);
TensorToVector(mean, dev_ctx, &vec); TensorToVector(mean, dev_ctx, &vec);
} catch (...) { } catch (...) {
LOG(WARNING) << "ContainsNan catch exception"; LOG(WARNING) << "ContainsNan catch exception";
...@@ -233,9 +238,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> { ...@@ -233,9 +238,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
break; break;
} }
case framework::proto::VarType::FP32: { case framework::proto::VarType::FP32: {
VLOG(4) << "prepare to FoundNanInf"; if (FLAGS_hccl_check_nan) {
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in); VLOG(3) << "prepare to FoundNanInf";
VLOG(4) << "check_numerics:" << found_nan; found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(3) << "check_numerics:" << found_nan;
}
break; break;
} }
default: default:
......
...@@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "", ...@@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "",
"This option is useful when doing multi process training and " "This option is useful when doing multi process training and "
"each process have only one device (NPU). If you want to use " "each process have only one device (NPU). If you want to use "
"all visible devices, set this to empty string."); "all visible devices, set this to empty string.");
// Opt-in switch (default: false) for the NaN check on the input tensor.
// When it is true, the FP32 branch of the Ascend all-reduce kernel calls
// ContainsNan() on the input; when false, that check is skipped.
DEFINE_bool(hccl_check_nan, false,
"Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
"core when meets Nan value");
DEFINE_string( DEFINE_string(
npu_config_path, "", npu_config_path, "",
"The absolute path of configuration json file, like: /tmp/config.json. " "The absolute path of configuration json file, like: /tmp/config.json. "
......
...@@ -248,6 +248,7 @@ def __bootstrap__(): ...@@ -248,6 +248,7 @@ def __bootstrap__():
'gpu_memory_limit_mb', 'gpu_memory_limit_mb',
'npu_config_path', 'npu_config_path',
'get_host_by_name_time', 'get_host_by_name_time',
'hccl_check_nan',
] ]
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册