未验证 提交 c3ad7775 编写于 作者: Yuang Liu 提交者: GitHub

Revert hccl check nan (#35438)

上级 18934c53
...@@ -144,7 +144,6 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, ...@@ -144,7 +144,6 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
try { try {
const auto& runner_mean = paddle::operators::NpuOpRunner( const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}}); "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
runner_mean.Run(stream);
TensorToVector(mean, dev_ctx, &vec); TensorToVector(mean, dev_ctx, &vec);
} catch (...) { } catch (...) {
LOG(WARNING) << "ContainsNan catch exception"; LOG(WARNING) << "ContainsNan catch exception";
...@@ -240,8 +239,8 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> { ...@@ -240,8 +239,8 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
case framework::proto::VarType::FP32: { case framework::proto::VarType::FP32: {
if (FLAGS_hccl_check_nan) { if (FLAGS_hccl_check_nan) {
VLOG(3) << "prepare to FoundNanInf"; VLOG(3) << "prepare to FoundNanInf";
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in); // NOTE: performance relating, DO NOT REMOVE!
VLOG(3) << "check_numerics:" << found_nan; ContainsNan(*dev_ctx, dev_ctx->stream(), in);
} }
break; break;
} }
......
...@@ -93,7 +93,7 @@ DEFINE_string(selected_npus, "", ...@@ -93,7 +93,7 @@ DEFINE_string(selected_npus, "",
"This option is useful when doing multi process training and " "This option is useful when doing multi process training and "
"each process have only one device (NPU). If you want to use " "each process have only one device (NPU). If you want to use "
"all visible devices, set this to empty string."); "all visible devices, set this to empty string.");
DEFINE_bool(hccl_check_nan, false, DEFINE_bool(hccl_check_nan, true,
"Check Nan in tensor before hccl_allreduce_sum otherwise it'll " "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
"core when meets Nan value"); "core when meets Nan value");
DEFINE_string( DEFINE_string(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册