未验证 提交 c3ad7775 编写于 作者: Yuang Liu 提交者: GitHub

Revert hccl check nan (#35438)

上级 18934c53
......@@ -144,7 +144,6 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
try {
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
runner_mean.Run(stream);
TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "ContainsNan catch exception";
......@@ -240,8 +239,8 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
case framework::proto::VarType::FP32: {
if (FLAGS_hccl_check_nan) {
VLOG(3) << "prepare to FoundNanInf";
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(3) << "check_numerics:" << found_nan;
// NOTE: performance related, DO NOT REMOVE!
ContainsNan(*dev_ctx, dev_ctx->stream(), in);
}
break;
}
......
......@@ -93,7 +93,7 @@ DEFINE_string(selected_npus, "",
"This option is useful when doing multi process training and "
"each process have only one device (NPU). If you want to use "
"all visible devices, set this to empty string.");
DEFINE_bool(hccl_check_nan, false,
DEFINE_bool(hccl_check_nan, true,
"Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
"core when meets Nan value");
DEFINE_string(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册