From c91b1e039f29a62fb3050f979afecd71eabd734f Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 6 Aug 2021 11:31:57 +0800 Subject: [PATCH] [NPU]Use another method to void c_allreduce_sum core! (#34619) --- .../fluid/operators/collective/CMakeLists.txt | 2 + .../operators/collective/c_allreduce_op.h | 61 +++++++----- .../collective/c_allreduce_sum_op_npu_test.cc | 37 ++++--- .../collective/checknumeric_npu_test.cc | 99 +++++++++++++++++++ 4 files changed, 160 insertions(+), 39 deletions(-) create mode 100644 paddle/fluid/operators/collective/checknumeric_npu_test.cc diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 3f210219608..bd88c8f9cd2 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -59,6 +59,8 @@ if(WITH_ASCEND_CL) DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(checknumeric SRCS checknumeric_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 3c51c65b073..1076e84e613 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { }; #if defined(PADDLE_WITH_ASCEND_CL) -// return true if found_inf_or_nan or return false; -template -bool CheckNumerics(const framework::ExecutionContext& exe_ctx, - aclrtStream stream, const paddle::framework::Tensor* in) { - auto& dev_ctx = - exe_ctx.template device_context(); +// return true if found_nan or return false; +inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, + aclrtStream stream, + const paddle::framework::Tensor* in) { using Tensor = paddle::framework::Tensor; Tensor out(in->type()); - out.Resize(in->dims()); - out.mutable_data(dev_ctx.GetPlace()); - bool found_inf_data = false; + Tensor mean(in->type()); + mean.Resize({1}); + mean.mutable_data(dev_ctx.GetPlace()); + std::vector axes; + for (int i = 0; i < in->dims().size(); ++i) { + axes.push_back(i); + } + std::vector vec; try { - const auto& runner = - NpuOpRunner("CheckNumerics", {*in}, {out}, - {{"message", std::string("check_numberics")}}); - runner.Run(stream); - dev_ctx.Wait(); - } catch (platform::EnforceNotMet& exception) { - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; + const auto& runner_mean = paddle::operators::NpuOpRunner( + "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}}); + TensorToVector(mean, dev_ctx, &vec); } catch (...) { - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; + LOG(WARNING) << "ContainsNan catch exception"; + return true; + } + + VLOG(4) << "reducemeand result:" << vec[0]; + if (std::isnan(static_cast(vec[0]))) { + LOG(WARNING) << "ContainsNan detects nan"; + return true; + } + + if (std::isinf(static_cast(vec[0]))) { + LOG(WARNING) << "ContainsNan detects inf"; } - return found_inf_data; + return false; } + #endif template @@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { framework::Tensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); - bool check_numerics = false; + bool found_nan = false; auto d_type = in->type(); switch (d_type) { - case framework::proto::VarType::FP16: + case framework::proto::VarType::FP16: { + break; + } case framework::proto::VarType::FP32: { VLOG(4) << "prepare to FoundNanInf"; - check_numerics = CheckNumerics(ctx, dev_ctx->stream(), in); - VLOG(4) << "check_numerics:" << check_numerics; + found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in); + VLOG(4) << "check_numerics:" << found_nan; break; } default: break; } - if (check_numerics) { + if (found_nan) { T inf = static_cast(std::numeric_limits::infinity()); VLOG(4) << "fill input data constant inf"; auto dims = in->dims(); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index f1bf9683e35..ecf9f18d40f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -38,6 +38,11 @@ limitations under the License. */ #include "paddle/fluid/platform/hccl_helper.h" #endif +// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1 +// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test +// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0 +// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test + namespace f = paddle::framework; namespace p = paddle::platform; namespace m = paddle::operators::math; @@ -52,10 +57,11 @@ DECLARE_string(selected_npus); template void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; + std::cout << preStr << ":" << std::endl << debugstring; for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); + std::cout << ele << " "; } - VLOG(3) << preStr << ":" << std::endl << debugstring; + std::cout << std::endl; } void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, @@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx, ctx.Wait(); } +template void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { // init @@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int num1 = 3; int num2 = 128; - std::vector init; + std::vector init; for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id); + init.push_back(static_cast(1.0 + rank_id)); } + init[0] = static_cast(std::numeric_limits::quiet_NaN()); PrintDebugInfo("input data", init); auto place = ctx.GetPlace(); @@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate + tensor_out->mutable_data(place); // allocate ctx.Wait(); // run f::AttributeMap attrs; attrs["tag"] = std::string("tagx_" + std::to_string(iter)); attrs["ring_id"] = 0; + attrs["use_calc_stream"] = 1; auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 1; i++) { op->Run(*scope, place); } ctx.Wait(); - std::vector out_vec; + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); + float diff = static_cast(out_vec[0]) - 65504; + EXPECT_TRUE(diff < 0.1 && diff > -0.1); EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 3.0); + for (uint32_t i = 1; i < 10; i++) { + EXPECT_EQ(out_vec[i], static_cast(3.0)); } } @@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) { // only support one device, if more than one device, use first default PrepareUniqueId(&scope, ctx, &hccl_id); Prepare(&scope, ctx, &hccl_id); - for (int i = 0; i < 1; i++) { - VLOG(2) << "iter num: " << i; - TestHCCLAllReduceOp(&scope, ctx, i); - } + + TestHCCLAllReduceOp(&scope, ctx, 1); + // TestHCCLAllReduceOp(&scope, ctx, 0); } diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc new file mode 100644 index 00000000000..804e8c2a2cb --- /dev/null +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); +DECLARE_string(selected_npus); + +template +bool Check(T value, int size = 2 * 512 * 8192) { + f::Scope scope; + auto x = scope.Var("in"); + auto& ctx = *dynamic_cast( + p::DeviceContextPool::Instance().Get(p::NPUPlace(0))); + auto place = ctx.GetPlace(); + + auto tensor_x = x->GetMutable(); + tensor_x->Resize({size}); + tensor_x->mutable_data(place); // allocate + + std::vector init; + for (int64_t i = 0; i < size; ++i) { + init.push_back(static_cast(value)); + } + + TensorFromVector(init, ctx, tensor_x); + bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x); + return result; +} + +TEST(check_numeric, NPU) { + auto inf = std::numeric_limits::infinity(); + auto fp16_inf = static_cast(inf); + auto nan = NAN; + auto fp16_nan = static_cast(nan); + + bool result = false; + // Normal + VLOG(0) << "start normal"; + result = Check(static_cast(65546)); + ASSERT_FALSE(result); + Check(static_cast(1.0)); + ASSERT_FALSE(result); + + // Inf + VLOG(0) << "start inf"; + result = Check(fp16_inf); + ASSERT_FALSE(result); + result = Check(inf); + ASSERT_FALSE(result); + + // Nan + VLOG(0) << "start nan"; + result = Check(fp16_nan); + ASSERT_TRUE(result); + result = Check(nan); + ASSERT_TRUE(result); +} -- GitLab