未验证 提交 c91b1e03 编写于 作者: G gongweibao 提交者: GitHub

[NPU]Use another method to void c_allreduce_sum core! (#34619)

上级 436a9f14
...@@ -59,6 +59,8 @@ if(WITH_ASCEND_CL) ...@@ -59,6 +59,8 @@ if(WITH_ASCEND_CL)
DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(checknumeric SRCS checknumeric_npu_test.cc
DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
......
...@@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> { ...@@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
}; };
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
// return true if found_inf_or_nan or return false; // return true if found_nan or return false;
template <typename T> inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
bool CheckNumerics(const framework::ExecutionContext& exe_ctx, aclrtStream stream,
aclrtStream stream, const paddle::framework::Tensor* in) { const paddle::framework::Tensor* in) {
auto& dev_ctx =
exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
using Tensor = paddle::framework::Tensor; using Tensor = paddle::framework::Tensor;
Tensor out(in->type()); Tensor out(in->type());
out.Resize(in->dims());
out.mutable_data<T>(dev_ctx.GetPlace());
bool found_inf_data = false; Tensor mean(in->type());
mean.Resize({1});
mean.mutable_data<float>(dev_ctx.GetPlace());
std::vector<int> axes;
for (int i = 0; i < in->dims().size(); ++i) {
axes.push_back(i);
}
std::vector<float> vec;
try { try {
const auto& runner = const auto& runner_mean = paddle::operators::NpuOpRunner(
NpuOpRunner("CheckNumerics", {*in}, {out}, "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
{{"message", std::string("check_numberics")}}); TensorToVector(mean, dev_ctx, &vec);
runner.Run(stream);
dev_ctx.Wait();
} catch (platform::EnforceNotMet& exception) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
} catch (...) { } catch (...) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; LOG(WARNING) << "ContainsNan catch exception";
found_inf_data = true; return true;
}
VLOG(4) << "reducemeand result:" << vec[0];
if (std::isnan(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects nan";
return true;
}
if (std::isinf(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects inf";
} }
return found_inf_data; return false;
} }
#endif #endif
template <ReduceType red_type, typename T> template <ReduceType red_type, typename T>
...@@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> { ...@@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
framework::Tensor tmp; framework::Tensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace()); tmp.mutable_data<float>({8}, ctx.GetPlace());
bool check_numerics = false; bool found_nan = false;
auto d_type = in->type(); auto d_type = in->type();
switch (d_type) { switch (d_type) {
case framework::proto::VarType::FP16: case framework::proto::VarType::FP16: {
break;
}
case framework::proto::VarType::FP32: { case framework::proto::VarType::FP32: {
VLOG(4) << "prepare to FoundNanInf"; VLOG(4) << "prepare to FoundNanInf";
check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in); found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << check_numerics; VLOG(4) << "check_numerics:" << found_nan;
break; break;
} }
default: default:
break; break;
} }
if (check_numerics) { if (found_nan) {
T inf = static_cast<T>(std::numeric_limits<float>::infinity()); T inf = static_cast<T>(std::numeric_limits<float>::infinity());
VLOG(4) << "fill input data constant inf"; VLOG(4) << "fill input data constant inf";
auto dims = in->dims(); auto dims = in->dims();
......
...@@ -38,6 +38,11 @@ limitations under the License. */ ...@@ -38,6 +38,11 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h" #include "paddle/fluid/platform/hccl_helper.h"
#endif #endif
// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
namespace m = paddle::operators::math; namespace m = paddle::operators::math;
...@@ -52,10 +57,11 @@ DECLARE_string(selected_npus); ...@@ -52,10 +57,11 @@ DECLARE_string(selected_npus);
template <typename T> template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) { void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = ""; std::string debugstring = "";
std::cout << preStr << ":" << std::endl << debugstring;
for (auto ele : data) { for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(","); std::cout << ele << " ";
} }
VLOG(3) << preStr << ":" << std::endl << debugstring; std::cout << std::endl;
} }
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
...@@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
ctx.Wait(); ctx.Wait();
} }
template <typename T>
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int iter) { int iter) {
// init // init
...@@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int num1 = 3; int num1 = 3;
int num2 = 128; int num2 = 128;
std::vector<float> init; std::vector<T> init;
for (int64_t i = 0; i < num1 * num2; ++i) { for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0 + rank_id); init.push_back(static_cast<T>(1.0 + rank_id));
} }
init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
PrintDebugInfo("input data", init); PrintDebugInfo("input data", init);
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
...@@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
auto out = scope->Var("OutData"); auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>(); auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2}); tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate tensor_out->mutable_data<T>(place); // allocate
ctx.Wait(); ctx.Wait();
// run // run
f::AttributeMap attrs; f::AttributeMap attrs;
attrs["tag"] = std::string("tagx_" + std::to_string(iter)); attrs["tag"] = std::string("tagx_" + std::to_string(iter));
attrs["ring_id"] = 0; attrs["ring_id"] = 0;
attrs["use_calc_stream"] = 1;
auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs); {{"Out", {"OutData"}}}, attrs);
for (int i = 0; i < 1; i++) {
for (int i = 0; i < 10; i++) {
op->Run(*scope, place); op->Run(*scope, place);
} }
ctx.Wait(); ctx.Wait();
std::vector<float> out_vec; std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec); TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait(); ctx.Wait();
PrintDebugInfo("output data", out_vec); PrintDebugInfo("output data", out_vec);
float diff = static_cast<float>(out_vec[0]) - 65504;
EXPECT_TRUE(diff < 0.1 && diff > -0.1);
EXPECT_EQ(out_vec.size(), init.size()); EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) { for (uint32_t i = 1; i < 10; i++) {
EXPECT_EQ(out_vec[i], 3.0); EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
} }
} }
...@@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) { ...@@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) {
// only support one device, if more than one device, use first default // only support one device, if more than one device, use first default
PrepareUniqueId(&scope, ctx, &hccl_id); PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id); Prepare(&scope, ctx, &hccl_id);
for (int i = 0; i < 1; i++) {
VLOG(2) << "iter num: " << i; TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
TestHCCLAllReduceOp(&scope, ctx, i); // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
}
} }
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <cmath>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_allreduce_sum);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
DECLARE_string(selected_npus);
// Builds an NPU tensor of `size` elements, all equal to `value`, and
// returns whether paddle::operators::ContainsNan flags it.
// Used to exercise the NaN-detection path of c_allreduce_sum on Ascend.
template <typename T>
bool Check(T value, int size = 2 * 512 * 8192) {
  f::Scope scope;
  auto* var = scope.Var("in");
  auto& dev_ctx = *dynamic_cast<p::NPUDeviceContext*>(
      p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
  const auto place = dev_ctx.GetPlace();

  auto* input = var->GetMutable<f::LoDTensor>();
  input->Resize({size});
  input->mutable_data<T>(place);  // reserve device memory before the copy

  // Host-side buffer holding `size` copies of the probe value.
  std::vector<T> host_data(size, static_cast<T>(value));

  TensorFromVector(host_data, dev_ctx, input);
  return paddle::operators::ContainsNan(dev_ctx, dev_ctx.stream(), input);
}
// Drives ContainsNan against normal, Inf and NaN inputs for both float16
// and float tensors. Only NaN is expected to be reported: ContainsNan logs
// Inf but deliberately returns false for it.
TEST(check_numeric, NPU) {
  auto inf = std::numeric_limits<float>::infinity();
  auto fp16_inf = static_cast<p::float16>(inf);
  auto nan = NAN;
  auto fp16_nan = static_cast<p::float16>(nan);

  bool result = false;
  // Normal values: the detector must stay quiet for both dtypes.
  VLOG(0) << "start normal";
  result = Check<p::float16>(static_cast<p::float16>(65546));
  ASSERT_FALSE(result);
  // Fix: the original discarded this return value, so the assertion below
  // re-checked the previous (float16) result and the float path was
  // never actually verified.
  result = Check<float>(static_cast<float>(1.0));
  ASSERT_FALSE(result);

  // Inf: ContainsNan only reports NaN, so Inf must not trigger it.
  VLOG(0) << "start inf";
  result = Check<p::float16>(fp16_inf);
  ASSERT_FALSE(result);
  result = Check<float>(inf);
  ASSERT_FALSE(result);

  // NaN: must be detected for both dtypes.
  VLOG(0) << "start nan";
  result = Check<p::float16>(fp16_nan);
  ASSERT_TRUE(result);
  result = Check<float>(nan);
  ASSERT_TRUE(result);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册