Unverified commit c91b1e03, authored by gongweibao, committed by GitHub

[NPU] Use another method to avoid c_allreduce_sum core dump! (#34619)

Parent 436a9f14
@@ -59,6 +59,8 @@ if(WITH_ASCEND_CL)
DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(checknumeric SRCS checknumeric_npu_test.cc
DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
......
@@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
};
#if defined(PADDLE_WITH_ASCEND_CL)
// Returns true if inf or NaN is found; otherwise returns false.
template <typename T>
bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
aclrtStream stream, const paddle::framework::Tensor* in) {
auto& dev_ctx =
exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
// Returns true if NaN is found; otherwise returns false.
inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
aclrtStream stream,
const paddle::framework::Tensor* in) {
using Tensor = paddle::framework::Tensor;
Tensor out(in->type());
out.Resize(in->dims());
out.mutable_data<T>(dev_ctx.GetPlace());
bool found_inf_data = false;
Tensor mean(in->type());
mean.Resize({1});
mean.mutable_data<float>(dev_ctx.GetPlace());
std::vector<int> axes;
for (int i = 0; i < in->dims().size(); ++i) {
axes.push_back(i);
}
std::vector<float> vec;
try {
const auto& runner =
NpuOpRunner("CheckNumerics", {*in}, {out},
{{"message", std::string("check_numberics")}});
runner.Run(stream);
dev_ctx.Wait();
} catch (platform::EnforceNotMet& exception) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
LOG(WARNING) << "ContainsNan catch exception";
return true;
}
VLOG(4) << "reducemeand result:" << vec[0];
if (std::isnan(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects nan";
return true;
}
if (std::isinf(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects inf";
}
return found_inf_data;
return false;
}
#endif
template <ReduceType red_type, typename T>
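Why the new ContainsNan check works: NaN propagates through floating-point arithmetic, so if any element of the input tensor is NaN, the scalar mean produced by ReduceMeanD is NaN as well, and a single one-element readback detects it. A minimal host-side sketch of the same idea (a standalone illustration only, not the NPU kernel; the helper name ContainsNanByMean is hypothetical):

#include <cmath>
#include <vector>

bool ContainsNanByMean(const std::vector<float>& data) {
  double sum = 0.0;
  for (float v : data) sum += v;  // a single NaN poisons the whole sum
  const float mean = static_cast<float>(sum / static_cast<double>(data.size()));
  return std::isnan(mean);  // a NaN mean implies the input held a NaN
}

This replaces the earlier approach of running the CheckNumerics op and treating a thrown exception as the signal, which is what caused the core dump this commit works around.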
@@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
framework::Tensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace());
bool check_numerics = false;
bool found_nan = false;
auto d_type = in->type();
switch (d_type) {
case framework::proto::VarType::FP16:
case framework::proto::VarType::FP16: {
break;
}
case framework::proto::VarType::FP32: {
VLOG(4) << "prepare to FoundNanInf";
check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << check_numerics;
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << found_nan;
break;
}
default:
break;
}
if (check_numerics) {
if (found_nan) {
T inf = static_cast<T>(std::numeric_limits<float>::infinity());
VLOG(4) << "fill input data constant inf";
auto dims = in->dims();
......
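The visible tail of this hunk shows the fallback: once NaN is detected in an FP32 input, the kernel overwrites the tensor with +inf before the allreduce, so HCCL sums a well-defined value instead of crashing on NaN. A sketch of that fallback under the stated assumption (FillTensorWithInf is a hypothetical stand-in for the elided fill code, shown on a host-side buffer):

#include <algorithm>
#include <limits>
#include <vector>

template <typename T>
void FillTensorWithInf(std::vector<T>* buf) {
  // Overwrite every element with +inf so the following c_allreduce_sum
  // produces a well-defined inf instead of crashing on NaN input.
  const T inf = static_cast<T>(std::numeric_limits<float>::infinity());
  std::fill(buf->begin(), buf->end(), inf);
}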
@@ -38,6 +38,11 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h"
#endif
// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
@@ -52,10 +57,11 @@ DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
std::cout << preStr << ":" << std::endl << debugstring;
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
std::cout << ele << " ";
}
VLOG(3) << preStr << ":" << std::endl << debugstring;
std::cout << std::endl;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
@@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
ctx.Wait();
}
template <typename T>
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int iter) {
// init
@@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int num1 = 3;
int num2 = 128;
std::vector<float> init;
std::vector<T> init;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0 + rank_id);
init.push_back(static_cast<T>(1.0 + rank_id));
}
init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
PrintDebugInfo("input data", init);
auto place = ctx.GetPlace();
@@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
tensor_out->mutable_data<T>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx_" + std::to_string(iter));
attrs["ring_id"] = 0;
attrs["use_calc_stream"] = 1;
auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
for (int i = 0; i < 10; i++) {
for (int i = 0; i < 1; i++) {
op->Run(*scope, place);
}
ctx.Wait();
std::vector<float> out_vec;
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
float diff = static_cast<float>(out_vec[0]) - 65504;
EXPECT_TRUE(diff < 0.1 && diff > -0.1);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 3.0);
for (uint32_t i = 1; i < 10; i++) {
EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
}
}
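The expected value 65504 in the assertion above is float16's largest finite number: after the NaN-tainted element is overwritten with inf, the fp16 allreduce result is assumed to saturate at fp16 max rather than remain inf. The constant can be derived from the fp16 format (a small standalone check, not part of the test):

#include <cmath>
#include <cstdio>

int main() {
  // Largest finite float16 value: (2 - 2^-10) * 2^15 = 65504.
  const double fp16_max = (2.0 - std::pow(2.0, -10)) * std::pow(2.0, 15);
  std::printf("float16 max = %.1f\n", fp16_max);  // prints 65504.0
  return 0;
}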
@@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) {
// only support one device, if more than one device, use first default
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
for (int i = 0; i < 1; i++) {
VLOG(2) << "iter num: " << i;
TestHCCLAllReduceOp(&scope, ctx, i);
}
TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
// TestHCCLAllReduceOp<float>(&scope, ctx, 0);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <cmath>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_allreduce_sum);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
DECLARE_string(selected_npus);
template <typename T>
bool Check(T value, int size = 2 * 512 * 8192) {
f::Scope scope;
auto x = scope.Var("in");
auto& ctx = *dynamic_cast<p::NPUDeviceContext*>(
p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
auto place = ctx.GetPlace();
auto tensor_x = x->GetMutable<f::LoDTensor>();
tensor_x->Resize({size});
tensor_x->mutable_data<T>(place); // allocate
std::vector<T> init;
for (int64_t i = 0; i < size; ++i) {
init.push_back(static_cast<T>(value));
}
TensorFromVector(init, ctx, tensor_x);
bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
return result;
}
TEST(check_numeric, NPU) {
auto inf = std::numeric_limits<float>::infinity();
auto fp16_inf = static_cast<p::float16>(inf);
auto nan = NAN;
auto fp16_nan = static_cast<p::float16>(nan);
bool result = false;
// Normal
VLOG(0) << "start normal";
result = Check<p::float16>(static_cast<p::float16>(65546));
ASSERT_FALSE(result);
result = Check<float>(static_cast<float>(1.0));
ASSERT_FALSE(result);
// Inf
VLOG(0) << "start inf";
result = Check<p::float16>(fp16_inf);
ASSERT_FALSE(result);
result = Check<float>(inf);
ASSERT_FALSE(result);
// Nan
VLOG(0) << "start nan";
result = Check<p::float16>(fp16_nan);
ASSERT_TRUE(result);
result = Check<float>(nan);
ASSERT_TRUE(result);
}
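Note that the inf cases assert false by design: the mean of an all-inf buffer is inf, not NaN, and ContainsNan only treats NaN as a hit. Only a buffer mixing +inf and -inf would average to NaN. A host-side illustration of that distinction (standalone, for clarity):

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const float inf = std::numeric_limits<float>::infinity();
  const float mean_all_inf = (inf + inf) / 2.0f;   // still inf: not flagged
  const float mean_mixed = (inf + (-inf)) / 2.0f;  // inf - inf is NaN: flagged
  assert(!std::isnan(mean_all_inf));
  assert(std::isnan(mean_mixed));
  return 0;
}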