未验证 提交 5cb20f30 编写于 作者: L Leo Chen 提交者: GitHub

add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)

* add npu sub op

* fix typo

* rename test

* fix bug

* fix bug

* add fp16 kernel

* fix typo

* support sub grad op

* support elementwise_sub_grad op
Co-authored-by: Nfrankwhzhang <frankwhzhang@126.com>
上级 c687edec
...@@ -97,6 +97,36 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -97,6 +97,36 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(zhiqiu): handle different condition like CUDA code below
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, stream);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) { platform::is_cuda_pinned_place(dst_place)) {
...@@ -304,6 +334,32 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -304,6 +334,32 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, nullptr);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) { platform::is_cuda_pinned_place(dst_place)) {
...@@ -433,10 +489,9 @@ class AnyVisitor : public boost::static_visitor<bool> { ...@@ -433,10 +489,9 @@ class AnyVisitor : public boost::static_visitor<bool> {
bool GetResult(const framework::Tensor& out, bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const { const platform::NPUPlace& npu) const {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(
"Not supported on place (%s) ", platform::errors::Unimplemented("Not supported on place (%s) ", npu));
npu)); // return GetResultHelper(out, npu);
//return GetResultHelper(out, npu);
} }
bool GetResult(const framework::Tensor& out, bool GetResult(const framework::Tensor& out,
...@@ -642,7 +697,7 @@ struct BothFalseVisitor : public boost::static_visitor<> { ...@@ -642,7 +697,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
} }
void VisitorImpl(const platform::NPUPlace& npu) const { void VisitorImpl(const platform::NPUPlace& npu) const {
//TODO(zhiqiu) // TODO(zhiqiu)
} }
void VisitorImpl(const platform::CPUPlace& cpu) const { void VisitorImpl(const platform::CPUPlace& cpu) const {
......
...@@ -8,4 +8,4 @@ register_operators(DEPS op_version_registry) ...@@ -8,4 +8,4 @@ register_operators(DEPS op_version_registry)
cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor) cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor)
cc_test(test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) cc_test(test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
cc_test(elementwise_add_op_npu_test SRCS elementwise_add_op_npu_test.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) cc_test(elementwise_op_npu_test SRCS elementwise_op_npu_test.cc DEPS op_registry elementwise_add_op elementwise_sub_op scope device_context enforce executor)
...@@ -29,17 +29,9 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> { ...@@ -29,17 +29,9 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
auto* x = ctx.Input<framework::LoDTensor>("X"); auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y"); auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* out = ctx.Output<framework::LoDTensor>("Out"); auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
// TODO(zhiqiu): get the attr infomation of Ascend op and
// convert paddle AttributeMap to Ascend attrs.
// Ascend op add has no attribute ?
// int axis = ctx.Attr<int>("axis");
// NOTE(zhiqiu): the order of inputs and outputs is important
auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
......
...@@ -34,8 +34,12 @@ namespace m = paddle::operators::math; ...@@ -34,8 +34,12 @@ namespace m = paddle::operators::math;
USE_OP(elementwise_add); USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU); USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
void Compare(f::Scope* scope, const p::DeviceContext& ctx) { template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
std::string op_type) {
// init // init
auto x = scope->Var("X"); auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>(); auto tensor_x = x->GetMutable<f::LoDTensor>();
...@@ -43,14 +47,19 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -43,14 +47,19 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
auto y = scope->Var("Y"); auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<f::LoDTensor>(); auto tensor_y = y->GetMutable<f::LoDTensor>();
std::vector<float> init; std::vector<T> init_x;
for (int64_t i = 0; i < 10 * 10; ++i) { for (int64_t i = 0; i < 10 * 10; ++i) {
init.push_back(1.0); init_x.push_back(static_cast<T>(1.0));
} }
TensorFromVector(init, ctx, tensor_x); std::vector<T> init_y;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_y.push_back(static_cast<T>(2.0));
}
TensorFromVector(init_x, ctx, tensor_x);
tensor_x->Resize({10, 10}); tensor_x->Resize({10, 10});
TensorFromVector(init, ctx, tensor_y); TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10}); tensor_y->Resize({10, 10});
ctx.Wait(); ctx.Wait();
...@@ -58,30 +67,115 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -58,30 +67,115 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto out = scope->Var("Out"); auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>(); auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({10, 10});
tensor_out->mutable_data<float>(place); // allocate
// run // run
f::AttributeMap attrs; f::AttributeMap attrs;
auto op = auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}, {"Y", {"Y"}}},
f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}},
{{"Out", {"Out"}}}, attrs); {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place); op->Run(*scope, place);
std::vector<float> out_vec; std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec); TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait(); ctx.Wait();
float expected;
EXPECT_EQ(out_vec.size(), init.size()); if (op_type == "elementwise_add") {
expected = 3.0;
} else if (op_type == "elementwise_sub") {
expected = -1.0;
}
EXPECT_EQ(out_vec.size(), init_x.size());
for (uint32_t i = 0; i < out_vec.size(); i++) { for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 2.0); EXPECT_EQ(out_vec[i], static_cast<T>(expected));
}
}
template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
std::string op_type) {
// init
auto dout = scope->Var("DOut");
auto tensor_dout = dout->GetMutable<f::LoDTensor>();
tensor_dout->Resize({2, 3, 5});
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
tensor_x->Resize({2, 3, 5});
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<f::LoDTensor>();
tensor_y->Resize({1, 5});
auto dx = scope->Var("DX");
auto tensor_dx = dx->GetMutable<f::LoDTensor>();
auto dy = scope->Var("DY");
auto tensor_dy = dy->GetMutable<f::LoDTensor>();
std::vector<T> init_dout;
for (int64_t i = 0; i < tensor_dout->numel(); ++i) {
init_dout.push_back(static_cast<T>(1.0));
} }
TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5});
ctx.Wait();
// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(op_type,
{{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
{{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);
auto place = ctx.GetPlace();
op->Run(*scope, place);
std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec);
std::vector<T> dy_vec;
TensorToVector(*tensor_dy, ctx, &dy_vec);
ctx.Wait();
float expected_x, expected_y;
if (op_type == "elementwise_add_grad") {
expected_x = 1.0;
expected_y = 6.0;
} else if (op_type == "elementwise_sub_grad") {
expected_x = 1.0;
expected_y = -6.0;
}
for (uint32_t i = 0; i < dx_vec.size(); i++) {
EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
}
for (uint32_t i = 0; i < dy_vec.size(); i++) {
EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
}
}
TEST(elementwise_add, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "elementwise_add");
}
TEST(elementwise_sub, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "elementwise_sub");
}
TEST(elementwise_sub, NPU_fp16) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<p::float16>(&scope, ctx, "elementwise_sub");
} }
TEST(elementwise_add, NPU) { TEST(elementwise_sub_grad, NPU) {
f::Scope scope; f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0)); p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare(&scope, ctx); CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
} }
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
dx->mutable_data<T>(ctx.GetPlace());
dy->mutable_data<T>(ctx.GetPlace());
// NOTE(zhiqiu): It seems Ascend Sub follow the broadcast sematics with
// default axis=-1?
// So, the sub_grad should do reduce if needed.
// For example, the shape of each variable in elementwise_sub:
// x, dx: [2, 3, 5]
// y, dy: [1, 5]
// out, dout: [2, 3, 5]
// Then, out = x - y => dx = dout, dy = -dout
// And, the shape of dy can be computed by two stages reduce,
// 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
// 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
// For dx
// stage 1
auto reduce_ndim = dout->dims().size() - dx->dims().size();
std::vector<int> axes;
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
Tensor* tmp_dout = const_cast<Tensor*>(dout);
Tensor reduced_dout(dx->type());
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
// stage 2
axes.clear();
for (auto i = 0; i < dx->dims().size(); ++i) {
if (dx->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
}
// For dy
// stage 1
reduce_ndim = dout->dims().size() - dy->dims().size();
axes.clear();
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
tmp_dout = const_cast<Tensor*>(dout);
Tensor reduced_dy(dy->type());
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
// stage 2
axes.clear();
Tensor* tmp_dy = tmp_dout;
for (auto i = 0; i < dy->dims().size(); ++i) {
if (dy->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
reduced_dy.Resize(dy->dims());
reduced_dy.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
tmp_dy = &reduced_dy;
}
// stage 3, negative
auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
elementwise_sub,
ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
elementwise_sub_grad,
ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
#endif
...@@ -253,7 +253,7 @@ void NpuOpRunner::Run(aclrtStream stream) { ...@@ -253,7 +253,7 @@ void NpuOpRunner::Run(aclrtStream stream) {
input_buffers_.data(), output_descs_.size(), output_descs_.data(), input_buffers_.data(), output_descs_.size(), output_descs_.data(),
output_buffers_.data(), attr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, output_buffers_.data(), attr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL,
stream); stream);
VLOG(4) << "after aclopCompileAndExecute"; VLOG(4) << "after aclopCompileAndExecute: " << ret;
PADDLE_ENFORCE_NPU_SUCCESS(ret); PADDLE_ENFORCE_NPU_SUCCESS(ret);
} }
} // namespace operators } // namespace operators
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册