From 8497e2aad3dd089ea2747f2e2b159e7eb5846f71 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 2 Mar 2021 15:04:39 +0800
Subject: [PATCH] [NPU] add npu kernel for elementwise_add_grad (#31347)

* fix reading flags from env

* fix problem caused by async run

* support partial grad

* support elementwise_add_grad npu kernel

* add unittest

* fix bug?
---
 .../elementwise/elementwise_add_op_npu.cc     | 128 ++++++++++++-
 .../elementwise/elementwise_op_npu_test.cc    |  86 +++++----
 .../elementwise/elementwise_sub_op_npu.cc     | 180 +++++++++---------
 3 files changed, 258 insertions(+), 136 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
index 1e7e5e02c01..5b8d08a8943 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
@@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_ASCEND_CL
 #include <memory>
 #include <string>
 
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -39,12 +40,127 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // NOTE(zhiqiu): It seems Ascend Add follows the broadcast semantics with
+    // default axis=-1, so add_grad should reduce dout if needed.
+    // For example, the shape of each variable in elementwise_add:
+    // x, dx: [2, 3, 5]
+    // y, dy: [1, 5]
+    // out, dout: [2, 3, 5]
+    // Then, out = x + y => dx = dout, dy = dout
+    // And the shape of dy can be computed by a two-stage reduce:
+    // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
+    // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
+
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      // For dx
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dx->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
+        axes.push_back(i);
+      }
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dout(dx->type());
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+      }
+
+      // stage 2
+      axes.clear();
+      for (auto i = 0; i < dx->dims().size(); ++i) {
+        if (dx->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+      } else {
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
+      }
+    }
+
+    if (dy) {
+      // For dy
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dy->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
+        axes.push_back(i);
+      }
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dout(dout->type());
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+      }
+
+      // stage 2
+      axes.clear();
+      for (auto i = 0; i < dy->dims().size(); ++i) {
+        if (dy->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        dy->mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+      } else {
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy);
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel<float>,
+                       ops::ElementwiseAddNPUKernel<plat::float16>);
 
-REGISTER_OP_NPU_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddNPUKernel<paddle::platform::NPUDeviceContext, float>);
-#endif
+REGISTER_OP_NPU_KERNEL(elementwise_add_grad,
+                       ops::ElementwiseAddGradNPUKernel<float>,
+                       ops::ElementwiseAddGradNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
index 0cb8fd1c578..df6fae6c848 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
@@ -74,6 +74,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
                                     {{"Out", {"Out"}}}, attrs);
 
   op->Run(*scope, place);
+  ctx.Wait();
 
   std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
@@ -125,57 +126,64 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs;
-  auto op = f::OpRegistry::CreateOp(op_type,
-                                    {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
-                                    {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);
{"Y@GRAD", {"DY"}}}, attrs); + auto op = f::OpRegistry::CreateOp( + op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}}, + {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs); auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); - - std::vector dy_vec; - TensorToVector(*tensor_dy, ctx, &dy_vec); - - ctx.Wait(); - float expected_x, expected_y; - if (op_type == "elementwise_add_grad") { - expected_x = 1.0; - expected_y = 6.0; - } else if (op_type == "elementwise_sub_grad") { - expected_x = 1.0; - expected_y = -6.0; - } - - for (uint32_t i = 0; i < dx_vec.size(); i++) { - EXPECT_EQ(dx_vec[i], static_cast(expected_x)); - } - for (uint32_t i = 0; i < dy_vec.size(); i++) { - EXPECT_EQ(dy_vec[i], static_cast(expected_y)); - } + op->Run(*scope, place); + ctx.Wait(); + + std::vector dx_vec; + TensorToVector(*tensor_dx, ctx, &dx_vec); + + std::vector dy_vec; + TensorToVector(*tensor_dy, ctx, &dy_vec); + + ctx.Wait(); + float expected_x, expected_y; + if (op_type == "elementwise_add_grad") { + expected_x = 1.0; + expected_y = 6.0; + } else if (op_type == "elementwise_sub_grad") { + expected_x = 1.0; + expected_y = -6.0; + } + + for (uint32_t i = 0; i < dx_vec.size(); i++) { + EXPECT_EQ(dx_vec[i], static_cast(expected_x)); + } + for (uint32_t i = 0; i < dy_vec.size(); i++) { + EXPECT_EQ(dy_vec[i], static_cast(expected_y)); + } } TEST(elementwise_add, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_add"); + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_add"); } TEST(elementwise_sub, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_sub"); } TEST(elementwise_sub, NPU_fp16) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_sub"); } TEST(elementwise_sub_grad, NPU) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "elementwise_sub_grad"); + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx, "elementwise_sub_grad"); +} + +TEST(elementwise_add_grad, NPU) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx, "elementwise_add_grad"); } diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index c3cf76451f6..809445c2862 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
 
-#ifdef PADDLE_WITH_ASCEND_CL
 #include <memory>
 #include <string>
 
@@ -24,7 +23,7 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -43,7 +42,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,8 +50,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
 
-    dx->mutable_data<T>(ctx.GetPlace());
-    dy->mutable_data<T>(ctx.GetPlace());
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
 
     // NOTE(zhiqiu): It seems Ascend Sub follow the broadcast sematics with
     // default axis=-1?
@@ -66,89 +66,92 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
     // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
     // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
 
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    // For dx
-    // stage 1
-    auto reduce_ndim = dout->dims().size() - dx->dims().size();
-    std::vector<int> axes;
-    for (auto i = 0; i < reduce_ndim; ++i) {
-      axes.push_back(i);
-    }
-    Tensor* tmp_dout = const_cast<Tensor*>(dout);
-    Tensor reduced_dout(dx->type());
-    if (axes.size() != 0) {
-      std::vector<int64_t> reduced_dout_dims;
-      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
-        reduced_dout_dims.push_back(dout->dims()[i]);
-      }
-      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
-      reduced_dout.mutable_data<T>(ctx.GetPlace());
-      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
-                                {{"axes", axes}, {"keep_dims", false}});
-      runner.Run(stream);
-      tmp_dout = &reduced_dout;
-    }
-
-    // stage 2
-    axes.clear();
-    for (auto i = 0; i < dx->dims().size(); ++i) {
-      if (dx->dims()[i] == 1) {
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      // For dx
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dx->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
         axes.push_back(i);
       }
-    }
-    if (axes.size() != 0) {
-      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
-                                {{"axes", axes}, {"keep_dims", true}});
-      runner.Run(stream);
-    } else {
-      framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
-    }
-
-    // For dy
-    // stage 1
-    reduce_ndim = dout->dims().size() - dy->dims().size();
-    axes.clear();
-    for (auto i = 0; i < reduce_ndim; ++i) {
-      axes.push_back(i);
-    }
-    tmp_dout = const_cast<Tensor*>(dout);
-    Tensor reduced_dy(dy->type());
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dout(dx->type());
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+      }
 
-    if (axes.size() != 0) {
-      std::vector<int64_t> reduced_dout_dims;
-      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
-        reduced_dout_dims.push_back(dout->dims()[i]);
+      // stage 2
+      axes.clear();
+      for (auto i = 0; i < dx->dims().size(); ++i) {
+        if (dx->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+      } else {
+        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
       }
-      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
-      reduced_dout.mutable_data<T>(ctx.GetPlace());
-      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
-                                {{"axes", axes}, {"keep_dims", false}});
-      runner.Run(stream);
-      tmp_dout = &reduced_dout;
     }
-
-    // stage 2
-    axes.clear();
-    Tensor* tmp_dy = tmp_dout;
-    for (auto i = 0; i < dy->dims().size(); ++i) {
-      if (dy->dims()[i] == 1) {
+    if (dy) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      // For dy
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dy->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
       }
-    }
-    if (axes.size() != 0) {
-      reduced_dy.Resize(dy->dims());
-      reduced_dy.mutable_data<T>(ctx.GetPlace());
-      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
-                                {{"axes", axes}, {"keep_dims", true}});
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dy(dy->type());
+      Tensor reduced_dout(dy->type());
+
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+      }
+
+      // stage 2
+      axes.clear();
+      Tensor* tmp_dy = tmp_dout;
+      for (auto i = 0; i < dy->dims().size(); ++i) {
+        if (dy->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        reduced_dy.Resize(dy->dims());
+        reduced_dy.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+        tmp_dy = &reduced_dy;
+      }
+
+      // stage 3, negative
+      auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
       runner.Run(stream);
-      tmp_dy = &reduced_dy;
     }
-
-    // stage 3, negative
-    auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
-    runner.Run(stream);
   }
 };
 
@@ -156,16 +159,11 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel<float>,
+                       ops::ElementwiseSubNPUKernel<plat::float16>);
 
-REGISTER_OP_NPU_KERNEL(
-    elementwise_sub,
-    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext, paddle::platform::float16>);
-
-REGISTER_OP_NPU_KERNEL(
-    elementwise_sub_grad,
-    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext, paddle::platform::float16>);
-#endif
+REGISTER_OP_NPU_KERNEL(elementwise_sub_grad,
+                       ops::ElementwiseSubGradNPUKernel<float>,
+                       ops::ElementwiseSubGradNPUKernel<plat::float16>);
-- 
GitLab
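
Reviewer note: both grad kernels choose their ReduceSumD axes the same way, so here is a minimal, self-contained sketch of that axis selection. It is plain C++ with no Ascend or Paddle dependencies; the names ReducePlan and MakeReducePlan are hypothetical, introduced only for illustration, and are not part of this patch.

// Sketch: given dout's shape and the (possibly broadcast) input's shape,
// compute the two ReduceSumD stages described in the kernel comments.
#include <cstdint>
#include <iostream>
#include <vector>

struct ReducePlan {
  std::vector<int> stage1_axes;  // leading axes, reduced with keep_dims=false
  std::vector<int> stage2_axes;  // size-1 axes of the input, keep_dims=true
};

ReducePlan MakeReducePlan(const std::vector<int64_t>& dout_dims,
                          const std::vector<int64_t>& din_dims) {
  ReducePlan plan;
  // Stage 1: dout has more leading dims than the input; sum them away.
  int reduce_ndim = static_cast<int>(dout_dims.size() - din_dims.size());
  for (int i = 0; i < reduce_ndim; ++i) plan.stage1_axes.push_back(i);
  // Stage 2: dims that are 1 in the input but were broadcast in dout.
  for (size_t i = 0; i < din_dims.size(); ++i) {
    if (din_dims[i] == 1) plan.stage2_axes.push_back(static_cast<int>(i));
  }
  return plan;
}

int main() {
  // Example from the kernel comment: dout [2, 3, 5], dy [1, 5].
  ReducePlan plan = MakeReducePlan({2, 3, 5}, {1, 5});
  std::cout << "stage 1 axes:";
  for (int a : plan.stage1_axes) std::cout << ' ' << a;  // 0 -> [3, 5]
  std::cout << "\nstage 2 axes:";
  for (int a : plan.stage2_axes) std::cout << ' ' << a;  // 0 -> [1, 5]
  std::cout << '\n';
  return 0;
}

When both axis lists come back empty (the shapes already match), the kernels skip ReduceSumD and fall back to TensorCopySync, which is the else branch in the patch above.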