From b3446670c1619153dc7c5ae5db028c2ef07898dd Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 23 Mar 2021 19:21:30 +0800
Subject: [PATCH] [NPU] add npu kernel for concat op (#31695)

* add npu kernel for concat op

* add npu kernel for concat op

* refine code

* update

* refine concat_grad
---
 paddle/fluid/operators/concat_op_npu.cc        | 125 ++++++++++++++++++
 paddle/fluid/operators/npu_op_runner.cc        |   3 +
 .../tests/unittests/npu/test_concat_op_npu.py  | 115 ++++++++++++++++
 3 files changed, 243 insertions(+)
 create mode 100644 paddle/fluid/operators/concat_op_npu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py

diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc
new file mode 100644
index 00000000000..04aa10d712e
--- /dev/null
+++ b/paddle/fluid/operators/concat_op_npu.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/concat_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class ConcatNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
+    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
+    PADDLE_ENFORCE_NOT_NULL(ins[0],
+                            platform::errors::NotFound(
+                                "The first input tensor is not initialized."));
+    auto axis = ctx.Attr<int>("axis");
+
+    if (ctx.HasInput("AxisTensor")) {
+      PADDLE_THROW(platform::errors::NotFound(
+          "The AxisTensor is not supported on NPU now."));
+    }
+    axis = ComputeAxis(static_cast<int64_t>(axis),
+                       static_cast<int64_t>(ins[0]->dims().size()));
+
+    auto place = ctx.GetPlace();
+    out->mutable_data<T>(place);
+
+    std::vector<framework::Tensor> inputs;
+    std::vector<std::string> names;
+    for (size_t i = 0; i < ins.size(); ++i) {
+      if (ins[i] && ins[i]->numel() > 0) {
+        inputs.push_back(*ins[i]);
+        names.push_back("x" + std::to_string(i));
+      } else {
+        continue;
+      }
+    }
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    auto runner = NpuOpRunner(
+        "ConcatD", {inputs}, {*out},
+        {{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}});
+    runner.AddInputNames(names);
+    runner.Run(stream);
+  }
+};
+
+template <typename T>
+class ConcatGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
+    auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
+    auto outs =
+        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
+
+    {
+      auto dx = outs;
+      auto x = ins;
+      for (size_t i = 0; i < dx.size(); ++i) {
+        if (dx[i] != nullptr) {
+          dx[i]->set_lod(x[i]->lod());
+        }
+      }
+    }
+    PADDLE_ENFORCE_NOT_NULL(ins[0],
+                            platform::errors::NotFound(
+                                "The first input tensor is not initialized."));
+
+    auto axis = ctx.Attr<int>("axis");
+
+    axis = ComputeAxis(static_cast<int64_t>(axis),
+                       static_cast<int64_t>(ins[0]->dims().size()));
+    // get the output tensors whose names are not kEmptyVarName
+    std::vector<framework::Tensor> outputs;
+    std::vector<int> sizes;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      if (out_var_names[j] != framework::kEmptyVarName &&
+          outs[j]->numel() != 0UL) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        outputs.push_back(*outs[j]);
+        sizes.push_back(outs[j]->dims()[axis]);
+      }
+    }
+    auto runner =
+        NpuOpRunner("SplitVD", {*out_grad}, outputs,
+                    {{"split_dim", axis},
+                     {"size_splits", sizes},
+                     {"num_split", static_cast<int>(outputs.size())}});
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel<float>,
+                       ops::ConcatNPUKernel<paddle::platform::float16>,
+                       ops::ConcatNPUKernel<int>);
+
+REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel<float>,
+                       ops::ConcatGradNPUKernel<paddle::platform::float16>,
+                       ops::ConcatGradNPUKernel<int>);
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index f7c3597da4a..dc503a0a96e 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -256,6 +256,9 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
   auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format);
   PADDLE_ENFORCE_NOT_NULL(
       desc, platform::errors::External("Call aclCreateTensorDesc failed."));
+  PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format));
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclSetTensorStorageShape(desc, dims.size(), dims.data()));
   return desc;
 }
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py
new file mode 100644
index 00000000000..6201df135b0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestConcat(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "concat"
+        self.place = paddle.NPUPlace(4)
+        self.init_dtype()
+        self.init_test_data()
+
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {'axis': self.axis}
+        if self.axis < 0:
+            self.actual_axis = self.axis + len(self.x0.shape)
+            self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0
+        else:
+            self.actual_axis = self.axis
+
+        self.outputs = {
+            'Out': np.concatenate(
+                (self.x0, self.x1, self.x2), axis=self.actual_axis)
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+    def init_test_data(self):
+        self.x0 = np.random.random((1, 4, 50)).astype(self.dtype)
+        self.x1 = np.random.random((2, 4, 50)).astype(self.dtype)
+        self.x2 = np.random.random((3, 4, 50)).astype(self.dtype)
+        self.axis = 0
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
+            self.place, ['x0'], 'Out', check_dygraph=False)
+        self.check_grad_with_place(
+            self.place, ['x1'], 'Out', check_dygraph=False)
+        self.check_grad_with_place(
+            self.place, ['x2'], 'Out', check_dygraph=False)
+
+
+class TestConcatFP16(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "concat"
+        self.place = paddle.NPUPlace(4)
+        self.init_dtype()
+        self.init_test_data()
+
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {'axis': self.axis}
+        if self.axis < 0:
+            self.actual_axis = self.axis + len(self.x0.shape)
+            self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0
+        else:
+            self.actual_axis = self.axis
+
+        self.outputs = {
+            'Out': np.concatenate(
+                (self.x0, self.x1, self.x2), axis=self.actual_axis)
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.__class__.no_need_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+    def init_test_data(self):
+        self.x0 = np.random.random((1, 4, 50)).astype(self.dtype)
+        self.x1 = np.random.random((2, 4, 50)).astype(self.dtype)
+        self.x2 = np.random.random((3, 4, 50)).astype(self.dtype)
+        self.axis = 0
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab
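
For reviewers, a minimal end-to-end sketch of how the new kernel is exercised through the public static-graph API. This snippet is not part of the patch: it assumes a PaddlePaddle build with NPU support (paddle.is_compiled_with_npu() returning True), and the device id in paddle.NPUPlace(0) is illustrative only.

# Usage sketch (not part of this patch): run the concat op on an NPU place.
# Assumes an NPU-enabled build; NPUPlace(0) is a hypothetical device id.
import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x0 = paddle.static.data(name='x0', shape=[1, 4, 50], dtype='float32')
    x1 = paddle.static.data(name='x1', shape=[2, 4, 50], dtype='float32')
    # paddle.concat lowers to the concat op, which dispatches to the new
    # NPU kernel when the executor runs on an NPU place.
    out = paddle.concat([x0, x1], axis=0)

place = paddle.NPUPlace(0)
exe = paddle.static.Executor(place)
exe.run(startup_prog)
res, = exe.run(main_prog,
               feed={'x0': np.random.rand(1, 4, 50).astype('float32'),
                     'x1': np.random.rand(2, 4, 50).astype('float32')},
               fetch_list=[out])
print(res.shape)  # expected: (3, 4, 50)

The backward path (concat_grad, lowered to SplitVD in the kernel above) is not shown here; it is covered by test_check_grad in the new unit test.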