From cbd15f7d00b4e639b2b115d4aee61a8b48faa9ce Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Mon, 18 Oct 2021 15:10:07 +0800
Subject: [PATCH] [NPU] add kernels for elementwise_add gather_nd tile,
 test=develop (#36464)

---
 .../elementwise/elementwise_add_op_npu.cc     |  3 ++
 paddle/fluid/operators/gather_nd_op_npu.cc    | 36 +++++++++---------
 paddle/fluid/operators/tile_op_npu.cc         | 38 +++++++++++--------
 .../npu/test_elementwise_add_op_npu.py        | 15 +++++---
 .../unittests/npu/test_gather_nd_op_npu.py    | 16 ++++----
 .../tests/unittests/npu/test_tile_op_npu.py   | 20 +++++++++-
 6 files changed, 80 insertions(+), 48 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
index cd1d50a017c..41d5d718c24 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
@@ -146,6 +146,9 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
 REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel<float>,
+#ifdef PADDLE_WITH_ASCEND_INT64
+                       ops::ElementwiseAddNPUKernel<int64_t>,
+#endif
                        ops::ElementwiseAddNPUKernel<plat::float16>);
 
 REGISTER_OP_NPU_KERNEL(elementwise_add_grad,
diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc
index d04e0bce36f..8102322bd3b 100644
--- a/paddle/fluid/operators/gather_nd_op_npu.cc
+++ b/paddle/fluid/operators/gather_nd_op_npu.cc
@@ -18,7 +18,10 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-template <typename DeviceContext, typename T>
+using Tensor = framework::Tensor;
+using NPUDeviceContext = platform::NPUDeviceContext;
+
+template <typename T>
 class GatherNdNPUKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -49,14 +52,12 @@ class GatherNdNPUKernel : public framework::OpKernel<T> {
                               framework::proto::VarType::INT64)));
 
     const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {});
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
+    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
     runner.Run(stream);
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class GatherNdGradNPUKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -91,10 +92,7 @@ class GatherNdGradNPUKernel : public framework::OpKernel<T> {
       dout = &tmp_tensor2;
     }
 
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
+    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
     platform::NPUMemsetAsync(static_cast<void *>(p), 0, dx->numel() * sizeof(T),
                              stream);
 
@@ -108,13 +106,13 @@ class GatherNdGradNPUKernel : public framework::OpKernel<T> {
 } // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_NPU_KERNEL(
-    gather_nd, ops::GatherNdNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::GatherNdNPUKernel<paddle::platform::NPUDeviceContext, paddle::platform::float16>);
-
-REGISTER_OP_NPU_KERNEL(
-    gather_nd_grad,
-    ops::GatherNdGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::GatherNdGradNPUKernel<paddle::platform::NPUDeviceContext, paddle::platform::float16>);
+REGISTER_OP_NPU_KERNEL(gather_nd,
+                       ops::GatherNdNPUKernel<float>,
+#ifdef PADDLE_WITH_ASCEND_INT64
+                       ops::GatherNdNPUKernel<int64_t>,
+#endif
+                       ops::GatherNdNPUKernel<paddle::platform::float16>);
+
+REGISTER_OP_NPU_KERNEL(gather_nd_grad,
+                       ops::GatherNdGradNPUKernel<float>,
+                       ops::GatherNdGradNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc
index c85a1cbc671..95d7cb9e362 100644
--- a/paddle/fluid/operators/tile_op_npu.cc
+++ b/paddle/fluid/operators/tile_op_npu.cc
@@ -16,7 +16,11 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-template <typename DeviceContext, typename T>
+
+using Tensor = framework::Tensor;
+using NPUDeviceContext = platform::NPUDeviceContext;
+
+template <typename T>
 class TileNPUKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -92,18 +96,21 @@ class TileNPUKernel : public framework::OpKernel<T> {
 
     std::vector<int> temp(repeat_times.size(), 1);
     if (repeat_times == temp) {
-      framework::TensorCopy(
-          *in0, context.GetPlace(),
-          context.template device_context<DeviceContext>(), out0);
+      framework::TensorCopy(*in0, context.GetPlace(),
+                            context.template device_context<NPUDeviceContext>(),
+                            out0);
       return;
     }
 
-    const auto& runner =
-        NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}});
-    auto stream =
-        context.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
+    // const auto& runner =
+    //     NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}});
+    auto stream = context.template device_context<NPUDeviceContext>().stream();
+    NpuOpRunner runner;
+    runner.SetType("Tile")
+        .AddInput(*in0)
+        .AddInput(std::move(repeat_times))
+        .AddOutput(*out0)
+        .Run(stream);
   }
 };
 
@@ -111,8 +118,9 @@ class TileNPUKernel : public framework::OpKernel<T> {
 } // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_NPU_KERNEL(
-    tile, ops::TileNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::TileNPUKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::TileNPUKernel<paddle::platform::NPUDeviceContext, paddle::platform::float16>);
+REGISTER_OP_NPU_KERNEL(tile, ops::TileNPUKernel<float>, ops::TileNPUKernel<int>,
+#ifdef PADDLE_WITH_ASCEND_INT64
+                       ops::TileNPUKernel<int64_t>,
+#endif
+                       ops::TileNPUKernel<bool>,
+                       ops::TileNPUKernel<paddle::platform::float16>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
index 9b27e75e37d..75c70e0a131 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
@@ -65,7 +65,7 @@ class TestElementwiseAddOp(OpTest):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
+        if self.dtype == np.float16 or self.dtype == np.int64:
             return
 
         self.check_grad_with_place(
@@ -75,7 +75,7 @@ class TestElementwiseAddOp(OpTest):
             max_relative_error=0.006, )
 
     def test_check_grad_ingore_x(self):
-        if self.dtype == np.float16:
+        if self.dtype == np.float16 or self.dtype == np.int64:
             return
 
         self.check_grad_with_place(
@@ -86,7 +86,7 @@ class TestElementwiseAddOp(OpTest):
             max_relative_error=0.006, )
 
     def test_check_grad_ingore_y(self):
-        if self.dtype == np.float16:
+        if self.dtype == np.float16 or self.dtype == np.int64:
             return
 
         self.check_grad_with_place(
@@ -102,6 +102,11 @@ class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
         self.dtype = np.float16
 
 
+class TestINT64ElementwiseAddOp(TestElementwiseAddOp):
+    def init_dtype(self):
+        self.dtype = np.int64
+
+
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
@@ -507,8 +512,8 @@ class TestAddApi(unittest.TestCase):
 
     def test_dygraph(self):
         with fluid.dygraph.guard(paddle.NPUPlace(0)):
-            np_x = np.array([2, 3, 4]).astype('float64')
-            np_y = np.array([1, 5, 2]).astype('float64')
+            np_x = np.array([2, 3, 4]).astype('float32')
+            np_y = np.array([1, 5, 2]).astype('float32')
             x = fluid.dygraph.to_variable(np_x)
             y = fluid.dygraph.to_variable(np_y)
             z = self._executed_api(x, y)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py
index b124a546241..acb4ffd686f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py
@@ -61,7 +61,7 @@ def test_class1(op_type, typename):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            if typename == "float16":
+            if typename == "float16" or typename == "int64":
                 self.__class__.no_need_check_grad = True
             else:
                 self.check_grad_with_place(self.place, ['X'], 'Out')
@@ -88,7 +88,7 @@ def test_class2(op_type, typename):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            if typename == "float16":
+            if typename == "float16" or typename == "int64":
                 self.__class__.no_need_check_grad = True
             else:
                 self.check_grad_with_place(self.place, ['X'], 'Out')
@@ -120,7 +120,7 @@ def test_class3(op_type, typename):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            if typename == "float16":
+            if typename == "float16" or typename == "int64":
                 self.__class__.no_need_check_grad = True
             else:
                 self.check_grad_with_place(
@@ -153,7 +153,7 @@ def test_class4(op_type, typename):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            if typename == "float16":
+            if typename == "float16" or typename == "int64":
                 self.__class__.no_need_check_grad = True
             else:
                 self.check_grad_with_place(self.place, ['X'], 'Out')
@@ -184,7 +184,7 @@ def test_class5(op_type, typename):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            if typename == "float16":
+            if typename == "float16" or typename == "int64":
                 self.__class__.no_need_check_grad = True
             else:
                 self.check_grad_with_place(self.place, ['X'], 'Out')
@@ -217,7 +217,7 @@ def test_class6(op_type, typename):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            if typename == "float16":
+            if typename == "float16" or typename == "int64":
                 self.__class__.no_need_check_grad = True
             else:
                 self.check_grad_with_place(self.place, ['X'], 'Out')
@@ -252,7 +252,7 @@ def test_class7(op_type, typename):
             self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            if typename == "float16":
+            if typename == "float16" or typename == "int64":
                 self.__class__.no_need_check_grad = True
             else:
                 self.check_grad_with_place(self.place, ['X'], 'Out')
@@ -276,7 +276,7 @@ class TestGatherNdAPI(unittest.TestCase):
         paddle.enable_static()
 
 
-for _typename in {'float16', 'float32'}:
+for _typename in {'float16', 'float32', 'int64'}:
     test_class1('gather_nd', _typename)
     test_class2('gather_nd', _typename)
     test_class3('gather_nd', _typename)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py
index 0da80189f7d..0e61fa00fdf 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py
@@ -206,7 +206,7 @@ class TestTileOpInt64_t(OpTest):
         self.op_type = "tile"
         self.inputs = {
             'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int32")
+                10, size=(2, 4, 5)).astype("int64")
         }
         self.attrs = {'repeat_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
         self.outputs = {'Out': output}
@@ -219,6 +219,24 @@ class TestTileOpInt64_t(OpTest):
     def set_npu(self):
         self.__class__.use_npu = True
 
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
 
+# Situation 6: input x is Bool
+class TestTileOpBool(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "tile"
+        self.inputs = {'X': np.random.randint(1, size=(2, 4, 5)).astype("bool")}
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
 # Test python API
 class TestTileAPI(unittest.TestCase):
     def test_api(self):
--
GitLab
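
For reference, a minimal usage sketch (not part of the patch) of the dtypes these kernels now cover. It assumes a PaddlePaddle build with Ascend NPU support and PADDLE_WITH_ASCEND_INT64 defined so the int64 paths are registered, and that an NPU is visible as device "npu:0"; the calls below are the public paddle ops backed by the registered NPU kernels.

    # Usage sketch only -- assumes an Ascend NPU build of PaddlePaddle with
    # PADDLE_WITH_ASCEND_INT64 enabled and an NPU visible as "npu:0".
    import numpy as np
    import paddle

    paddle.set_device('npu:0')

    # elementwise_add with int64 inputs (new int64 kernel)
    x = paddle.to_tensor(np.array([2, 3, 4], dtype=np.int64))
    y = paddle.to_tensor(np.array([1, 5, 2], dtype=np.int64))
    print(paddle.add(x, y).numpy())  # [3 8 6]

    # gather_nd with an int64 source tensor (new int64 kernel)
    data = paddle.to_tensor(np.arange(12, dtype=np.int64).reshape(3, 4))
    index = paddle.to_tensor(np.array([[0, 1], [2, 3]], dtype=np.int64))
    print(paddle.gather_nd(data, index).numpy())  # [ 1 11]

    # tile with a bool input (new bool kernel, dynamic "Tile" runner path)
    mask = paddle.to_tensor(np.array([[True, False]]))
    print(paddle.tile(mask, repeat_times=[2, 2]).numpy())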