From f354e1d6d51dd0fafb7f722d2a3a926e3c4a3336 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 25 Mar 2021 15:18:46 +0800 Subject: [PATCH] [NPU] fix some op bugs (#31855) * fix some op bugs * fix some bugs * follow comments * fix log level * add ut --- paddle/fluid/operators/activation_op_npu.cc | 3 +- .../amp/update_loss_scaling_op_npu.cc | 6 +-- paddle/fluid/operators/concat_op_npu.cc | 46 ++++++++++--------- .../operators/reduce_ops/reduce_sum_op_npu.cc | 39 +++++++++++++++- paddle/fluid/operators/slice_op_npu.cc | 10 +++- .../tests/unittests/npu/test_concat_op_npu.py | 6 +-- .../unittests/npu/test_reduce_sum_op_npu.py | 8 +++- .../tests/unittests/npu/test_slice_op_npu.py | 12 +++++ 8 files changed, 98 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 1a843bfc991..923b581af28 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -364,4 +364,5 @@ REGISTER_OP_NPU_KERNEL( REGISTER_OP_NPU_KERNEL( square, ops::SquareNPUKernel, ops::SquareNPUKernel); + paddle::platform::float16>, + ops::SquareNPUKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 1385a3182fd..dd6dbfd5c0b 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -148,13 +148,13 @@ class LazyZerosNPU { for (size_t i = 0; i < xs.size(); ++i) { auto* out = outs[i]; if (found_inf_vec[0]) { - VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --"; + VLOG(4) << "-- UpdateLossScaling: Find infinite grads. --"; auto place = dev_ctx.GetPlace(); auto stream = dev_ctx.stream(); - auto g = out->mutable_data(place); + auto g = out->mutable_data(place); platform::NPUMemsetAsync(static_cast(g), 0, - out->numel() * sizeof(int), stream); + out->numel() * sizeof(T), stream); } } } diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc index 04aa10d712e..9b979dede04 100644 --- a/paddle/fluid/operators/concat_op_npu.cc +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -71,15 +71,6 @@ class ConcatGradNPUKernel : public framework::OpKernel { auto outs = ctx.MultiOutput(framework::GradVarName("X")); - { - auto dx = outs; - auto x = ins; - for (size_t i = 0; i < dx.size(); ++i) { - if (dx[i] != nullptr) { - dx[i]->set_lod(x[i]->lod()); - } - } - } PADDLE_ENFORCE_NOT_NULL(ins[0], platform::errors::NotFound( "The first input tensor is not initalized.")); @@ -88,26 +79,39 @@ class ConcatGradNPUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); - // get output tensor that the name is not kEmptyVarName - std::vector outputs; + std::vector sizes; + int offset = 0; + auto stream = + ctx.template device_context() + .stream(); for (size_t j = 0; j < outs.size(); ++j) { + // For stop gradient + // get output tensor that the name is not kEmptyVarName if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(*outs[j]); sizes.push_back(outs[j]->dims()[axis]); + std::vector offsets; + std::vector sizes; + for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { + if (dim == axis) { + offsets.push_back(offset); + sizes.push_back(ins[j]->dims()[dim]); + } else { + offsets.push_back(0); + sizes.push_back(ins[j]->dims()[dim]); + } + } + auto runner = + NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offset}, {"size", ins[j]->dims()[axis]}}); + runner.Run(stream); + } + if (ins[j]->numel() != 0UL) { + offset += ins[j]->dims()[axis]; } } - auto runner = - NpuOpRunner("SplitVD", {*out_grad}, outputs, - {{"split_dim", axis}, - {"size_splits", sizes}, - {"num_split", static_cast(outputs.size())}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc index e2cd7ca353c..f3b6e69a48b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc @@ -34,23 +34,58 @@ class ReduceSumNPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); + // special case + if (x->dims().size() == 1 && keep_dims == false) { + keep_dims = true; + } + auto stream = ctx.template device_context() .stream(); + + framework::Tensor cast_x; + framework::Tensor cast_out; + // NOTE: ReduceSumD only supports fp32 and fp16 + if (x->type() != framework::proto::VarType::FP32 && + x->type() != framework::proto::VarType::FP16) { + cast_x.Resize(x->dims()); + cast_x.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32); + auto runner_cast = NpuOpRunner( + "Cast", {*x}, {cast_x}, {{"dst_type", static_cast(dst_dtype)}}); + runner_cast.Run(stream); + + cast_out.Resize(out->dims()); + cast_out.mutable_data(ctx.GetPlace()); + } else { + cast_x.ShareDataWith(*x); + cast_out.ShareDataWith(*out); + } + if (reduce_all) { std::vector dim_vec; for (int i = 0; i < x->dims().size(); i++) { dim_vec.push_back(i); } - auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out}, + + auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, {{"axes", dim_vec}, {"keep_dims", keep_dims}}); runner.Run(stream); } else { - auto runner = NpuOpRunner("ReduceSumD", {*x}, {*out}, + auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, {{"axes", dims}, {"keep_dims", keep_dims}}); runner.Run(stream); } + + if (x->type() != framework::proto::VarType::FP32 && + x->type() != framework::proto::VarType::FP16) { + auto dst_dtype = ConvertToNpuDtype(out->type()); + auto runner_cast = + NpuOpRunner("Cast", {cast_out}, {*out}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast.Run(stream); + } } }; diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 7dc06700e00..e5e0dafdae0 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -36,7 +36,15 @@ void UpdateAttr(const framework::DDim in_dims, const std::vector axes, if (axis == i) { start = starts[cnt]; - end = ends[cnt] <= in_dims[i] ? ends[cnt] : end; + if (start < 0) { + start = (start + in_dims[i]); + } + start = std::max(start, static_cast(0)); + end = ends[cnt]; + if (end < 0) { + end = (end + in_dims[i]); + } + end = std::min(end, static_cast(in_dims[i])); cnt++; } diff --git a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py index 6201df135b0..a2ec1c7a9ee 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py @@ -32,7 +32,7 @@ class TestConcat(OpTest): def setUp(self): self.set_npu() self.op_type = "concat" - self.place = paddle.NPUPlace(4) + self.place = paddle.NPUPlace(0) self.init_dtype() self.init_test_data() @@ -66,7 +66,7 @@ class TestConcat(OpTest): def test_check_grad(self): self.check_grad_with_place( - self.place, ['x0'], 'Out', check_dygraph=False) + self.place, ['x0', 'x2'], 'Out', check_dygraph=False) self.check_grad_with_place( self.place, ['x1'], 'Out', check_dygraph=False) self.check_grad_with_place( @@ -77,7 +77,7 @@ class TestConcatFP16(OpTest): def setUp(self): self.set_npu() self.op_type = "concat" - self.place = paddle.NPUPlace(4) + self.place = paddle.NPUPlace(0) self.init_dtype() self.init_test_data() diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py index dea5141a024..d3861bf0780 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py @@ -32,6 +32,7 @@ class TestReduceSum(OpTest): def setUp(self): np.random.seed(SEED) self.set_npu() + self.init_dtype() self.place = paddle.NPUPlace(0) self.init_op_type() self.initTestCase() @@ -42,7 +43,7 @@ class TestReduceSum(OpTest): 'keep_dim': self.keep_dim, 'reduce_all': self.reduce_all } - self.inputs = {'X': np.random.random(self.shape).astype("float32")} + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} if self.attrs['reduce_all']: self.outputs = {'Out': self.inputs['X'].sum()} else: @@ -78,6 +79,11 @@ class TestReduceSum(OpTest): # +class TestReduceSum2(OpTest): + def init_dtype(self): + self.dtype = np.int32 + + @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") class TestReduceSumNet(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 1e30bb00782..500618f509f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -63,10 +63,22 @@ class TestSliceOp(OpTest): self.check_output_with_place(self.place, check_dygraph=False) def test_check_grad_normal(self): + if self.dtype == np.float16: + return self.check_grad_with_place( self.place, ['Input'], 'Out', check_dygraph=False) +class TestSliceOp2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, -3] + self.ends = [3, 3, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, -3:-1, :] + + @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") class TestSliceOpFp16(TestSliceOp): -- GitLab