From c038cc7a34489b70c5d20748a2e00d78a5d281dd Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Thu, 28 Oct 2021 15:57:11 +0800 Subject: [PATCH] [NPU] Add int64 supporting for expand_v2, reduce_max, scale and tests (#36582) * add TypeAdapter method for npu_op_runner * add int64 supporting for elementwise_mul and reduce_sum * add int64 supporting and UT for expand_v2, scale and reduce_max * fix bug --- paddle/fluid/operators/activation_op_npu.cc | 4 +- .../elementwise/elementwise_mul_op_npu.cc | 12 +++- paddle/fluid/operators/expand_v2_op_npu.cc | 31 ++++++++-- .../fluid/operators/fill_constant_op_npu.cc | 61 ++++++++++++------ paddle/fluid/operators/npu_op_runner.cc | 62 +++++++++++++++++++ paddle/fluid/operators/npu_op_runner.h | 10 +++ .../operators/reduce_ops/reduce_max_op_npu.cc | 33 +++++++--- .../operators/reduce_ops/reduce_sum_op_npu.cc | 6 ++ paddle/fluid/operators/scale_op_npu.cc | 46 ++++++++++++-- .../unittests/npu/test_expand_v2_op_npu.py | 24 ++++++- .../npu/test_fill_constant_op_npu.py | 24 +++++++ .../unittests/npu/test_reduce_max_op_npu.py | 25 ++++++++ .../tests/unittests/npu/test_scale_op_npu.py | 13 +++- 13 files changed, 305 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 20c56d6a27..e0cb4dee53 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -503,7 +503,6 @@ class SwishGradNPUKernel : public framework::OpKernel { beta_x.mutable_data(x->dims(), ctx.GetPlace()); sigmoid_out.mutable_data(x->dims(), ctx.GetPlace()); swish_out.mutable_data(x->dims(), ctx.GetPlace()); - const auto& muls_runner = NpuOpRunner("Muls", {*x}, {beta_x}, {{"value", beta}}); muls_runner.Run(stream); @@ -515,6 +514,9 @@ class SwishGradNPUKernel : public framework::OpKernel { const auto& mul_runner = NpuOpRunner("Mul", {sigmoid_out, *x}, {swish_out}, {}); mul_runner.Run(stream); + const auto& muls_runner2 = + NpuOpRunner("Muls", {swish_out}, {swish_out}, {{"value", beta}}); + muls_runner2.Run(stream); const auto& mul_runner1 = NpuOpRunner("Mul", {sigmoid_out, swish_out}, {*dx}, {}); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index b2030ad21e..36a7d54f8c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -143,8 +143,16 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL(elementwise_mul, ops::ElementwiseMulNPUKernel, - ops::ElementwiseMulNPUKernel); + ops::ElementwiseMulNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseMulNPUKernel, +#endif + ops::ElementwiseMulNPUKernel); REGISTER_OP_NPU_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradNPUKernel, - ops::ElementwiseMulGradNPUKernel); + ops::ElementwiseMulGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseMulGradNPUKernel, +#endif + ops::ElementwiseMulGradNPUKernel); diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index 4b0e077057..46385a20ab 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -106,11 +106,28 @@ class ExpandV2NPUKernel : public framework::OpKernel { Out->Resize(out_dims); Out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + const auto& dev_ctx = + ctx.template device_context(); + auto op_func = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + + if (X->type() == framework::proto::VarType::BOOL) { + NpuOpRunner::TypeAdapter({*X}, {*Out}, attr_input, dev_ctx, op_func, + {framework::proto::VarType::UINT8}, + {framework::proto::VarType::UINT8}); + } else if (X->type() == framework::proto::VarType::INT64) { + NpuOpRunner::TypeAdapter({*X}, {*Out}, attr_input, dev_ctx, op_func, + {framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input); + runner.Run(dev_ctx.stream()); + } } }; @@ -181,7 +198,9 @@ REGISTER_OP_NPU_KERNEL( ops::ExpandV2NPUKernel, ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel); + ops::ExpandV2NPUKernel, + ops::ExpandV2NPUKernel, + ops::ExpandV2NPUKernel); REGISTER_OP_NPU_KERNEL( expand_v2_grad, diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 16a2433f5c..7241fcaf18 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -22,13 +22,13 @@ namespace operators { template class FillConstantNPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext &ctx) const override { auto data_type = static_cast(ctx.Attr("dtype")); auto str_value = ctx.Attr("str_value"); auto float_value = ctx.Attr("value"); - auto* out_var = ctx.Output("Out"); + auto *out_var = ctx.Output("Out"); auto stream = ctx.template device_context() .stream(); @@ -59,28 +59,49 @@ class FillConstantNPUKernel : public framework::OpKernel { } auto shape = GetShape(ctx); - Tensor tensor_value(data_type); - tensor_value.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, value); - out_var->mutable_data(shape, ctx.GetPlace()); - - NpuOpRunner runner; + if (data_type != framework::proto::VarType::BOOL) { + Tensor tensor_value(data_type); + tensor_value.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&tensor_value, value); + NpuOpRunner runner; #if (CANN_VERSION_CODE >= 503003) - runner.SetType("FillD") - .AddInput(tensor_value) - .AddOutput(*out_var) - .AddAttrs( - {{ "dims", - framework::vectorize(shape) }}) - .Run(stream); + runner.SetType("FillD") + .AddInput(tensor_value) + .AddOutput(*out_var) + .AddAttrs( + {{ "dims", + framework::vectorize(shape) }}) + .Run(stream); #else - runner.SetType("Fill") - .AddInput(framework::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(*out_var) - .Run(stream); + runner.SetType("Fill") + .AddInput(framework::vectorize(shape)) + .AddInput(tensor_value) + .AddOutput(*out_var) + .Run(stream); #endif + } else { + const auto &dev_ctx = + ctx.template device_context(); + auto op_func = [&shape, &value]( + const std::vector &inputs, const std::vector &outputs, + const NPUAttributeMap &attrs, + const platform::NPUDeviceContext &dev_ctx) { + Tensor tensor_value; + tensor_value.mutable_data({1}, dev_ctx.GetPlace()); + FillNpuTensorWithConstant(&tensor_value, + static_cast(value)); + + NpuOpRunner runner; + runner.SetType("Fill") + .AddInput(framework::vectorize(shape)) + .AddInput(tensor_value) + .AddOutput(outputs[0]) + .Run(dev_ctx.stream()); + }; + NpuOpRunner::TypeAdapter({}, {*out_var}, {}, dev_ctx, op_func, {}, + {framework::proto::VarType::UINT8}); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index 830e18cb8a..e104fc157d 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -436,5 +436,67 @@ void NpuOpRunner::Run(aclrtStream stream) const { PADDLE_ENFORCE_NPU_SUCCESS(ret); } +void NpuOpRunner::TypeAdapter( + const std::vector &inputs, const std::vector &outputs, + const NPUAttributeMap &attrs, const platform::NPUDeviceContext &dev_ctx, + std::function &, const std::vector &, + const NPUAttributeMap &, + const platform::NPUDeviceContext &)> + op_runner, + const std::vector &input_type, + const std::vector &output_type) { + PADDLE_ENFORCE_EQ( + inputs.size(), input_type.size(), + platform::errors::InvalidArgument( + "The number of inputs must be equal to input_type.size().")); + PADDLE_ENFORCE_EQ( + outputs.size(), output_type.size(), + platform::errors::InvalidArgument( + "The number of outputs must be equal to output_type.size().")); + + std::vector tmp_inputs(inputs.size()); + std::vector tmp_outputs(outputs.size()); + + for (size_t i = 0; i < input_type.size(); ++i) { + bool cast_input = + (input_type[i] == -1 || input_type[i] != inputs[i].type()); + if (!cast_input) { + tmp_inputs[i].ShareDataWith(inputs[i]); + } else { + tmp_inputs[i].Resize(inputs[i].dims()); + tmp_inputs[i].mutable_data(dev_ctx.GetPlace(), input_type[i]); + + const auto &cast_runner = NpuOpRunner( + "Cast", {inputs[i]}, {tmp_inputs[i]}, + {{"dst_type", static_cast(ConvertToNpuDtype(input_type[i]))}}); + cast_runner.Run(dev_ctx.stream()); + } + } + for (size_t i = 0; i < output_type.size(); ++i) { + bool cast_output = + (output_type[i] == -1 || output_type[i] != outputs[i].type()); + if (!cast_output) { + tmp_outputs[i].ShareDataWith(outputs[i]); + } else { + tmp_outputs[i].Resize(outputs[i].dims()); + tmp_outputs[i].mutable_data(dev_ctx.GetPlace(), output_type[i]); + } + } + + op_runner(tmp_inputs, tmp_outputs, attrs, dev_ctx); + + for (size_t i = 0; i < output_type.size(); ++i) { + bool cast_output = + (output_type[i] == -1 || output_type[i] != outputs[i].type()); + if (cast_output) { + const auto &cast_runner = NpuOpRunner( + "Cast", {tmp_outputs[i]}, {outputs[i]}, + {{"dst_type", + static_cast(ConvertToNpuDtype(outputs[i].type()))}}); + cast_runner.Run(dev_ctx.stream()); + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index 6db5f17d67..a4a3786b5d 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -103,6 +103,16 @@ class NpuOpRunner { void Run(aclrtStream stream = nullptr) const; + static void TypeAdapter( + const std::vector &inputs, const std::vector &outputs, + const NPUAttributeMap &attrs, const platform::NPUDeviceContext &dev_ctx, + std::function &, + const std::vector &, const NPUAttributeMap &, + const platform::NPUDeviceContext &)> + op_runner, + const std::vector &input_type, + const std::vector &output_type); + private: aclTensorDesc *CreateTensorDesc(Tensor tensor, aclMemType mem_type = ACL_MEMTYPE_DEVICE); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index 5efc7e9b86..68417cdad5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -73,20 +73,33 @@ class ReduceMaxNPUKernel : public framework::OpKernel { attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; } - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input); - runner.Run(stream); + const auto& dev_ctx = + ctx.template device_context(); + if (x->type() == framework::proto::VarType::INT64) { + auto op_func = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = + NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs); + runner.Run(dev_ctx.stream()); + }; + + NpuOpRunner::TypeAdapter({*x}, {cast_out}, attr_input, dev_ctx, op_func, + {framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner = + NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input); + runner.Run(dev_ctx.stream()); + } if (x->type() != cast_out_dtype) { auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); const auto& runner_cast = NpuOpRunner("Cast", {cast_out}, {*out}, {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); + runner_cast.Run(dev_ctx.stream()); } } }; @@ -98,4 +111,6 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( reduce_max, ops::ReduceMaxNPUKernel, - ops::ReduceMaxNPUKernel); + ops::ReduceMaxNPUKernel, + ops::ReduceMaxNPUKernel, + ops::ReduceMaxNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc index 78bd42ff00..33fcdbce9d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc @@ -142,12 +142,18 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( reduce_sum, ops::ReduceSumNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ReduceSumNPUKernel, +#endif ops::ReduceSumNPUKernel, ops::ReduceSumNPUKernel); REGISTER_OP_NPU_KERNEL( reduce_sum_grad, ops::ReduceSumGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ReduceSumGradNPUKernel, +#endif ops::ReduceSumGradNPUKernel, ops::ReduceSumGradNPUKernel); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index 744a9b137f..c2f320ed68 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -37,15 +37,47 @@ class ScaleNPUKernel : public framework::OpKernel { auto* scale_tensor = ctx.Input("ScaleTensor"); scale = static_cast(GetAttrFromTensor(scale_tensor)); } - + if (isinf(scale)) { + if (signbit(scale)) { + scale = -std::numeric_limits::max(); + } else { + scale = std::numeric_limits::max(); + } + } if (!bias_after_scale) { bias *= scale; } out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Power", {*x}, {*out}, - {{"power", power}, {"scale", scale}, {"shift", bias}}); - runner.Run(stream); + + framework::NPUAttributeMap attrs = { + {"power", power}, {"scale", scale}, {"shift", bias}}; + const auto& dev_ctx = + ctx.template device_context(); + auto op_func = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& muls_runner = NpuOpRunner("Muls", {inputs[0]}, {outputs[0]}, + {{"value", attrs.at("scale")}}); + muls_runner.Run(dev_ctx.stream()); + + const auto& adds_runner = NpuOpRunner("Adds", {outputs[0]}, {outputs[0]}, + {{"value", attrs.at("shift")}}); + adds_runner.Run(dev_ctx.stream()); + }; + + if (x->type() == framework::proto::VarType::INT32) { + NpuOpRunner::TypeAdapter({*x}, {*out}, attrs, dev_ctx, op_func, + {framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else if (x->type() == framework::proto::VarType::INT64) { + NpuOpRunner::TypeAdapter({*x}, {*out}, attrs, dev_ctx, op_func, + {framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner = NpuOpRunner("Power", {*x}, {*out}, attrs); + runner.Run(stream); + } } }; @@ -54,4 +86,6 @@ class ScaleNPUKernel : public framework::OpKernel { REGISTER_OP_NPU_KERNEL( scale, paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel); + paddle::operators::ScaleNPUKernel, + paddle::operators::ScaleNPUKernel, + paddle::operators::ScaleNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py index d48d2a8430..fd0b985030 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py @@ -201,13 +201,16 @@ class TestExpandV2OpFloat(OpTest): # Situation 5: input x is int32 # skip grad check for int32 class TestExpandV2OpInteger(OpTest): + def init_dtype(self): + self.dtype = 'int32' + def setUp(self): self.set_npu() self.place = paddle.NPUPlace(0) self.op_type = "expand_v2" self.inputs = { 'X': np.random.randint( - 10, size=(2, 4, 20)).astype("int32") + 10, size=(2, 4, 20)).astype(self.dtype) } self.attrs = {'shape': [2, 4, 20]} output = np.tile(self.inputs['X'], (1, 1, 1)) @@ -221,6 +224,25 @@ class TestExpandV2OpInteger(OpTest): self.check_output_with_place(self.place) +class TesstExpandV2OpInt64(TestExpandV2OpInteger): + def init_dtype(self): + self.dtype = 'int64' + + +class TesstExpandV2OpBool(TestExpandV2OpInteger): + def init_dtype(self): + self.dtype = 'bool' + + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_v2" + self.inputs = {'X': np.random.randint(10, size=(2, 4, 20)) > 5} + self.attrs = {'shape': [2, 4, 20]} + output = np.tile(self.inputs['X'], (1, 1, 1)) + self.outputs = {'Out': output} + + class TestExpandV2Error(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py index 2ab1521380..a3e781c990 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py @@ -120,5 +120,29 @@ class TestFillConstantFP16(OpTest): self.check_output_with_place(self.place, atol=1e-3) +class TestFillConstantBool(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = { + 'shape': [123, 92], + 'value': True, + 'dtype': core.VarDesc.VarType.BOOL + } + self.outputs = {'Out': np.full((123, 92), True).astype(self.dtype)} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.BOOL + + def test_check_output(self): + self.check_output_with_place(self.place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py index f6c346159b..68a28ea72e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py @@ -271,5 +271,30 @@ class TestReduceMaxOpWithOutDtype_fp32_2(TestNPUReduceMaxOp): self.dtype = np.float16 +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpInt64(TestNPUReduceMaxOp): + """Remove Max with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.INT64) + } + self.outputs = { + 'Out': self.inputs['X'].max( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + def init_dtype(self): + self.dtype = np.int64 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py index 65ec28fbf7..424c4ca0ff 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py @@ -39,7 +39,8 @@ class TestScale(OpTest): } self.attrs = {'scale': -2.3, 'bias': 0, 'bias_after_scale': True} self.outputs = { - 'Out': self.inputs['X'] * self.dtype(self.attrs['scale']) + 'Out': (self.inputs['X'] * + self.dtype(self.attrs['scale'])).astype(self.dtype) } def set_npu(self): @@ -57,6 +58,16 @@ class TestFP16Scale(TestScale): self.dtype = np.float16 +class TestScaleInt(TestScale): + def init_dtype(self): + self.dtype = np.int32 + + +class TestScaleInt64(TestScale): + def init_dtype(self): + self.dtype = np.int64 + + class TestBiasAfterScale(OpTest): def setUp(self): self.set_npu() -- GitLab