From ceec1e21df709006bb4a1c73083a034499eea634 Mon Sep 17 00:00:00 2001
From: zhangyk0314 <48021248+zhangyk0314@users.noreply.github.com>
Date: Thu, 30 Dec 2021 19:08:09 +0800
Subject: [PATCH] Add exp, abs_grad, reciprocal, reciprocal_grad operator for
 XPU and update xpu2_op_list.h,test=kunlun (#38570)

---
 cmake/external/xpu.cmake                     |   2 +-
 paddle/fluid/operators/activation_op_xpu.cc  | 183 +++++++++++-------
 .../fluid/platform/device/xpu/xpu2_op_list.h |  18 ++
 .../unittests/xpu/test_activation_op_xpu.py  |  24 +++
 4 files changed, 154 insertions(+), 73 deletions(-)

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 9041feb10c8..588ba0bfe86 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211226")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211228")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc
index fe85eb26705..60188ee53ef 100644
--- a/paddle/fluid/operators/activation_op_xpu.cc
+++ b/paddle/fluid/operators/activation_op_xpu.cc
@@ -98,29 +98,29 @@ void xpu_activation_backward(
 }
 
 template <typename T>
-struct XPUReluFunctor : public BaseActivationFunctor<T> {
+struct XPUAbsFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::relu<XPUType>);
+        ctx, xpu::abs<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
+struct XPUAbsGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sigmoid<XPUType>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::abs_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUTanhFunctor : public BaseActivationFunctor<T> {
+struct XPUExpFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::tanh<XPUType>);
+        ctx, xpu::exp<XPUType>);
   }
 };
 
@@ -134,119 +134,83 @@ struct XPULogFunctor : public BaseActivationFunctor<T> {
 };
 
 template <typename T>
-struct XPUSquareFunctor : public BaseActivationFunctor<T> {
+struct XPUReciprocalFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::square<XPUType>);
+        ctx, xpu::reciprocal<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
+struct XPUReciprocalGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sqrt<XPUType>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::reciprocal_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUAbsFunctor : public BaseActivationFunctor<T> {
+struct XPUReluFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::abs<XPUType>);
+        ctx, xpu::relu<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUPowFunctor : public BaseActivationFunctor<T> {
+struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    const auto *x = ctx.Input<Tensor>("X");
-    auto *y = ctx.Output<Tensor>("Out");
-    auto pow_factor = ctx.Attr<float>("factor");
-    const T *x_data = x->data<T>();
-    T *y_data = y->mutable_data<T>(ctx.GetPlace());
-    T *factor_data = nullptr;
-
-    auto xpu_context =
-        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
-    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&factor_data),
-                                 x->numel() * sizeof(T)),
-                      XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                       "XPU has no enough memory"));
-    int r = xpu::constant<T>(xpu_context, factor_data, x->numel(), pow_factor);
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External("XPU constant op return"
-                                   " wrong value[%d %s] in pow op.",
-                                   r, XPUAPIErrorMsg[r]));
-    r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel());
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::External("XPU pow op return"
-                                                 " wrong value[%d %s].",
-                                                 r, XPUAPIErrorMsg[r]));
-    if (xpu_context->xpu_stream != nullptr) {
-      xpu_wait(xpu_context->xpu_stream);
-    }
-    xpu_free(factor_data);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::relu_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
+struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    float threshold = ctx.Attr<float>("threshold");
-    float scale = ctx.Attr<float>("scale");
-    float offset = ctx.Attr<float>("offset");
-    PADDLE_ENFORCE_EQ(threshold, 6.0f,
-                      platform::errors::External(
-                          "Not support threshold [%f] in XPU", threshold));
-    PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External(
-                                       "Not support scale [%f] in XPU", scale));
-    PADDLE_ENFORCE_EQ(
-        offset, 3.0f,
-        platform::errors::External("Not support offset [%f] in XPU", offset));
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::hard_swish<XPUType>);
+        ctx, xpu::sigmoid<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::relu_grad<XPUType>);
+        ctx, xpu::sigmoid_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::tanh_grad<XPUType>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sqrt<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sigmoid_grad<XPUType>);
+        ctx, xpu::sqrt_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSquareFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sqrt_grad<XPUType>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::square<XPUType>);
   }
 };
 
@@ -259,6 +223,44 @@ struct XPUSquareGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct XPUTanhFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh<XPUType>);
+  }
+};
+
+template <typename T>
+struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh_grad<XPUType>);
+  }
+};
+
+template <typename T>
+struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    float threshold = ctx.Attr<float>("threshold");
+    float scale = ctx.Attr<float>("scale");
+    float offset = ctx.Attr<float>("offset");
+    PADDLE_ENFORCE_EQ(threshold, 6.0f,
+                      platform::errors::External(
+                          "Not support threshold [%f] in XPU", threshold));
+    PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External(
+                                       "Not support scale [%f] in XPU", scale));
+    PADDLE_ENFORCE_EQ(
+        offset, 3.0f,
+        platform::errors::External("Not support offset [%f] in XPU", offset));
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::hard_swish<XPUType>);
+  }
+};
+
 template <typename T>
 struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
@@ -328,6 +330,40 @@ struct XPULeakyReluGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct XPUPowFunctor : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    const auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Output<Tensor>("Out");
+    auto pow_factor = ctx.Attr<float>("factor");
+    const T *x_data = x->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *factor_data = nullptr;
+
+    auto xpu_context =
+        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&factor_data),
+                                 x->numel() * sizeof(T)),
+                      XPU_SUCCESS, platform::errors::ResourceExhausted(
+                                       "XPU has no enough memory"));
+    int r = xpu::constant<T>(xpu_context, factor_data, x->numel(), pow_factor);
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("XPU constant op return"
+                                   " wrong value[%d %s] in pow op.",
+                                   r, XPUAPIErrorMsg[r]));
+    r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel());
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("XPU pow op return wrong value[%d %s].", r,
+                                   XPUAPIErrorMsg[r]));
+    if (xpu_context->xpu_stream != nullptr) {
+      xpu_wait(xpu_context->xpu_stream);
+    }
+    xpu_free(factor_data);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -340,15 +376,18 @@ namespace ops = paddle::operators;
       act_type##_grad,                                                  \
       ops::XPUActivationGradKernel<ops::grad_functor<float>>);
 
+REGISTER_ACTIVATION_XPU_KERNEL(abs, XPUAbsFunctor, XPUAbsGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
+                               XPUHardSwishGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor,
+                               XPULeakyReluGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, XPUReciprocalFunctor,
+                               XPUReciprocalGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
                                XPUSigmoidGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
-                               XPUHardSwishGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor,
-                               XPULeakyReluGradFunctor)
 
 REGISTER_OP_XPU_KERNEL(
     tanh, ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
     ops::XPUActivationKernel<ops::XPUTanhFunctor<paddle::platform::float16>>);
@@ -358,11 +397,11 @@ REGISTER_OP_XPU_KERNEL(
     ops::XPUActivationGradKernel<
         ops::XPUTanhGradFunctor<paddle::platform::float16>>);
 
+REGISTER_OP_XPU_KERNEL(exp,
+                       ops::XPUActivationKernel<ops::XPUExpFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(log,
                        ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(pow,
                        ops::XPUActivationKernel<ops::XPUPowFunctor<float>>);
-REGISTER_OP_XPU_KERNEL(abs,
-                       ops::XPUActivationKernel<ops::XPUAbsFunctor<float>>);
 
 #endif  // PADDLE_WITH_XPU
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index c5a140a7681..b4ad88ce6ab 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -29,6 +29,9 @@ using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;
 XPUOpMap& get_kl2_ops() {
   // KL1支持的op,通过op_name, data_type, place来索引
   static XPUOpMap s_xpu2_kernels{
+      {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"abs_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                 pOpKernelType(vartype::FP16, XPUPlace())})},
       {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -106,6 +109,7 @@ XPUOpMap& get_kl2_ops() {
       {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                               pOpKernelType(vartype::INT32, XPUPlace()),
                               pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"expand_as_v2",
        XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                      pOpKernelType(vartype::INT64, XPUPlace()),
@@ -185,6 +189,9 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                      pOpKernelType(vartype::INT32, XPUPlace()),
                      pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"hard_swish_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"iou_similarity",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"label_smooth",
@@ -227,6 +234,10 @@ XPUOpMap& get_kl2_ops() {
       {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                             pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"nearest_interp_v2",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"nearest_interp_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                                   pOpKernelType(vartype::INT32, XPUPlace()),
                                   pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -239,6 +250,10 @@ XPUOpMap& get_kl2_ops() {
       {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reciprocal_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"reduce_max_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -273,6 +288,9 @@ XPUOpMap& get_kl2_ops() {
                      pOpKernelType(vartype::FP32, XPUPlace())})},
       {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sigmoid_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                                    pOpKernelType(vartype::FP16, XPUPlace()),
                                    pOpKernelType(vartype::INT32, XPUPlace())})},
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
index c2c69be45bf..ce82b20eca4 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
@@ -154,6 +154,11 @@ class TestXPUAbs(TestXPUActivation):
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.outputs = {'Out': out}
 
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
 
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
@@ -334,6 +339,25 @@ def leaky_relu(x, alpha):
     return y_ref.astype(x.dtype)
 
 
+class TestXPUReciprocal(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "reciprocal"
+        self.init_dtype()
+
+        np.random.seed(1024)
+        x = np.random.uniform(1, 2, [1111, 1117]).astype(self.dtype)
+        out = np.reciprocal(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {'use_xpu': True}
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+
 if __name__ == "__main__":
     paddle.enable_static()
     unittest.main()
-- 
GitLab