/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_XPU

// NOTE(review): this file was recovered from a copy in which every `<...>`
// sequence (template parameter lists, template arguments, cast target types,
// the system include name) had been lost. They were restored from how each
// name is used below; please confirm them against the project headers
// (activation_op.h, xpu_header.h) before merging.

#include <functional>
#include <string>
#include <vector>

#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"

namespace paddle {
namespace operators {

using paddle::framework::Tensor;

/// Forward kernel wrapper.
///
/// Default-constructs `Functor`, fills every attribute slot the functor
/// exposes through `GetAttrs()` with the float attribute of the same name
/// read from the execution context, then invokes the functor on the context.
template <typename Functor>
class XPUActivationKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    Functor functor;

    auto attrs = functor.GetAttrs();
    for (auto &attr : attrs) {
      // attr.first is the attribute name, attr.second points at its storage.
      *attr.second = context.Attr<float>(attr.first);
    }
    functor(context);
  }
};

/// Backward kernel wrapper. Same steps as XPUActivationKernel; kept as a
/// distinct type so that gradient ops are registered under their own wrapper.
template <typename Functor>
class XPUActivationGradKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    Functor functor;

    auto attrs = functor.GetAttrs();
    for (auto &attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
    functor(context);
  }
};

/// Runs an element-wise forward activation.
///
/// Reads input "X", allocates output "Out" on the context's place and calls
/// `func(xpu_context, x, y, numel)`. Any return code other than
/// xpu::Error_t::SUCCESS is raised as an External error.
///
/// @tparam DeviceContext device context type used to obtain the XPU context
/// @tparam T             tensor element type
/// @tparam XPUT          element type the XPU API expects; buffers of T are
///                       reinterpreted as XPUT
template <typename DeviceContext, typename T, typename XPUT>
void xpu_activation_forward(
    const framework::ExecutionContext &ctx,
    std::function<int(xpu::Context *, const XPUT *, XPUT *, int)> func) {
  const auto *x = ctx.Input<Tensor>("X");
  auto *y = ctx.Output<Tensor>("Out");
  const XPUT *x_data = reinterpret_cast<const XPUT *>(x->data<T>());
  XPUT *y_data = reinterpret_cast<XPUT *>(y->mutable_data<T>(ctx.GetPlace()));

  auto xpu_context = ctx.device_context<DeviceContext>().x_context();
  int r = func(xpu_context, x_data, y_data, x->numel());
  PADDLE_ENFORCE_EQ(
      r,
      xpu::Error_t::SUCCESS,
      platform::errors::External("XPU activation op return wrong value[%d %s].",
                                 r,
                                 XPUAPIErrorMsg[r]));
}

/// Runs an element-wise backward activation.
///
/// "X", "Out" and the gradient of "Out" are each optional: when an input is
/// absent, nullptr is passed to `func` in its position. The gradient of "X"
/// is allocated on the context's place and its element count is used as the
/// length argument. Any return code other than xpu::Error_t::SUCCESS is
/// raised as an External error.
///
/// `func` is called as `func(xpu_context, x, y, dy, dx, numel)`.
template <typename DeviceContext, typename T, typename XPUT>
void xpu_activation_backward(
    const framework::ExecutionContext &ctx,
    std::function<int(xpu::Context *,
                      const XPUT *,
                      const XPUT *,
                      const XPUT *,
                      XPUT *,
                      int)> func) {
  /* TODO: relu tanh sigmoid are inplace */
  const auto *x = ctx.Input<Tensor>("X");
  auto *y = ctx.Input<Tensor>("Out");
  auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
  auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
  const XPUT *x_data = nullptr;
  const XPUT *y_data = nullptr;
  const XPUT *y_grad = nullptr;
  if (x != nullptr) x_data = reinterpret_cast<const XPUT *>(x->data<T>());
  if (y != nullptr) y_data = reinterpret_cast<const XPUT *>(y->data<T>());
  if (dOut != nullptr) y_grad = reinterpret_cast<const XPUT *>(dOut->data<T>());
  XPUT *x_grad = reinterpret_cast<XPUT *>(dX->mutable_data<T>(ctx.GetPlace()));
  auto xpu_context = ctx.device_context<DeviceContext>().x_context();

  int r = func(xpu_context, x_data, y_data, y_grad, x_grad, dX->numel());
  PADDLE_ENFORCE_EQ(r,
                    xpu::Error_t::SUCCESS,
                    platform::errors::External(
                        "XPU activation grad op return wrong value[%d %s].",
                        r,
                        XPUAPIErrorMsg[r]));
}

/// abs forward: dispatches to xpu::abs.
template <typename T>
struct XPUAbsFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::abs<XPUType>);
  }
};

/// abs backward: dispatches to xpu::abs_grad.
template <typename T>
struct XPUAbsGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::abs_grad<XPUType>);
  }
};

/// exp forward: dispatches to xpu::exp.
template <typename T>
struct XPUExpFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::exp<XPUType>);
  }
};

/// log forward: dispatches to xpu::log.
template <typename T>
struct XPULogFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::log<XPUType>);
  }
};

/// reciprocal forward: dispatches to xpu::reciprocal.
template <typename T>
struct XPUReciprocalFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::reciprocal<XPUType>);
  }
};

/// reciprocal backward: dispatches to xpu::reciprocal_grad.
template <typename T>
struct XPUReciprocalGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::reciprocal_grad<XPUType>);
  }
};

/// relu backward: dispatches to xpu::relu_grad.
template <typename T>
struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::relu_grad<XPUType>);
  }
};

/// relu6 forward: dispatches to xpu::relu6.
template <typename T>
struct XPURelu6Functor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::relu6<XPUType>);
  }
};

/// relu6 backward: dispatches to xpu::relu6_grad.
template <typename T>
struct XPURelu6GradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::relu6_grad<XPUType>);
  }
};

/// sigmoid forward: dispatches to xpu::sigmoid.
template <typename T>
struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::sigmoid<XPUType>);
  }
};

/// sigmoid backward: dispatches to xpu::sigmoid_grad.
template <typename T>
struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::sigmoid_grad<XPUType>);
  }
};

/// sqrt forward: dispatches to xpu::sqrt.
template <typename T>
struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::sqrt<XPUType>);
  }
};

/// sqrt backward: dispatches to xpu::sqrt_grad.
template <typename T>
struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::sqrt_grad<XPUType>);
  }
};

/// square forward: dispatches to xpu::square.
template <typename T>
struct XPUSquareFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::square<XPUType>);
  }
};

/// square backward: dispatches to xpu::square_grad.
template <typename T>
struct XPUSquareGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::square_grad<XPUType>);
  }
};

/// tanh forward: dispatches to xpu::tanh.
template <typename T>
struct XPUTanhFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::tanh<XPUType>);
  }
};

/// tanh backward: dispatches to xpu::tanh_grad.
template <typename T>
struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::tanh_grad<XPUType>);
  }
};

/// hard_swish forward.
///
/// The XPU call below takes no threshold/scale/offset arguments, so the op is
/// rejected unless the attributes are exactly threshold = 6, scale = 6 and
/// offset = 3.
template <typename T>
struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    float threshold = ctx.Attr<float>("threshold");
    float scale = ctx.Attr<float>("scale");
    float offset = ctx.Attr<float>("offset");
    PADDLE_ENFORCE_EQ(threshold,
                      6.0f,
                      platform::errors::External(
                          "Not support threshold [%f] in XPU", threshold));
    PADDLE_ENFORCE_EQ(
        scale,
        6.0f,
        platform::errors::External("Not support scale [%f] in XPU", scale));
    PADDLE_ENFORCE_EQ(
        offset,
        3.0f,
        platform::errors::External("Not support offset [%f] in XPU", offset));
    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::hard_swish<XPUType>);
  }
};

/// hard_swish backward. Enforces the same attribute restrictions as the
/// forward functor (threshold = 6, scale = 6, offset = 3).
template <typename T>
struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    float threshold = ctx.Attr<float>("threshold");
    float scale = ctx.Attr<float>("scale");
    float offset = ctx.Attr<float>("offset");
    PADDLE_ENFORCE_EQ(threshold,
                      6.0f,
                      platform::errors::External(
                          "Not support threshold [%f] in XPU", threshold));
    PADDLE_ENFORCE_EQ(
        scale,
        6.0f,
        platform::errors::External("Not support scale [%f] in XPU", scale));
    PADDLE_ENFORCE_EQ(
        offset,
        3.0f,
        platform::errors::External("Not support offset [%f] in XPU", offset));
    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
        ctx, xpu::hard_swish_grad<XPUType>);
  }
};

/// leaky_relu forward: calls xpu::leaky_relu with the "alpha" attribute.
template <typename T>
struct XPULeakyReluFunctor : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *y = ctx.Output<Tensor>("Out");
    float alpha = ctx.Attr<float>("alpha");
    const T *x_data = x->data<T>();
    T *y_data = y->mutable_data<T>(ctx.GetPlace());

    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    int r = xpu::leaky_relu(xpu_context, x_data, y_data, x->numel(), alpha);
    PADDLE_ENFORCE_EQ(
        r,
        xpu::Error_t::SUCCESS,
        platform::errors::External(
            "XPU leaky_relu return wrong value[%d %s].", r, XPUAPIErrorMsg[r]));
  }
};

/// leaky_relu backward: calls xpu::leaky_relu_grad with the "alpha"
/// attribute. "Out" is not read; "X" is passed in its place (see below).
template <typename T>
struct XPULeakyReluGradFunctor : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    float alpha = ctx.Attr<float>("alpha");
    const T *x_data = nullptr;
    const T *y_grad = nullptr;
    if (x != nullptr) x_data = x->data<T>();
    if (dOut != nullptr) y_grad = dOut->data<T>();
    T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();

    // The signs of x and y are the same,
    // y == nullptr here,
    // so we give 2 x to the api
    int r = xpu::leaky_relu_grad(xpu_context,
                                 reinterpret_cast<const float *>(x_data),
                                 reinterpret_cast<const float *>(x_data),
                                 reinterpret_cast<const float *>(y_grad),
                                 reinterpret_cast<float *>(x_grad),
                                 dX->numel(),
                                 alpha);
    PADDLE_ENFORCE_EQ(r,
                      xpu::Error_t::SUCCESS,
                      platform::errors::External(
                          "XPU leaky_relu_grad return wrong value[%d %s].",
                          r,
                          XPUAPIErrorMsg[r]));
  }
};

/// log backward, computed as dx = dout * (1 / x) in three XPU calls:
///   1. fill a scratch vector, as long as x's last dimension, with ones;
///   2. broadcast_div: tmp = ones / x (ones broadcast against x's shape);
///   3. broadcast_mul: dx = dout * tmp.
/// Both scratch buffers come from a ctx_guard scoped to this call.
template <typename T>
struct XPULogGradFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    const T *x_data = nullptr;
    const T *y_grad = nullptr;
    if (x != nullptr) x_data = x->data<T>();
    if (dOut != nullptr) y_grad = dOut->data<T>();
    T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
    auto dev_ctx =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    // NOTE(review): x is null-checked above but dereferenced unconditionally
    // from here on, so "X" is effectively a required input of this functor.
    const auto x_dims = x->dims();
    auto xshape = phi::vectorize<int>(x_dims);
    int len = x->dims()[x_dims.size() - 1];
    std::vector<int> yshape(1, len);

    xpu::ctx_guard RAII_GUARD(dev_ctx);
    T *y_data = RAII_GUARD.alloc_l3_or_gm<T>(len);
    PADDLE_ENFORCE_XDNN_NOT_NULL(y_data);
    T *tmp_grad = RAII_GUARD.alloc_l3_or_gm<T>(x->numel());
    PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_grad);
    int r = xpu::constant<T>(dev_ctx, y_data, len, static_cast<T>(1.0));
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");

    // dx.device(d) = dout * (static_cast<T>(1) / x);
    r = xpu::broadcast_div(dev_ctx,
                           reinterpret_cast<const float *>(y_data),
                           reinterpret_cast<const float *>(x_data),
                           reinterpret_cast<float *>(tmp_grad),
                           yshape,
                           xshape);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_div");

    r = xpu::broadcast_mul(dev_ctx,
                           reinterpret_cast<const float *>(y_grad),
                           reinterpret_cast<const float *>(tmp_grad),
                           reinterpret_cast<float *>(x_grad),
                           xshape,
                           xshape);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
  }
};

/// pow forward: out = x ^ factor.
///
/// The scalar "factor" attribute is copied into a one-element device buffer
/// and passed to xpu::broadcast_pow as a shape-{1} operand.
template <typename T>
struct XPUPowFunctor : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *y = ctx.Output<Tensor>("Out");
    // Convert to T before copying: the copy below moves sizeof(T) bytes, so
    // the host-side source must really hold a T, not a float.
    const T pow_factor = static_cast<T>(ctx.Attr<float>("factor"));
    const T *x_data = x->data<T>();
    T *y_data = y->mutable_data<T>(ctx.GetPlace());

    // allocate temp memory for factor on xpu
    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    xpu::ctx_guard RAII_GUARD(xpu_context);
    T *factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
    PADDLE_ENFORCE_NOT_NULL(
        factor_data,
        platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
    memory::Copy(ctx.GetPlace(),
                 static_cast<void *>(factor_data),
                 platform::CPUPlace(),
                 static_cast<const void *>(&pow_factor),
                 sizeof(T));

    // broadcast_pow(Context* ctx, const T* x, const T* y, T* z, const
    //    std::vector<int>& xshape, const std::vector<int>& yshape);
    auto x_dims = phi::vectorize<int>(x->dims());
    int r = xpu::broadcast_pow(
        xpu_context, x_data, factor_data, y_data, x_dims, {1});
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
  }
};

/// pow backward: calls xpu::pow_grad with the "factor" attribute after
/// checking that X, dOut and dX all have identical dims.
template <typename T>
struct XPUPowGradFunctor : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));

    const T *x_data = x->data<T>();
    const T *y_grad = dOut->data<T>();
    T *x_grad = dX->mutable_data<T>(ctx.GetPlace());

    // check dims: all dims should equal
    auto x_dims = phi::vectorize<int>(x->dims());
    auto dy_dims = phi::vectorize<int>(dOut->dims());
    auto dx_dims = phi::vectorize<int>(dX->dims());
    PADDLE_ENFORCE_EQ(
        x_dims,
        dy_dims,
        platform::errors::PreconditionNotMet("x_dims should match dy_dims."));
    PADDLE_ENFORCE_EQ(
        x_dims,
        dx_dims,
        platform::errors::PreconditionNotMet("x_dims should match dx_dims."));
    float pow_factor = ctx.Attr<float>("factor");

    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    // int pow_grad(Context* ctx, const T* x, const T* dy, T* dx, int len, float
    // factor);
    int r = xpu::pow_grad(
        xpu_context, x_data, y_grad, x_grad, x->numel(), pow_factor);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "pow_grad");
  }
};

/// relu forward: calls xpu::relu directly (rather than through
/// xpu_activation_forward) because this call takes two extra trailing
/// arguments, both passed as nullptr.
template <typename T>
struct XPUReluFunctor : public BaseActivationFunctor<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *y = ctx.Output<Tensor>("Out");
    const XPUType *x_data = reinterpret_cast<const XPUType *>(x->data<T>());
    XPUType *y_data =
        reinterpret_cast<XPUType *>(y->mutable_data<T>(ctx.GetPlace()));

    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    int r =
        xpu::relu(xpu_context, x_data, y_data, x->numel(), nullptr, nullptr);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu");
  }
};

/// softplus forward: calls xpu::softplus with the "beta" and "threshold"
/// attributes.
template <typename T>
struct XPUSoftPlusFunctor : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *y = ctx.Output<Tensor>("Out");
    const T *x_data = x->data<T>();
    T *y_data = y->mutable_data<T>(ctx.GetPlace());

    float beta = ctx.Attr<float>("beta");
    float threshold = ctx.Attr<float>("threshold");

    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    int r =
        xpu::softplus(xpu_context, x_data, y_data, x->numel(), beta, threshold);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "softplus");
  }
};

/// softplus backward: calls xpu::softplus_grad with the "beta" and
/// "threshold" attributes. "Out" is not read; x is passed in the y slot.
template <typename T>
struct XPUSoftPlusGradFunctor : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    const T *x_data = x->data<T>();
    const T *y_grad = dOut->data<T>();
    T *x_grad = dX->mutable_data<T>(ctx.GetPlace());

    float beta = ctx.Attr<float>("beta");
    float threshold = ctx.Attr<float>("threshold");

    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    int r = xpu::softplus_grad(
        xpu_context,
        reinterpret_cast<const float *>(x_data),
        reinterpret_cast<const float *>(
            x_data),  // softplus_grad do not need y_data
        reinterpret_cast<const float *>(y_grad),
        reinterpret_cast<float *>(x_grad),
        dX->numel(),
        beta,
        threshold);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "softplus_grad");
  }
};

/// swish forward: calls xpu::swish.
template <typename T>
struct XPUSwishFunctor : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *y = ctx.Output<Tensor>("Out");
    const T *x_data = x->data<T>();
    T *y_data = y->mutable_data<T>(ctx.GetPlace());

    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    // int swish(Context* ctx, const T* x, T* y, int len);
    int r = xpu::swish(xpu_context, x_data, y_data, x->numel());
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish");
  }
};

/// swish backward: calls xpu::swish_grad.
template <typename T>
struct XPUSwishGradFunctor : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
    const auto *x = ctx.Input<Tensor>("X");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    const T *x_data = x->data<T>();
    const T *y_grad = dOut->data<T>();
    T *x_grad = dX->mutable_data<T>(ctx.GetPlace());

    auto xpu_context =
        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
    // int swish_grad(Context* ctx, const T* x, const T* dy, T* dx, int len);
    int r = xpu::swish_grad(xpu_context, x_data, y_grad, x_grad, dX->numel());
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish_grad");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

// Registers the float forward kernel for `act_type` and the float backward
// kernel for `act_type`_grad in one go.
#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor)  \
  REGISTER_OP_XPU_KERNEL(act_type,                                       \
                         ops::XPUActivationKernel<ops::functor<float>>); \
  REGISTER_OP_XPU_KERNEL(                                                \
      act_type##_grad,                                                   \
      ops::XPUActivationGradKernel<ops::grad_functor<float>>);

REGISTER_ACTIVATION_XPU_KERNEL(abs, XPUAbsFunctor, XPUAbsGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(hard_swish,
                               XPUHardSwishFunctor,
                               XPUHardSwishGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu,
                               XPULeakyReluFunctor,
                               XPULeakyReluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(reciprocal,
                               XPUReciprocalFunctor,
                               XPUReciprocalGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sigmoid,
                               XPUSigmoidFunctor,
                               XPUSigmoidGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(softplus,
                               XPUSoftPlusFunctor,
                               XPUSoftPlusGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(swish, XPUSwishFunctor, XPUSwishGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(pow, XPUPowFunctor, XPUPowGradFunctor)

// Ops registered by hand: relu and tanh carry a second element type
// (NOTE(review): restored as paddle::platform::float16 - confirm); relu6,
// exp and log do not fit the forward+backward macro pairing above.
REGISTER_OP_XPU_KERNEL(
    relu,
    ops::XPUActivationKernel<ops::XPUReluFunctor<float>>,
    ops::XPUActivationKernel<ops::XPUReluFunctor<paddle::platform::float16>>);
REGISTER_OP_XPU_KERNEL(
    relu_grad,
    ops::XPUActivationGradKernel<ops::XPUReluGradFunctor<float>>,
    ops::XPUActivationGradKernel<
        ops::XPUReluGradFunctor<paddle::platform::float16>>);
REGISTER_OP_XPU_KERNEL(relu6,
                       ops::XPUActivationKernel<ops::XPURelu6Functor<float>>);
// Uses the Grad wrapper like every other *_grad op in this file; the two
// wrappers have identical bodies, so behaviour is unchanged.
REGISTER_OP_XPU_KERNEL(
    relu6_grad,
    ops::XPUActivationGradKernel<ops::XPURelu6GradFunctor<float>>);
REGISTER_OP_XPU_KERNEL(
    tanh,
    ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
    ops::XPUActivationKernel<ops::XPUTanhFunctor<paddle::platform::float16>>);
REGISTER_OP_XPU_KERNEL(
    tanh_grad,
    ops::XPUActivationGradKernel<ops::XPUTanhGradFunctor<float>>,
    ops::XPUActivationGradKernel<
        ops::XPUTanhGradFunctor<paddle::platform::float16>>);

REGISTER_OP_XPU_KERNEL(exp,
                       ops::XPUActivationKernel<ops::XPUExpFunctor<float>>);
REGISTER_OP_XPU_KERNEL(log,
                       ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
REGISTER_OP_XPU_KERNEL(
    log_grad, ops::XPUActivationGradKernel<ops::XPULogGradFunctor<float>>);

#endif  // PADDLE_WITH_XPU