Unverified Commit 6c5f9aa8 authored by ykkk2333 and committed by GitHub

migrate xpu activation/activation_grad/transpose/transpose_grad/tril_triu/tril_triu_grad kernel to PHI, test=kunlun (#45554)
Parent 530f6b79
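For orientation, here is a condensed before/after sketch of the migration pattern, assembled from code that appears verbatim in the files below; it is illustrative only, not an additional change in this diff:

// Fluid-style XPU activation kernel (first file below):
//   struct XPUAbsFunctor : public BaseActivationFunctor<T> { ... };
//   REGISTER_OP_XPU_KERNEL(abs,
//                          ops::XPUActivationKernel<ops::XPUAbsFunctor<float>>);
//
// Equivalent PHI kernel and registration (abs kernel file below):
namespace phi {
template <typename T, typename Context>
void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
  ctx.template Alloc<T>(out);  // allocate the output on the XPU place
  int r = xpu::abs(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs");
}
}  // namespace phi
PD_REGISTER_KERNEL(abs, XPU, ALL_LAYOUT, phi::AbsKernel, float) {}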
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <string>
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
template <typename Functor>
class XPUActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class XPUActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename DeviceContext, typename T, typename XPUT>
void xpu_activation_forward(
const framework::ExecutionContext &ctx,
std::function<int(xpu::Context *, const XPUT *, XPUT *, int)> func) {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const XPUT *x_data = reinterpret_cast<const XPUT *>(x->data<T>());
XPUT *y_data = reinterpret_cast<XPUT *>(y->mutable_data<T>(ctx.GetPlace()));
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
int r = func(xpu_context, x_data, y_data, x->numel());
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External("XPU activation op return wrong value[%d %s].",
r,
XPUAPIErrorMsg[r]));
}
template <typename DeviceContext, typename T, typename XPUT>
void xpu_activation_backward(
const framework::ExecutionContext &ctx,
std::function<int(
xpu::Context *, const XPUT *, const XPUT *, const XPUT *, XPUT *, int)>
func) {
/* TODO: relu, tanh and sigmoid are in-place */
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Out");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const XPUT *x_data = nullptr;
const XPUT *y_data = nullptr;
const XPUT *y_grad = nullptr;
if (x != nullptr) x_data = reinterpret_cast<const XPUT *>(x->data<T>());
if (y != nullptr) y_data = reinterpret_cast<const XPUT *>(y->data<T>());
if (dOut != nullptr) y_grad = reinterpret_cast<const XPUT *>(dOut->data<T>());
XPUT *x_grad = reinterpret_cast<XPUT *>(dX->mutable_data<T>(ctx.GetPlace()));
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
int r = func(xpu_context, x_data, y_data, y_grad, x_grad, dX->numel());
PADDLE_ENFORCE_EQ(r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"XPU activation grad op return wrong value[%d %s].",
r,
XPUAPIErrorMsg[r]));
}
template <typename T>
struct XPUAbsFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::abs<XPUType>);
}
};
template <typename T>
struct XPUAbsGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::abs_grad<XPUType>);
}
};
template <typename T>
struct XPUExpFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::exp<XPUType>);
}
};
template <typename T>
struct XPULogFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::log<XPUType>);
}
};
template <typename T>
struct XPUReciprocalFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::reciprocal<XPUType>);
}
};
template <typename T>
struct XPUReciprocalGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::reciprocal_grad<XPUType>);
}
};
template <typename T>
struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::relu_grad<XPUType>);
}
};
template <typename T>
struct XPURelu6Functor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::relu6<XPUType>);
}
};
template <typename T>
struct XPURelu6GradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::relu6_grad<XPUType>);
}
};
template <typename T>
struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::sigmoid<XPUType>);
}
};
template <typename T>
struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::sigmoid_grad<XPUType>);
}
};
template <typename T>
struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::sqrt<XPUType>);
}
};
template <typename T>
struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::sqrt_grad<XPUType>);
}
};
template <typename T>
struct XPUSquareFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::square<XPUType>);
}
};
template <typename T>
struct XPUSquareGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::square_grad<XPUType>);
}
};
template <typename T>
struct XPUTanhFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::tanh<XPUType>);
}
};
template <typename T>
struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::tanh_grad<XPUType>);
}
};
template <typename T>
struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
float threshold = ctx.Attr<float>("threshold");
float scale = ctx.Attr<float>("scale");
float offset = ctx.Attr<float>("offset");
PADDLE_ENFORCE_EQ(threshold,
6.0f,
platform::errors::External(
"Not support threshold [%f] in XPU", threshold));
PADDLE_ENFORCE_EQ(
scale,
6.0f,
platform::errors::External("Not support scale [%f] in XPU", scale));
PADDLE_ENFORCE_EQ(
offset,
3.0f,
platform::errors::External("Not support offset [%f] in XPU", offset));
xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::hard_swish<XPUType>);
}
};
template <typename T>
struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
float threshold = ctx.Attr<float>("threshold");
float scale = ctx.Attr<float>("scale");
float offset = ctx.Attr<float>("offset");
PADDLE_ENFORCE_EQ(threshold,
6.0f,
platform::errors::External(
"Not support threshold [%f] in XPU", threshold));
PADDLE_ENFORCE_EQ(
scale,
6.0f,
platform::errors::External("Not support scale [%f] in XPU", scale));
PADDLE_ENFORCE_EQ(
offset,
3.0f,
platform::errors::External("Not support offset [%f] in XPU", offset));
xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
ctx, xpu::hard_swish_grad<XPUType>);
}
};
template <typename T>
struct XPULeakyReluFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
float alpha = ctx.Attr<float>("alpha");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
int r = xpu::leaky_relu(xpu_context, x_data, y_data, x->numel(), alpha);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"XPU leaky_relu return wrong value[%d %s].", r, XPUAPIErrorMsg[r]));
}
};
template <typename T>
struct XPULeakyReluGradFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
float alpha = ctx.Attr<float>("alpha");
const T *x_data = nullptr;
const T *y_grad = nullptr;
if (x != nullptr) x_data = x->data<T>();
if (dOut != nullptr) y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
// x and y have the same sign for leaky_relu, and y == nullptr here,
// so we pass x twice to the API.
int r = xpu::leaky_relu_grad(xpu_context,
reinterpret_cast<const float *>(x_data),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<const float *>(y_grad),
reinterpret_cast<float *>(x_grad),
dX->numel(),
alpha);
PADDLE_ENFORCE_EQ(r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"XPU leaky_relu_grad return wrong value[%d %s].",
r,
XPUAPIErrorMsg[r]));
}
};
template <typename T>
struct XPULogGradFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = nullptr;
const T *y_grad = nullptr;
if (x != nullptr) x_data = x->data<T>();
if (dOut != nullptr) y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
auto dev_ctx =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
const auto x_dims = x->dims();
auto xshape = phi::vectorize<int>(x_dims);
int len = x->dims()[x_dims.size() - 1];
std::vector<int> yshape(1, len);
xpu::ctx_guard RAII_GUARD(dev_ctx);
T *y_data = RAII_GUARD.alloc_l3_or_gm<T>(len);
PADDLE_ENFORCE_XDNN_NOT_NULL(y_data);
T *tmp_grad = RAII_GUARD.alloc_l3_or_gm<T>(x->numel());
PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_grad);
int r = xpu::constant<T>(dev_ctx, y_data, len, static_cast<T>(1.0));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
// dx.device(d) = dout * (static_cast<T>(1) / x);
r = xpu::broadcast_div(dev_ctx,
reinterpret_cast<const float *>(y_data),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<float *>(tmp_grad),
yshape,
xshape);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_div");
r = xpu::broadcast_mul(dev_ctx,
reinterpret_cast<const float *>(y_grad),
reinterpret_cast<const float *>(tmp_grad),
reinterpret_cast<float *>(x_grad),
xshape,
xshape);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
}
};
template <typename T>
struct XPUMishFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
float threshold = ctx.Attr<float>("threshold");
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
int r = xpu::mish(xpu_context, x_data, y_data, x->numel(), threshold);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mish");
}
};
template <typename T>
struct XPUMishGradFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = x->data<T>();
const T *y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
float threshold = ctx.Attr<float>("threshold");
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
int r = xpu::mish_grad(xpu_context,
reinterpret_cast<const float *>(x_data),
reinterpret_cast<const float *>(
x_data), // mish_grad does not need y_data
reinterpret_cast<const float *>(y_grad),
reinterpret_cast<float *>(x_grad),
dX->numel(),
threshold);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mish_grad");
}
};
template <typename T>
struct XPUPowFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
auto pow_factor = ctx.Attr<float>("factor");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
// allocate temp memory for factor on xpu
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
xpu::ctx_guard RAII_GUARD(xpu_context);
T *factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
PADDLE_ENFORCE_NOT_NULL(
factor_data,
platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
memory::Copy(ctx.GetPlace(),
static_cast<void *>(factor_data),
platform::CPUPlace(),
static_cast<void *>(&pow_factor),
sizeof(T));
// broadcast_pow(Context* ctx, const T* x, const T* y, T* z, const
// std::vector<int>& xshape, const std::vector<int>& yshape);
auto x_dims = phi::vectorize<int>(x->dims());
int r = xpu::broadcast_pow(
xpu_context, x_data, factor_data, y_data, x_dims, {1});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
}
};
template <typename T>
struct XPUPowGradFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = x->data<T>();
const T *y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
// check dims: all dims should be equal
auto x_dims = phi::vectorize<int>(x->dims());
auto dy_dims = phi::vectorize<int>(dOut->dims());
auto dx_dims = phi::vectorize<int>(dX->dims());
PADDLE_ENFORCE_EQ(
x_dims,
dy_dims,
platform::errors::PreconditionNotMet("x_dims should match dy_dims."));
PADDLE_ENFORCE_EQ(
x_dims,
dx_dims,
platform::errors::PreconditionNotMet("x_dims should match dx_dims."));
float pow_factor = ctx.Attr<float>("factor");
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
// int pow_grad(Context* ctx, const T* x, const T* dy, T* dx, int len, float
// factor);
int r = xpu::pow_grad(
xpu_context, x_data, y_grad, x_grad, x->numel(), pow_factor);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "pow_grad");
}
};
template <typename T>
struct XPUReluFunctor : public BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const XPUType *x_data = reinterpret_cast<const XPUType *>(x->data<T>());
XPUType *y_data =
reinterpret_cast<XPUType *>(y->mutable_data<T>(ctx.GetPlace()));
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
int r =
xpu::relu(xpu_context, x_data, y_data, x->numel(), nullptr, nullptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu");
}
};
template <typename T>
struct XPUSoftPlusFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
float beta = ctx.Attr<float>("beta");
float threshold = ctx.Attr<float>("threshold");
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
int r =
xpu::softplus(xpu_context, x_data, y_data, x->numel(), beta, threshold);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softplus");
}
};
template <typename T>
struct XPUSoftPlusGradFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = x->data<T>();
const T *y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
float beta = ctx.Attr<float>("beta");
float threshold = ctx.Attr<float>("threshold");
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
int r = xpu::softplus_grad(xpu_context,
reinterpret_cast<const float *>(x_data),
reinterpret_cast<const float *>(
x_data), // softplus_grad does not need y_data
reinterpret_cast<const float *>(y_grad),
reinterpret_cast<float *>(x_grad),
dX->numel(),
beta,
threshold);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softplus_grad");
}
};
template <typename T>
struct XPUSwishFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
// int swish(Context* ctx, const T* x, T* y, int len);
int r = xpu::swish(xpu_context, x_data, y_data, x->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish");
}
};
template <typename T>
struct XPUSwishGradFunctor : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
const auto *x = ctx.Input<Tensor>("X");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = x->data<T>();
const T *y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
auto xpu_context =
ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
// int swish_grad(Context* ctx, const T* x, const T* dy, T* dx, int len);
int r = xpu::swish_grad(xpu_context, x_data, y_grad, x_grad, dX->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish_grad");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_XPU_KERNEL(act_type, \
ops::XPUActivationKernel<ops::functor<float>>); \
REGISTER_OP_XPU_KERNEL( \
act_type##_grad, \
ops::XPUActivationGradKernel<ops::grad_functor<float>>);
REGISTER_ACTIVATION_XPU_KERNEL(abs, XPUAbsFunctor, XPUAbsGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(hard_swish,
XPUHardSwishFunctor,
XPUHardSwishGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu,
XPULeakyReluFunctor,
XPULeakyReluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(mish, XPUMishFunctor, XPUMishGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(reciprocal,
XPUReciprocalFunctor,
XPUReciprocalGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sigmoid,
XPUSigmoidFunctor,
XPUSigmoidGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(softplus,
XPUSoftPlusFunctor,
XPUSoftPlusGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(swish, XPUSwishFunctor, XPUSwishGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(pow, XPUPowFunctor, XPUPowGradFunctor)
REGISTER_OP_XPU_KERNEL(
relu,
ops::XPUActivationKernel<ops::XPUReluFunctor<float>>,
ops::XPUActivationKernel<ops::XPUReluFunctor<paddle::platform::float16>>);
REGISTER_OP_XPU_KERNEL(
relu_grad,
ops::XPUActivationGradKernel<ops::XPUReluGradFunctor<float>>,
ops::XPUActivationGradKernel<
ops::XPUReluGradFunctor<paddle::platform::float16>>);
REGISTER_OP_XPU_KERNEL(relu6,
ops::XPUActivationKernel<ops::XPURelu6Functor<float>>);
REGISTER_OP_XPU_KERNEL(
relu6_grad, ops::XPUActivationGradKernel<ops::XPURelu6GradFunctor<float>>);
REGISTER_OP_XPU_KERNEL(
tanh,
ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
ops::XPUActivationKernel<ops::XPUTanhFunctor<paddle::platform::float16>>);
REGISTER_OP_XPU_KERNEL(
tanh_grad,
ops::XPUActivationGradKernel<ops::XPUTanhGradFunctor<float>>,
ops::XPUActivationGradKernel<
ops::XPUTanhGradFunctor<paddle::platform::float16>>);
REGISTER_OP_XPU_KERNEL(exp,
ops::XPUActivationKernel<ops::XPUExpFunctor<float>>);
REGISTER_OP_XPU_KERNEL(log,
ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
REGISTER_OP_XPU_KERNEL(
log_grad, ops::XPUActivationGradKernel<ops::XPULogGradFunctor<float>>);
#endif // PADDLE_WITH_XPU
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
#include "paddle/phi/kernels/instance_norm_kernel.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class InstanceNormXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto epsilon = ctx.Attr<float>("epsilon");
const auto* x = ctx.Input<Tensor>("X");
const auto* scale = ctx.Input<Tensor>("Scale");
const auto* bias = ctx.Input<Tensor>("Bias");
auto* y = ctx.Output<Tensor>("Y");
auto* mean = ctx.Output<Tensor>("SavedMean");
auto* variance = ctx.Output<Tensor>("SavedVariance");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
// call phi kernel
phi::InstanceNormKernel<T>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x,
*scale,
*bias,
epsilon,
y,
mean,
variance);
}
};
template <typename DeviceContext, typename T>
class InstanceNormGradXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto epsilon = ctx.Attr<float>("epsilon");
const auto* x = ctx.Input<Tensor>("X");
const auto* mean = ctx.Input<Tensor>("SavedMean");
const auto* variance = ctx.Input<Tensor>("SavedVariance");
const auto* scale = ctx.Input<Tensor>("Scale");
const auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dscale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* dbias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto& dev_ctx = ctx.template device_context<DeviceContext>();
// call phi kernel
phi::InstanceNormGradKernel<T>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x,
*dy,
*scale,
*mean,
*variance,
epsilon,
dx,
dbias,
dscale);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
instance_norm,
ops::InstanceNormXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
instance_norm_grad,
ops::InstanceNormGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif // PADDLE_WITH_XPU
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
namespace paddle {
namespace operators {
using framework::Tensor;
template <typename DeviceContext, typename T>
class TransposeXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& context) const override {
auto x = context.Input<framework::Tensor>("X");
auto out = context.Output<framework::Tensor>("Out");
// axis holds the permutation
auto axis = context.Attr<std::vector<int>>("axis");
int ndims = axis.size();
const auto x_dims = x->dims();
const T* x_data = x->data<T>();
T* y_data = out->mutable_data<T>(context.GetPlace());
if (out->numel() == 0) {
return;
}
std::vector<int> x_shape_host(ndims, 0);
for (int i = 0; i < ndims; ++i) {
x_shape_host[i] = x_dims[i];
}
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = xpu::transpose<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x_data),
reinterpret_cast<XPUType*>(y_data),
x_shape_host,
axis);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External("XPU kernel error! error code=%d", r));
}
};
template <typename DeviceContext, typename T>
class TransposeGradXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
if (!x_grad) return;
x_grad->mutable_data<T>(context.GetPlace());
std::vector<int> axis = context.Attr<std::vector<int>>("axis");
std::vector<int> reversed_axis(axis);
for (size_t i = 0; i < axis.size(); i++) {
reversed_axis[axis[i]] = i;
}
int ndims = axis.size();
std::vector<int> out_shape_host(ndims, 0);
for (int i = 0; i < ndims; ++i) {
out_shape_host[i] = out_grad->dims()[i];
}
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = xpu::transpose<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(out_grad->data<T>()),
reinterpret_cast<XPUType*>(x_grad->data<T>()),
out_shape_host,
reversed_axis);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External("XPU kernel error! error code=%d", r));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
transpose,
ops::TransposeXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::TransposeXPUKernel<paddle::platform::XPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
transpose_grad,
ops::TransposeGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::TransposeGradXPUKernel<paddle::platform::XPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
transpose2,
ops::TransposeXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::TransposeXPUKernel<paddle::platform::XPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
transpose2_grad,
ops::TransposeGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::TransposeGradXPUKernel<paddle::platform::XPUDeviceContext,
paddle::platform::float16>);
#endif // PADDLE_WITH_XPU
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class TrilTriuXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::Tensor>("X");
const auto* x_data = x->data<T>();
auto* out = context.Output<framework::Tensor>("Out");
auto* out_data = out->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
auto xshape = phi::vectorize<int>(x->dims());
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = 0;
if (lower) {
r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op");
} else {
r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op");
}
}
};
template <typename DeviceContext, typename T>
class TrilTriuGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* d_out =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
const auto* dout_data = d_out->data<T>();
auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dx_data = d_x->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
auto dy_shape = phi::vectorize<int>(d_out->dims());
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = 0;
if (lower) {
r = xpu::tril(
dev_ctx.x_context(), dout_data, dx_data, dy_shape, diagonal);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op");
} else {
r = xpu::triu(
dev_ctx.x_context(), dout_data, dx_data, dy_shape, diagonal);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op");
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
tril_triu,
ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, int>,
ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
tril_triu_grad,
ops::TrilTriuGradXPUKernel<paddle::platform::XPUDeviceContext, int>,
ops::TrilTriuGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/abs_grad_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void AbsGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& dout,
DenseTensor* dx) {
ctx.template Alloc<T>(dx);
int r = xpu::abs_grad(ctx.x_context(),
x.data<T>(),
dout.data<T>(),
dout.data<T>(),
dx->data<T>(),
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs_grad");
}
} // namespace phi
PD_REGISTER_KERNEL(abs_grad, XPU, ALL_LAYOUT, phi::AbsGradKernel, float) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/abs_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
ctx.template Alloc<T>(out);
int r = xpu::abs(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs");
}
} // namespace phi
PD_REGISTER_KERNEL(abs, XPU, ALL_LAYOUT, phi::AbsKernel, float) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/activation_grad_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
namespace phi {
template <typename T, typename Context, typename Functor>
void ActivationGradXPUImpl(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* d_out,
DenseTensor* d_x,
const Functor& functor) {
PADDLE_ENFORCE_NOT_NULL(
d_out, errors::NotFound("The input DenseTensor dOut can not be nullptr"));
PADDLE_ENFORCE_NOT_NULL(
d_x, errors::NotFound("The output DenseTensor dX can not be nullptr"));
if (!out) {
out = d_out; // fake out
}
dev_ctx.template Alloc<T>(d_x);
functor(dev_ctx, x, out, d_out, d_x);
}
#define DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
DenseTensor* dx) { \
functor_class<T> functor; \
ActivationGradXPUImpl<T, Context, functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \
name, functor_class, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGradXPUImpl<T, Context, functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_XPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationGradXPUImpl<T, Context, functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
DenseTensor* dx) { \
functor_class<T> functor; \
ActivationGradXPUImpl<T, Context, functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
#define DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \
name, functor_class, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGradXPUImpl<T, Context, functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
#define DEFINE_XPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationGradXPUImpl<T, Context, functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
#define DEFINE_XPU_ACTIVATION_GRAD_KERNEL_NODEP(name, functor_class) \
template <typename T, typename Context> \
void name##GradKernel( \
const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { \
functor_class<T> functor; \
ActivationGradXPUImpl<T, Context, functor_class<T>>( \
dev_ctx, nullptr, nullptr, &dout, dx, functor); \
}
template <typename Context, typename T, typename XPUType>
int xpu_activation_backward(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx,
std::function<int(xpu::Context*,
const XPUType*,
const XPUType*,
const XPUType*,
XPUType*,
int)> func) {
/* TODO: relu, tanh and sigmoid are in-place */
const XPUType* x_data = nullptr;
const XPUType* y_data = nullptr;
const XPUType* y_grad = nullptr;
if (x != nullptr) x_data = reinterpret_cast<const XPUType*>(x->data<T>());
if (out != nullptr) y_data = reinterpret_cast<const XPUType*>(out->data<T>());
if (dout != nullptr)
y_grad = reinterpret_cast<const XPUType*>(dout->data<T>());
XPUType* x_grad = reinterpret_cast<XPUType*>(dx->data<T>());
int r =
func(dev_ctx.x_context(), x_data, y_data, y_grad, x_grad, dx->numel());
return r;
}
template <typename T>
struct XPULogGradFunctor : public funcs::BaseActivationFunctor<T> {
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dOut,
DenseTensor* dX) const {
const T* x_data = nullptr;
const T* y_grad = nullptr;
if (x != nullptr) x_data = x->data<T>();
if (dOut != nullptr) y_grad = dOut->data<T>();
T* x_grad = dX->data<T>();
const auto x_dims = x->dims();
auto xshape = vectorize<int>(x_dims);
int len = x->dims()[x_dims.size() - 1];
std::vector<int> yshape(1, len);
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
T* y_data = RAII_GUARD.alloc_l3_or_gm<T>(len);
PADDLE_ENFORCE_XDNN_NOT_NULL(y_data);
T* tmp_grad = RAII_GUARD.alloc_l3_or_gm<T>(x->numel());
PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_grad);
int r =
xpu::constant<T>(dev_ctx.x_context(), y_data, len, static_cast<T>(1.0));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
// dx.device(d) = dout * (static_cast<T>(1) / x);
r = xpu::broadcast_div(dev_ctx.x_context(),
reinterpret_cast<const float*>(y_data),
reinterpret_cast<const float*>(x_data),
reinterpret_cast<float*>(tmp_grad),
yshape,
xshape);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_div");
r = xpu::broadcast_mul(dev_ctx.x_context(),
reinterpret_cast<const float*>(y_grad),
reinterpret_cast<const float*>(tmp_grad),
reinterpret_cast<float*>(x_grad),
xshape,
xshape);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
}
};
template <typename T>
struct XPULeakyReluGradFunctor : public funcs::BaseActivationFunctor<T> {
float alpha;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
const T* x_data = nullptr;
const T* y_grad = nullptr;
if (x != nullptr) x_data = x->data<T>();
if (dout != nullptr) y_grad = dout->data<T>();
T* x_grad = dx->data<T>();
auto xpu_context = dev_ctx.x_context();
// x and y have the same sign for leaky_relu, and y == nullptr here,
// so we pass x twice to the API.
int r = xpu::leaky_relu_grad(xpu_context,
reinterpret_cast<const float*>(x_data),
reinterpret_cast<const float*>(x_data),
reinterpret_cast<const float*>(y_grad),
reinterpret_cast<float*>(x_grad),
dx->numel(),
alpha);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "leaky_relu_grad");
}
};
template <typename T>
struct XPUHardSwishGradFunctor : public funcs::BaseActivationFunctor<T> {
float threshold;
float scale;
float offset;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
using XPUType = typename XPUTypeTrait<T>::Type;
PADDLE_ENFORCE_EQ(
threshold,
6.0f,
errors::External("Not support threshold [%f] in XPU", threshold));
PADDLE_ENFORCE_EQ(
scale, 6.0f, errors::External("Not support scale [%f] in XPU", scale));
PADDLE_ENFORCE_EQ(
offset,
3.0f,
errors::External("Not support offset [%f] in XPU", offset));
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::hard_swish_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_swish_grad");
}
};
template <typename T>
struct XPUReciprocalGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::reciprocal_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal_grad");
}
};
template <typename T>
struct XPUReluGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::relu_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu_grad");
}
};
template <typename T>
struct XPURelu6GradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
float threshold;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::relu6_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu6_grad");
}
};
template <typename T>
struct XPUSigmoidGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::sigmoid_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_grad");
}
};
template <typename T>
struct XPUTanhGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::tanh_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "tanh_grad");
}
};
template <typename T>
struct XPUSquareGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::square_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "square_grad");
}
};
template <typename T>
struct XPUSqrtGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::sqrt_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sqrt_grad");
}
};
template <typename T, typename Context>
void PowGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const Scalar& factor,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
const T* x_data = x.data<T>();
const T* y_grad = dout.data<T>();
T* x_grad = dx->data<T>();
// check dims: all dims should be equal
auto x_dims = vectorize<int>(x.dims());
auto dy_dims = vectorize<int>(dout.dims());
auto dx_dims = vectorize<int>(dx->dims());
PADDLE_ENFORCE_EQ(x_dims,
dy_dims,
errors::PreconditionNotMet("x_dims should match dy_dims."));
PADDLE_ENFORCE_EQ(x_dims,
dx_dims,
errors::PreconditionNotMet("x_dims should match dx_dims."));
float pow_factor = factor.to<float>();
auto xpu_context = dev_ctx.x_context();
// int pow_grad(Context* ctx, const T* x, const T* dy, T* dx, int len, float
// factor);
int r =
xpu::pow_grad(xpu_context, x_data, y_grad, x_grad, x.numel(), pow_factor);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "pow_grad");
}
template <typename T>
struct XPUSwishGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
float beta;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
const XPUType* x_data = reinterpret_cast<const XPUType*>(x->data<T>());
const XPUType* y_grad = reinterpret_cast<const XPUType*>(dout->data<T>());
XPUType* x_grad = reinterpret_cast<XPUType*>(dx->data<T>());
auto xpu_context = dev_ctx.x_context();
int r = xpu::swish_grad(xpu_context, x_data, y_grad, x_grad, dx->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish_grad");
}
};
template <typename T>
struct XPUMishGradFunctor : public funcs::BaseActivationFunctor<T> {
float threshold;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
const T* x_data = x->data<T>();
const T* y_grad = dout->data<T>();
T* x_grad = dx->data<T>();
auto xpu_context = dev_ctx.x_context();
int r = xpu::mish_grad(
xpu_context,
reinterpret_cast<const float*>(x_data),
reinterpret_cast<const float*>(x_data), // mish_grad does not need y_data
reinterpret_cast<const float*>(y_grad),
reinterpret_cast<float*>(x_grad),
dx->numel(),
threshold);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mish_grad");
}
};
template <typename T>
struct XPUSoftPlusGradFunctor : public funcs::BaseActivationFunctor<T> {
float beta;
float threshold;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}, {"threshold", &threshold}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dOut,
DenseTensor* dX) const {
const T* x_data = x->data<T>();
const T* y_grad = dOut->data<T>();
T* x_grad = dX->data<T>();
auto xpu_context = dev_ctx.x_context();
int r = xpu::softplus_grad(xpu_context,
reinterpret_cast<const float*>(x_data),
reinterpret_cast<const float*>(
x_data), // softplus_grad does not need y_data
reinterpret_cast<const float*>(y_grad),
reinterpret_cast<float*>(x_grad),
dX->numel(),
beta,
threshold);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softplus_grad");
}
};
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, XPUReciprocalGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, XPUSigmoidGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, XPUSqrtGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, XPUTanhGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, XPUReluGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, XPULogGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, XPUSquareGradFunctor);
DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
XPUSwishGradFunctor,
beta);
DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
XPUMishGradFunctor,
threshold);
DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
XPULeakyReluGradFunctor,
alpha);
DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
XPURelu6GradFunctor,
threshold);
DEFINE_XPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
XPUSoftPlusGradFunctor,
beta,
threshold)
template <typename T, typename Context>
void HardSwishGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float threshold,
float scale,
float offset,
DenseTensor* dx) {
XPUHardSwishGradFunctor<T> functor;
auto attrs = functor.GetAttrs();
*(attrs[0].second) = threshold;
*(attrs[1].second) = scale;
*(attrs[2].second) = offset;
ActivationGradXPUImpl<T, Context, XPUHardSwishGradFunctor<T>>(
dev_ctx, &x, nullptr, &dout, dx, functor);
}
} // namespace phi
PD_REGISTER_KERNEL(relu_grad,
XPU,
ALL_LAYOUT,
phi::ReluGradKernel,
float,
phi::dtype::float16) {}
#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, XPU, ALL_LAYOUT, phi::func, float) {}
PD_REGISTER_KERNEL(tanh_grad,
XPU,
ALL_LAYOUT,
phi::TanhGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_swish_grad, HardSwishGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(square_grad, SquareGradKernel)
PD_REGISTER_KERNEL(pow_grad, XPU, ALL_LAYOUT, phi::PowGradKernel, float) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
template <typename T, typename Context, typename Functor>
void ActivationXPUImpl(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
const Functor& functor) {
PADDLE_ENFORCE_NOT_NULL(out,
errors::NotFound("Output Out should not be nullptr"));
dev_ctx.template Alloc<T>(out);
functor(dev_ctx, x, out);
}
#define DEFINE_XPU_ACTIVATION_KERNEL(name, functor_class) \
template <typename T, typename Context> \
void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \
functor_class<T> functor; \
ActivationXPUImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
}
#define DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr, \
DenseTensor* out) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationXPUImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
}
#define DEFINE_XPU_ACTIVATION_KERNEL_WITH_TWO_ATTRS( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr1, \
float attr2, \
DenseTensor* out) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationXPUImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
}
template <typename Context, typename T, typename XPUType>
int xpu_activation_func(
const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
std::function<int(xpu::Context*, const XPUType*, XPUType*, int)> func) {
int r = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x.numel());
return r;
}
template <typename Context, typename T, typename XPUType>
int xpu_activation_1attr_func(
const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
float attr,
std::function<int(xpu::Context*, const XPUType*, XPUType*, int, float)>
func) {
int r = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x.numel(),
attr);
return r;
}
template <typename Context, typename T, typename XPUType>
int xpu_activation_2attr_func(
const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
float attr1,
float attr2,
std::function<
int(xpu::Context*, const XPUType*, XPUType*, int, float, float)> func) {
int r = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x.numel(),
attr1,
attr2);
return r;
}
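// These three helpers differ only in how many trailing float attributes they
// forward to the underlying xdnn call; each returns the raw xdnn status code,
// which the functors below check with PADDLE_ENFORCE_XDNN_SUCCESS.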
template <typename T>
struct XPUExpFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::exp<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp");
}
};
template <typename T>
struct XPULogFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::log<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "log");
}
};
template <typename T>
struct XPULeakyReluFunctor : public funcs::BaseActivationFunctor<T> {
float alpha;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
using XPUType = typename XPUTypeTrait<T>::Type;
int r = xpu_activation_1attr_func<Context, T, XPUType>(
dev_ctx, x, out, alpha, xpu::leaky_relu<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "leaky_relu");
}
};
template <typename T, typename Context>
void PowKernel(const Context& dev_ctx,
const DenseTensor& x,
const Scalar& factor,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
float pow_factor = factor.to<float>();
const T* x_data = x.data<T>();
T* y_data = out->data<T>();
auto xpu_context = dev_ctx.x_context();
// allocate temp memory for factor on xpu
xpu::ctx_guard RAII_GUARD(xpu_context);
T* factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
PADDLE_ENFORCE_NOT_NULL(
factor_data, errors::External("XPU alloc_l3_or_gm returns nullptr"));
paddle::memory::Copy(dev_ctx.GetPlace(),
static_cast<void*>(factor_data),
phi::CPUPlace(),
static_cast<void*>(&pow_factor),
sizeof(T));
// broadcast_pow(Context* ctx, const T* x, const T* y, T* z, const
// std::vector<int>& xshape, const std::vector<int>& yshape);
auto x_dims = vectorize<int>(x.dims());
int r =
xpu::broadcast_pow(xpu_context, x_data, factor_data, y_data, x_dims, {1});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
}
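// Note: pow is registered below for float only, so copying sizeof(T) bytes of
// the float factor into factor_data is consistent. Conceptually the call above
// computes out[i] = pow(x[i], factor), with the scalar factor broadcast from a
// length-1 device buffer (shape {1}) against x's shape.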
template <typename T>
struct XPUHardSwishFunctor : public funcs::BaseActivationFunctor<T> {
float threshold;
float scale;
float offset;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
using XPUType = typename XPUTypeTrait<T>::Type;
    PADDLE_ENFORCE_EQ(
        threshold,
        6.0f,
        errors::External("Unsupported threshold [%f] on XPU", threshold));
    PADDLE_ENFORCE_EQ(
        scale, 6.0f, errors::External("Unsupported scale [%f] on XPU", scale));
    PADDLE_ENFORCE_EQ(
        offset,
        3.0f,
        errors::External("Unsupported offset [%f] on XPU", offset));
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::hard_swish<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_swish");
}
};
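// With the constants enforced above (threshold = 6, scale = 6, offset = 3) this
// is the canonical hard_swish, i.e. out = x * clip(x + 3, 0, 6) / 6, which is
// what xpu::hard_swish is assumed to compute here.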
template <typename T>
struct XPUReciprocalFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::reciprocal<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal");
}
};
template <typename T>
struct XPUReluFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
const XPUType* x_data = reinterpret_cast<const XPUType*>(x.data<T>());
XPUType* y_data = reinterpret_cast<XPUType*>(out->data<T>());
auto xpu_context = dev_ctx.x_context();
int r = xpu::relu(xpu_context, x_data, y_data, x.numel(), nullptr, nullptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu");
}
};
template <typename T>
struct XPURelu6Functor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
float threshold;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::relu6<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu6");
}
};
template <typename T>
struct XPUSigmoidFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::sigmoid<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid");
}
};
template <typename T>
struct XPUSquareFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::square<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "square");
}
};
template <typename T>
struct XPUSqrtFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::sqrt<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sqrt");
}
};
template <typename T>
struct XPUMishFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
float threshold;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_1attr_func<Context, T, XPUType>(
dev_ctx, x, out, threshold, xpu::mish<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mish");
}
};
template <typename T, typename Context>
void SwishKernel(const Context& dev_ctx,
const DenseTensor& x,
float beta,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
dev_ctx.template Alloc<T>(out);
int r = xpu::swish(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish");
}
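// Note: beta is accepted to match the op signature but is not forwarded; the
// xpu::swish call above is used as a plain (ctx, x, y, len) element-wise op.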
template <typename T>
struct XPUSoftplusFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
float beta;
float threshold;
typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}, {"threshold", &threshold}};
}
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_2attr_func<Context, T, XPUType>(
dev_ctx, x, out, beta, threshold, xpu::softplus<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softplus");
}
};
template <typename T>
struct XPUTanhFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int r = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::tanh<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "tanh");
}
};
DEFINE_XPU_ACTIVATION_KERNEL(Exp, XPUExpFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Log, XPULogFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Reciprocal, XPUReciprocalFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Relu, XPUReluFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Sigmoid, XPUSigmoidFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Square, XPUSquareFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Sqrt, XPUSqrtFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Tanh, XPUTanhFunctor)
DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Mish, XPUMishFunctor, threshold)
DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu,
XPULeakyReluFunctor,
alpha)
DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6, XPURelu6Functor, threshold)
DEFINE_XPU_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus,
XPUSoftplusFunctor,
beta,
threshold)
template <typename T, typename Context>
void HardSwishKernel(const Context& dev_ctx,
const DenseTensor& x,
float threshold,
float scale,
float offset,
DenseTensor* out) {
XPUHardSwishFunctor<T> functor;
auto attrs = functor.GetAttrs();
*(attrs[0].second) = threshold;
*(attrs[1].second) = scale;
*(attrs[2].second) = offset;
ActivationXPUImpl<T, Context, XPUHardSwishFunctor<T>>(
dev_ctx, x, out, functor);
}
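// HardSwish carries three attributes, so it is not covered by the one/two-attr
// macros above; the attribute pointers from GetAttrs() are wired up by hand
// before dispatching through ActivationXPUImpl.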
} // namespace phi
PD_REGISTER_KERNEL(
relu, XPU, ALL_LAYOUT, phi::ReluKernel, float, phi::dtype::float16) {}
#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, XPU, ALL_LAYOUT, phi::func, float) {}
PD_REGISTER_KERNEL(
tanh, XPU, ALL_LAYOUT, phi::TanhKernel, float, phi::dtype::float16) {}
PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel) // no grad
PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_swish, HardSwishKernel)
PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
PD_REGISTER_ACTIVATION_KERNEL(pow, PowKernel)
PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel)
PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
PD_REGISTER_ACTIVATION_KERNEL(square, SquareKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/transpose_grad_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void TransposeGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const std::vector<int>& axis,
DenseTensor* x_grad) {
using XPUType = typename XPUTypeTrait<T>::Type;
dev_ctx.template Alloc<T>(x_grad);
std::vector<int> reversed_axis(axis);
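  // Build the inverse permutation: the forward transpose maps x[i] to
  // out[axis[i]], so reversed_axis[axis[i]] = i transposes out_grad back.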
for (size_t i = 0; i < axis.size(); i++) {
reversed_axis[axis[i]] = i;
}
int ndims = axis.size();
std::vector<int> out_shape_host(ndims, 0);
for (int i = 0; i < ndims; ++i) {
out_shape_host[i] = out_grad.dims()[i];
}
int r = xpu::transpose<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(out_grad.data<T>()),
reinterpret_cast<XPUType*>(x_grad->data<T>()),
out_shape_host,
reversed_axis);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose_grad");
}
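// Worked example (sketch): for a forward axis of {1, 2, 0} the loop above gives
// reversed_axis = {2, 0, 1}; applying xpu::transpose with that permutation to
// out_grad restores the original layout of x for x_grad.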
} // namespace phi
PD_REGISTER_KERNEL(transpose_grad,
XPU,
ALL_LAYOUT,
phi::TransposeGradKernel,
float,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/transpose_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void TransposeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
if (out->numel() == 0) {
return;
}
dev_ctx.template Alloc<T>(out);
int ndims = axis.size();
std::vector<int> x_shape_host(ndims, 0);
for (int i = 0; i < ndims; ++i) {
x_shape_host[i] = x.dims()[i];
}
int r = xpu::transpose<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x_shape_host,
axis);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
}
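// Sketch of the behaviour: a 2 x 3 input with axis = {1, 0} yields a 3 x 2
// output with out(j, i) = x(i, j); empty outputs return early above before any
// device work is issued.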
} // namespace phi
PD_REGISTER_KERNEL(transpose,
XPU,
ALL_LAYOUT,
phi::TransposeKernel,
float,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/tril_triu_grad_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuGradKernel(const Context& ctx,
const DenseTensor& out_grad,
int diagonal,
bool lower,
DenseTensor* x_grad) {
using XPUType = typename XPUTypeTrait<T>::Type;
ctx.template Alloc<T>(x_grad);
auto dy_shape = vectorize<int>(out_grad.dims());
int r = 0;
if (lower) {
r = xpu::tril(ctx.x_context(),
reinterpret_cast<const XPUType*>(out_grad.data<T>()),
reinterpret_cast<XPUType*>(x_grad->data<T>()),
dy_shape,
diagonal);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op");
} else {
r = xpu::triu(ctx.x_context(),
reinterpret_cast<const XPUType*>(out_grad.data<T>()),
reinterpret_cast<XPUType*>(x_grad->data<T>()),
dy_shape,
diagonal);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op");
}
}
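// Since tril/triu merely selects entries, the gradient is the same selection
// applied to out_grad: entries masked out in the forward pass receive a zero
// gradient, which the xpu::tril / xpu::triu calls above produce by zeroing them.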
} // namespace phi
PD_REGISTER_KERNEL(
tril_triu_grad, XPU, ALL_LAYOUT, phi::TrilTriuGradKernel, int, float) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/tril_triu_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void TrilTriuKernel(const Context& ctx,
const DenseTensor& x,
int diagonal,
bool lower,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
ctx.template Alloc<T>(out);
auto xshape = vectorize<int>(x.dims());
int r = 0;
if (lower) {
r = xpu::tril(ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
xshape,
diagonal);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op");
} else {
r = xpu::triu(ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
xshape,
diagonal);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op");
}
}
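// Semantics (sketch): with lower = true the output keeps x on and below the
// `diagonal`-th diagonal and zeroes the rest (diagonal = 0 is the main
// diagonal, > 0 moves above it, < 0 below it); lower = false keeps the upper
// triangle instead.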
} // namespace phi
PD_REGISTER_KERNEL(
tril_triu, XPU, ALL_LAYOUT, phi::TrilTriuKernel, int, float) {}