diff --git a/paddle/fluid/operators/one_hot_v2_op_xpu.cc b/paddle/fluid/operators/one_hot_v2_op_xpu.cc
deleted file mode 100644
index 1d750dfbc131885805c2c2b9d683a7956e1755c7..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/one_hot_v2_op_xpu.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifdef PADDLE_WITH_XPU
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/one_hot_op.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class OneHotV2XPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int depth = context.Attr<int>("depth");
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<Tensor>("depth_tensor");
-      auto* depth_data = depth_tensor->data<int32_t>();
-      if (platform::is_xpu_place(depth_tensor->place())) {
-        xpu_memcpy(static_cast<void*>(&depth),
-                   static_cast<const void*>(depth_data),
-                   sizeof(int32_t),
-                   XPU_DEVICE_TO_HOST);
-      } else {
-        depth = depth_data[0];
-      }
-      auto out_dims = out->dims();
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    }
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    int len = in->numel();
-    int ret = xpu::one_hot<T>(dev_ctx.x_context(),
-                              in->data<T>(),
-                              out->mutable_data<float>(context.GetPlace()),
-                              len,
-                              depth,
-                              1.0,
-                              0.0);
-
-    PADDLE_ENFORCE_EQ(ret,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU one_hot kernel return wrong value[%d %s]",
-                          ret,
-                          XPUAPIErrorMsg[ret]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    one_hot_v2,
-    ops::OneHotV2XPUKernel<paddle::platform::XPUDeviceContext, int>,
-    ops::OneHotV2XPUKernel<paddle::platform::XPUDeviceContext, int64_t>);
-#endif
diff --git a/paddle/fluid/operators/p_norm_op_xpu.cc b/paddle/fluid/operators/p_norm_op_xpu.cc
deleted file mode 100644
index 0d2bb42790381a5f6e7bd376b47b16cfd1f313db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/p_norm_op_xpu.cc
+++ /dev/null
@@ -1,355 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
-#include "paddle/fluid/platform/device/device_wrapper.h"
-#include "paddle/fluid/platform/device/xpu/xpu_header.h"
-
-namespace paddle {
-namespace operators {
-
-inline void GetDims(
-    const phi::DDim& dim, int axis, int* m, int* t, int* n, bool asvector) {
-  *m = 1;
-  *n = 1;
-  *t = dim[axis];
-  if (asvector) {
-    *t = product(dim);
-  } else {
-    for (int i = 0; i < axis; ++i) {
-      (*m) *= dim[i];
-    }
-    for (int i = axis + 1; i < dim.size(); ++i) {
-      (*n) *= dim[i];
-    }
-  }
-}
-
-using Tensor = framework::Tensor;
-template <typename DeviceContext, typename T>
-class P_NormXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    float porder = ctx.Attr<float>("porder");
-    int axis = ctx.Attr<int>("axis");
-    bool asvector = ctx.Attr<bool>("asvector");
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto xdim = in->dims();
-    if (axis < 0) axis = xdim.size() + axis;
-    std::vector<int> r_dim;
-    std::vector<int> x_dim;
-    std::vector<int> y_dim;
-    int m = 1;
-    int n = 1;
-    int t = 1;
-    GetDims(xdim, axis, &m, &t, &n, asvector);
-    x_dim.push_back(m);
-    x_dim.push_back(t);
-    x_dim.push_back(n);
-
-    r_dim.push_back(1);
-
-    y_dim.push_back(m);
-    y_dim.push_back(n);
-
-    int r = 0;
-
-    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-    XPUType* tmp_x = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
-    PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_x);
-    r = xpu::abs(dev_ctx.x_context(),
-                 reinterpret_cast<const XPUType*>(in->data<T>()),
-                 tmp_x,
-                 m * t * n);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs");
-    if (porder == INFINITY) {
-      r = xpu::reduce_max(dev_ctx.x_context(),
-                          tmp_x,
-                          reinterpret_cast<XPUType*>(out->data<T>()),
-                          x_dim,
-                          r_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_max");
-    } else if (porder == -INFINITY) {
-      r = xpu::reduce_min(dev_ctx.x_context(),
-                          tmp_x,
-                          reinterpret_cast<XPUType*>(out->data<T>()),
-                          x_dim,
-                          r_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_min");
-    } else if (porder == 0) {
-      XPUType* zeros = RAII_GUARD.alloc_l3_or_gm<XPUType>(1);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(zeros);
-      r = xpu::constant(dev_ctx.x_context(), zeros, 1, 0.0f);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
-      std::vector<int> zeros_dim(1, 1);
-
-      bool* tmp2_x = RAII_GUARD.alloc_l3_or_gm<bool>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(tmp2_x);
-
-      r = xpu::broadcast_not_equal(
-          dev_ctx.x_context(), tmp_x, zeros, tmp2_x, x_dim, zeros_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_not_equal");
-
-      XPUType* x_mid = tmp_x;
-
-      r = xpu::cast<bool, XPUType>(
-          dev_ctx.x_context(), tmp2_x, x_mid, m * t * n);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
-
-      r = xpu::reduce_sum(dev_ctx.x_context(),
-                          x_mid,
-                          reinterpret_cast<XPUType*>(out->data<T>()),
-                          x_dim,
-                          r_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
-
-    } else {
-      Tensor porder_tensor;
-      framework::DDim pdim = phi::make_ddim({1});
-      porder_tensor.mutable_data<float>(pdim, in->place());
-      r = xpu::constant(
-          dev_ctx.x_context(), porder_tensor.data<float>(), 1, porder);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
-      std::vector<int> p_dim(1, 1);
-
-      XPUType* tmp2_x = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(tmp2_x);
-      r = xpu::broadcast_pow(
-          dev_ctx.x_context(),
-          reinterpret_cast<const float*>(tmp_x),
-          reinterpret_cast<const float*>(porder_tensor.data<float>()),
-          tmp2_x,
-          x_dim,
-          p_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
-
-      XPUType* tmp_y = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_y);
-
-      r = xpu::reduce_sum(dev_ctx.x_context(),
-                          reinterpret_cast<const float*>(tmp2_x),
-                          tmp_y,
-                          x_dim,
-                          r_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
-
-      r = xpu::constant(
-          dev_ctx.x_context(), porder_tensor.data<float>(), 1, 1.0f / porder);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
-
-      r = xpu::broadcast_pow(
-          dev_ctx.x_context(),
-          reinterpret_cast<const float*>(tmp_y),
-          reinterpret_cast<const float*>(porder_tensor.data<float>()),
-          reinterpret_cast<float*>(out->data<T>()),
-          y_dim,
-          p_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
-      dev_ctx.Wait();
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class P_NormGradXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Out");
-    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    dx->mutable_data<T>(ctx.GetPlace());
-    auto xdim = x->dims();
-    float porder = ctx.Attr<float>("porder");
-    bool asvector = ctx.Attr<bool>("asvector");
-    int axis = ctx.Attr<int>("axis");
-    axis = axis < 0 ? xdim.size() + axis : axis;
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    int m, t, n;
-    GetDims(xdim, axis, &m, &t, &n, asvector);
-
-    std::vector<int> r_dim;
-    std::vector<int> x_dim;
-    std::vector<int> y_dim;
-
-    x_dim.push_back(m);
-    x_dim.push_back(t);
-    x_dim.push_back(n);
-
-    y_dim.push_back(m);
-    y_dim.push_back(1);
-    y_dim.push_back(n);
-
-    int r = 0;
-    if (porder == 0) {
-      r = xpu::constant(dev_ctx.x_context(),
-                        reinterpret_cast<XPUType*>(dx->data<T>()),
-                        m * t * n,
-                        static_cast<XPUType>(0));
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
-    } else if (porder == INFINITY || porder == -INFINITY) {
-      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-      XPUType* x_abs = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(x_abs);
-      r = xpu::abs(dev_ctx.x_context(),
-                   reinterpret_cast<const XPUType*>(x->data<T>()),
-                   x_abs,
-                   m * t * n);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs");
-
-      bool* dx_t = RAII_GUARD.alloc_l3_or_gm<bool>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(dx_t);
-
-      XPUType* dx_mid = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(dx_mid);
-
-      r = xpu::broadcast_equal<XPUType>(
-          dev_ctx.x_context(),
-          reinterpret_cast<const XPUType*>(x_abs),
-          reinterpret_cast<const XPUType*>(y->data<T>()),
-          dx_t,
-          x_dim,
-          y_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_equal");
-
-      r = xpu::cast<bool, XPUType>(
-          dev_ctx.x_context(), dx_t, dx_mid, m * t * n);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
-
-      XPUType* x_sign = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(x_sign);
-      r = xpu::sign(dev_ctx.x_context(),
-                    reinterpret_cast<const XPUType*>(x->data<T>()),
-                    x_sign,
-                    m * t * n);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign");
-
-      XPUType* dx_pre_dy = x_abs;
-      r = xpu::mul(dev_ctx.x_context(),
-                   reinterpret_cast<const XPUType*>(dx_mid),
-                   reinterpret_cast<const XPUType*>(x_sign),
-                   dx_pre_dy,
-                   m * t * n);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul");
-
-      r = xpu::broadcast_mul(dev_ctx.x_context(),
-                             dx_pre_dy,
-                             reinterpret_cast<const XPUType*>(dy->data<T>()),
-                             reinterpret_cast<XPUType*>(dx->data<T>()),
-                             x_dim,
-                             y_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
-
-    } else {
-      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-      XPUType* x_abs = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(x_abs);
-      r = xpu::abs(dev_ctx.x_context(),
-                   reinterpret_cast<const XPUType*>(x->data<T>()),
-                   x_abs,
-                   m * t * n);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs");
-
-      Tensor porder_tensor;
-      framework::DDim pdim = phi::make_ddim({1});
-      porder_tensor.mutable_data<float>(pdim, x->place());
-      r = xpu::constant(
-          dev_ctx.x_context(), porder_tensor.data<float>(), 1, porder - 1.0f);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
-      std::vector<int> p_dim(1, 1);
-
-      XPUType* x_pow = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(x_pow);
-      r = xpu::broadcast_pow(
-          dev_ctx.x_context(),
-          reinterpret_cast<const float*>(x_abs),
-          reinterpret_cast<const float*>(porder_tensor.data<float>()),
-          x_pow,
-          x_dim,
-          p_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
-
-      XPUType* y_pow = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(y_pow);
-      r = xpu::broadcast_pow(
-          dev_ctx.x_context(),
-          reinterpret_cast<const float*>(y->data<T>()),
-          reinterpret_cast<const float*>(porder_tensor.data<float>()),
-          y_pow,
-          y_dim,
-          p_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
-      dev_ctx.Wait();
-
-      XPUType* dx_t = x_abs;
-
-      r = xpu::broadcast_div(
-          dev_ctx.x_context(), x_pow, y_pow, dx_t, x_dim, y_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_div");
-
-      XPUType* x_sign = x_pow;
-      r = xpu::sign(dev_ctx.x_context(),
-                    reinterpret_cast<const XPUType*>(x->data<T>()),
-                    x_sign,
-                    m * t * n);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign");
-
-      XPUType* dx_mid = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(dx_mid);
-
-      r = xpu::broadcast_mul(dev_ctx.x_context(),
-                             reinterpret_cast<const XPUType*>(x_sign),
-                             reinterpret_cast<const XPUType*>(dy->data<T>()),
-                             dx_mid,
-                             x_dim,
-                             y_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
-
-      r = xpu::broadcast_mul(dev_ctx.x_context(),
-                             reinterpret_cast<const XPUType*>(dx_t),
-                             reinterpret_cast<const XPUType*>(dx_mid),
-                             reinterpret_cast<XPUType*>(dx->data<T>()),
-                             x_dim,
-                             x_dim);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    p_norm, ops::P_NormXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(
-    p_norm_grad,
-    ops::P_NormGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
-
-#endif
diff --git a/paddle/phi/kernels/one_hot_kernel.cc b/paddle/phi/kernels/one_hot_kernel.cc
index 755e06752509a4d091ad95b9c0eaefe0998fa6d9..fb5e121676c1ff087f251ec4d1f27bb737d87c07 100644
--- a/paddle/phi/kernels/one_hot_kernel.cc
+++ b/paddle/phi/kernels/one_hot_kernel.cc
@@ -35,3 +35,7 @@ PD_REGISTER_KERNEL(one_hot, CPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_REGISTER_KERNEL(one_hot, GPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
 #endif
+
+#ifdef PADDLE_WITH_XPU
+PD_REGISTER_KERNEL(one_hot, XPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
+#endif
diff --git a/paddle/phi/kernels/xpu/one_hot_kernel.cc b/paddle/phi/kernels/xpu/one_hot_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a19643cac51001b952a7b0ed9d61a0fa269d49d9
--- /dev/null
+++ b/paddle/phi/kernels/xpu/one_hot_kernel.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/one_hot_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+
+namespace phi {
+template <typename Context, typename InT>
+struct OneHotV2OpFunctor {
+  const DenseTensor* in_;
+  DenseTensor* out_;
+  int depth_;
+  const Context& ctx_;
+
+  OneHotV2OpFunctor(const DenseTensor* in,
+                    DenseTensor* out,
+                    int depth,
+                    const Context& ctx)
+      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
+
+  template <typename OutT>
+  void apply() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = ctx_.template Alloc<float>(out_);
+    int r = xpu::one_hot<InT>(
+        ctx_.x_context(), p_in_data, p_out_data, numel, depth_, 1.0, 0.0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "one_hot");
+  }
+};
+
+template <typename T, typename Context>
+void OneHotRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const Scalar& depth,
+                     DataType dtype,
+                     bool allow_out_of_range,
+                     DenseTensor* out) {
+  auto depth_v = depth.to<int>();
+  auto out_dims = out->dims();
+  if (out_dims[out_dims.size() - 1] == -1) {
+    out_dims[out_dims.size() - 1] = depth_v;
+    out->Resize(out_dims);
+  }
+  phi::VisitDataType(dtype,
+                     OneHotV2OpFunctor<Context, T>(&x, out, depth_v, dev_ctx));
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    one_hot_raw, XPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/xpu/p_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/p_norm_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..883e3262a64876525546494e45b5441df98aec4a
--- /dev/null
+++ b/paddle/phi/kernels/xpu/p_norm_grad_kernel.cc
@@ -0,0 +1,202 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/p_norm_grad_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+inline void GetDims(
+    const phi::DDim& dim, int axis, int* m, int* t, int* n, bool asvector) {
+  *m = 1;
+  *n = 1;
+  *t = dim[axis];
+  if (asvector) {
+    *t = product(dim);
+  } else {
+    for (int i = 0; i < axis; ++i) {
+      (*m) *= dim[i];
+    }
+    for (int i = axis + 1; i < dim.size(); ++i) {
+      (*n) *= dim[i];
+    }
+  }
+}
+template <typename T, typename Context>
+void PNormGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& out,
+                     const DenseTensor& out_grad,
+                     float porder,
+                     int axis,
+                     float epsilon,
+                     bool keepdim,
+                     bool asvector,
+                     DenseTensor* x_grad) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  dev_ctx.template Alloc<T>(x_grad);
+  auto xdim = x.dims();
+  axis = axis < 0 ? xdim.size() + axis : axis;
+  int m, t, n;
+  GetDims(xdim, axis, &m, &t, &n, asvector);
+
+  std::vector<int> r_dim;
+  std::vector<int> x_dim;
+  std::vector<int> y_dim;
+
+  x_dim.push_back(m);
+  x_dim.push_back(t);
+  x_dim.push_back(n);
+
+  y_dim.push_back(m);
+  y_dim.push_back(1);
+  y_dim.push_back(n);
+
+  int r = 0;
+  if (porder == 0) {
+    r = xpu::constant(dev_ctx.x_context(),
+                      reinterpret_cast<XPUType*>(x_grad->data<T>()),
+                      m * t * n,
+                      static_cast<XPUType>(0));
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+  } else if (porder == INFINITY || porder == -INFINITY) {
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    XPUType* x_abs = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(x_abs);
+    r = xpu::abs(dev_ctx.x_context(),
+                 reinterpret_cast<const XPUType*>(x.data<T>()),
+                 x_abs,
+                 m * t * n);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs");
+
+    bool* dx_t = RAII_GUARD.alloc_l3_or_gm<bool>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(dx_t);
+
+    XPUType* dx_mid = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(dx_mid);
+
+    r = xpu::broadcast_equal<XPUType>(
+        dev_ctx.x_context(),
+        reinterpret_cast<const XPUType*>(x_abs),
+        reinterpret_cast<const XPUType*>(out.data<T>()),
+        dx_t,
+        x_dim,
+        y_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_equal");
+
+    r = xpu::cast<bool, XPUType>(dev_ctx.x_context(), dx_t, dx_mid, m * t * n);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+
+    XPUType* x_sign = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(x_sign);
+    r = xpu::sign(dev_ctx.x_context(),
+                  reinterpret_cast<const XPUType*>(x.data<T>()),
+                  x_sign,
+                  m * t * n);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign");
+
+    XPUType* dx_pre_dy = x_abs;
+    r = xpu::mul(dev_ctx.x_context(),
+                 reinterpret_cast<const XPUType*>(dx_mid),
+                 reinterpret_cast<const XPUType*>(x_sign),
+                 dx_pre_dy,
+                 m * t * n);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul");
+
+    r = xpu::broadcast_mul(dev_ctx.x_context(),
+                           dx_pre_dy,
+                           reinterpret_cast<const XPUType*>(out_grad.data<T>()),
+                           reinterpret_cast<XPUType*>(x_grad->data<T>()),
+                           x_dim,
+                           y_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
+
+  } else {
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    XPUType* x_abs = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(x_abs);
+    r = xpu::abs(dev_ctx.x_context(),
+                 reinterpret_cast<const XPUType*>(x.data<T>()),
+                 x_abs,
+                 m * t * n);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs");
+
+    DenseTensor porder_tensor;
+    phi::DDim pdim = phi::make_ddim({1});
+    porder_tensor.Resize(pdim);
+    dev_ctx.template Alloc<float>(&porder_tensor);
+    r = xpu::constant(
+        dev_ctx.x_context(), porder_tensor.data<float>(), 1, porder - 1.0f);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+    std::vector<int> p_dim(1, 1);
+
+    XPUType* x_pow = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(x_pow);
+    r = xpu::broadcast_pow(
+        dev_ctx.x_context(),
+        reinterpret_cast<const float*>(x_abs),
+        reinterpret_cast<const float*>(porder_tensor.data<float>()),
+        x_pow,
+        x_dim,
+        p_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
+
+    XPUType* y_pow = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(y_pow);
+    r = xpu::broadcast_pow(
+        dev_ctx.x_context(),
+        reinterpret_cast<const float*>(out.data<T>()),
+        reinterpret_cast<const float*>(porder_tensor.data<float>()),
+        y_pow,
+        y_dim,
+        p_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
+    dev_ctx.Wait();
+
+    XPUType* dx_t = x_abs;
+
+    r = xpu::broadcast_div(
+        dev_ctx.x_context(), x_pow, y_pow, dx_t, x_dim, y_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_div");
+
+    XPUType* x_sign = x_pow;
+    r = xpu::sign(dev_ctx.x_context(),
+                  reinterpret_cast<const XPUType*>(x.data<T>()),
+                  x_sign,
+                  m * t * n);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign");
+
+    XPUType* dx_mid = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(dx_mid);
+
+    r = xpu::broadcast_mul(dev_ctx.x_context(),
+                           reinterpret_cast<const XPUType*>(x_sign),
+                           reinterpret_cast<const XPUType*>(out_grad.data<T>()),
+                           dx_mid,
+                           x_dim,
+                           y_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
+
+    r = xpu::broadcast_mul(dev_ctx.x_context(),
+                           reinterpret_cast<const XPUType*>(dx_t),
+                           reinterpret_cast<const XPUType*>(dx_mid),
+                           reinterpret_cast<XPUType*>(x_grad->data<T>()),
+                           x_dim,
+                           x_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(p_norm_grad, XPU, ALL_LAYOUT, phi::PNormGradKernel, float) {}
diff --git a/paddle/phi/kernels/xpu/p_norm_kernel.cc b/paddle/phi/kernels/xpu/p_norm_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7ef72c61ad3aa11ee279e2bc7fcd1839068d5b09
--- /dev/null
+++ b/paddle/phi/kernels/xpu/p_norm_kernel.cc
@@ -0,0 +1,165 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/p_norm_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+inline void GetDims(
+    const phi::DDim& dim, int axis, int* m, int* t, int* n, bool asvector) {
+  *m = 1;
+  *n = 1;
+  *t = dim[axis];
+  if (asvector) {
+    *t = product(dim);
+  } else {
+    for (int i = 0; i < axis; ++i) {
+      (*m) *= dim[i];
+    }
+    for (int i = axis + 1; i < dim.size(); ++i) {
+      (*n) *= dim[i];
+    }
+  }
+}
+
+template <typename T, typename Context>
+void PNormKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 float porder,
+                 int axis,
+                 float epsilon,
+                 bool keepdim,
+                 bool asvector,
+                 DenseTensor* out) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  dev_ctx.template Alloc<T>(out);
+  auto xdim = x.dims();
+  if (axis < 0) axis = xdim.size() + axis;
+  std::vector<int> r_dim;
+  std::vector<int> x_dim;
+  std::vector<int> y_dim;
+  int m = 1;
+  int n = 1;
+  int t = 1;
+  GetDims(xdim, axis, &m, &t, &n, asvector);
+  x_dim.push_back(m);
+  x_dim.push_back(t);
+  x_dim.push_back(n);
+
+  r_dim.push_back(1);
+
+  y_dim.push_back(m);
+  y_dim.push_back(n);
+
+  int r = 0;
+
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  XPUType* tmp_x = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
+  PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_x);
+
+  r = xpu::abs(dev_ctx.x_context(),
+               reinterpret_cast<const XPUType*>(x.data<T>()),
+               tmp_x,
+               m * t * n);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs");
+  if (porder == INFINITY) {
+    r = xpu::reduce_max(dev_ctx.x_context(),
+                        tmp_x,
+                        reinterpret_cast<XPUType*>(out->data<T>()),
+                        x_dim,
+                        r_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_max");
+  } else if (porder == -INFINITY) {
+    r = xpu::reduce_min(dev_ctx.x_context(),
+                        tmp_x,
+                        reinterpret_cast<XPUType*>(out->data<T>()),
+                        x_dim,
+                        r_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_min");
+  } else if (porder == 0) {
+    XPUType* zeros = RAII_GUARD.alloc_l3_or_gm<XPUType>(1);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(zeros);
+    r = xpu::constant(dev_ctx.x_context(), zeros, 1, 0.0f);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+    std::vector<int> zeros_dim(1, 1);
+
+    bool* tmp2_x = RAII_GUARD.alloc_l3_or_gm<bool>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(tmp2_x);
+
+    r = xpu::broadcast_not_equal(
+        dev_ctx.x_context(), tmp_x, zeros, tmp2_x, x_dim, zeros_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_not_equal");
+
+    XPUType* x_mid = tmp_x;
+
+    r = xpu::cast<bool, XPUType>(dev_ctx.x_context(), tmp2_x, x_mid, m * t * n);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+
+    r = xpu::reduce_sum(dev_ctx.x_context(),
+                        x_mid,
+                        reinterpret_cast<XPUType*>(out->data<T>()),
+                        x_dim,
+                        r_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
+
+  } else {
+    DenseTensor porder_tensor;
+    phi::DDim pdim = phi::make_ddim({1});
+    porder_tensor.Resize(pdim);
+    dev_ctx.template Alloc<float>(&porder_tensor);
+    r = xpu::constant(
+        dev_ctx.x_context(), porder_tensor.data<float>(), 1, porder);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+    std::vector<int> p_dim(1, 1);
+
+    XPUType* tmp2_x = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * t * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(tmp2_x);
+    r = xpu::broadcast_pow(
+        dev_ctx.x_context(),
+        reinterpret_cast<const float*>(tmp_x),
+        reinterpret_cast<const float*>(porder_tensor.data<float>()),
+        reinterpret_cast<float*>(tmp2_x),
+        x_dim,
+        p_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
+
+    XPUType* tmp_y = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * n);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_y);
+
+    r = xpu::reduce_sum(dev_ctx.x_context(),
+                        reinterpret_cast<const float*>(tmp2_x),
+                        reinterpret_cast<float*>(tmp_y),
+                        x_dim,
+                        r_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
+
+    r = xpu::constant(
+        dev_ctx.x_context(), porder_tensor.data<float>(), 1, 1.0f / porder);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+
+    r = xpu::broadcast_pow(
+        dev_ctx.x_context(),
+        reinterpret_cast<const float*>(tmp_y),
+        reinterpret_cast<const float*>(porder_tensor.data<float>()),
+        reinterpret_cast<float*>(out->data<T>()),
+        y_dim,
+        p_dim);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow");
+    dev_ctx.Wait();
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(p_norm, XPU, ALL_LAYOUT, phi::PNormKernel, float) {}
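
For reference, and not part of the patch: a minimal host-side C++ sketch of the computation the XPU p_norm kernels above express through the device API. GetDimsRef and PNormRef are hypothetical names introduced here; they mirror the GetDims() factorization of the input shape into (m, t, n), where t is the length of the reduced axis, and the abs -> pow -> reduce_sum -> pow pipeline that PNormKernel uses for finite, nonzero porder.

#include <cmath>
#include <cstdio>
#include <vector>

// Same (m, t, n) factorization as GetDims() in the kernels above: t is the
// size of the reduced axis, m and n are the products of the outer and inner
// dimensions (or t becomes the full element count when asvector is true).
void GetDimsRef(const std::vector<int>& dims, int axis, bool asvector,
                int* m, int* t, int* n) {
  *m = 1;
  *t = dims[axis];
  *n = 1;
  if (asvector) {
    *t = 1;
    for (int d : dims) *t *= d;
    return;
  }
  for (int i = 0; i < axis; ++i) *m *= dims[i];
  for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) *n *= dims[i];
}

// out[i][k] = (sum_j |x[i][j][k]|^p)^(1/p), i.e. the abs -> broadcast_pow ->
// reduce_sum -> broadcast_pow sequence in PNormKernel for finite, nonzero p.
void PNormRef(const float* x, float* out, int m, int t, int n, float p) {
  for (int i = 0; i < m; ++i) {
    for (int k = 0; k < n; ++k) {
      float acc = 0.f;
      for (int j = 0; j < t; ++j) {
        acc += std::pow(std::fabs(x[(i * t + j) * n + k]), p);
      }
      out[i * n + k] = std::pow(acc, 1.f / p);
    }
  }
}

int main() {
  // Shape (1, 3, 2) reduced along axis 1; the two columns are {3, 4, 0}
  // and {5, 0, 12}, so their L2 norms are 5 and 13.
  std::vector<float> x = {3.f, 5.f, 4.f, 0.f, 0.f, 12.f};
  int m, t, n;
  GetDimsRef({1, 3, 2}, /*axis=*/1, /*asvector=*/false, &m, &t, &n);
  std::vector<float> out(m * n);
  PNormRef(x.data(), out.data(), m, t, n, /*p=*/2.f);
  std::printf("%g %g\n", out[0], out[1]);  // prints: 5 13
  return 0;
}

The reduction layout here is the same one the kernels encode with x_dim = {m, t, n} and r_dim = {1}: the middle axis is summed away and the result has shape (m, n).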