diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index b13a74ae72d3a0e0d52ecc08deaeb2b1599d255e..d965e1ace5fc3182f79e5e92906f0ee448bce24d 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -41,7 +41,7 @@ endif()
 if(WITH_XPU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_xpu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_xpu.cc)
+  detection_library(prior_box_op SRCS prior_box_op.cc)
   detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
 elseif(WITH_MLU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
diff --git a/paddle/fluid/operators/detection/prior_box_op_xpu.cc b/paddle/fluid/operators/detection/prior_box_op_xpu.cc
deleted file mode 100644
index c52f64fb2c89788ab7047f5169f2ebaf33b8abfe..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/prior_box_op_xpu.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/fluid/operators/detection/prior_box_op.h"
-#include "paddle/fluid/platform/device/device_wrapper.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename K>
-class PriorBoxOpXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<framework::Tensor>("Input");
-    auto* image = ctx.Input<framework::Tensor>("Image");
-    auto* boxes = ctx.Output<framework::Tensor>("Boxes");
-    auto* vars = ctx.Output<framework::Tensor>("Variances");
-
-    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
-    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
-    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
-    auto variances = ctx.Attr<std::vector<float>>("variances");
-    auto flip = ctx.Attr<bool>("flip");
-    auto clip = ctx.Attr<bool>("clip");
-    auto min_max_aspect_ratios_order =
-        ctx.Attr<bool>("min_max_aspect_ratios_order");
-
-    std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
-
-    K step_w = static_cast<K>(ctx.Attr<float>("step_w"));
-    K step_h = static_cast<K>(ctx.Attr<float>("step_h"));
-    K offset = static_cast<K>(ctx.Attr<float>("offset"));
-
-    auto img_width = image->dims()[3];
-    auto img_height = image->dims()[2];
-
-    auto feature_width = input->dims()[3];
-    auto feature_height = input->dims()[2];
-
-    K step_width, step_height;
-    if (step_w == 0 || step_h == 0) {
-      step_width = static_cast<K>(img_width) / feature_width;
-      step_height = static_cast<K>(img_height) / feature_height;
-    } else {
-      step_width = step_w;
-      step_height = step_h;
-    }
-
-    int num_priors = aspect_ratios.size() * min_sizes.size();
-    if (max_sizes.size() > 0) {
-      num_priors += max_sizes.size();
-    }
-
-    boxes->mutable_data<K>(ctx.GetPlace());
-    vars->mutable_data<K>(ctx.GetPlace());
-
-    const auto& dev_ctx =
-        ctx.template device_context<platform::XPUDeviceContext>();
-    auto boxes_data = boxes->data<K>();
-    auto vars_data = vars->data<K>();
-    xpu::VectorParam<float> aspect_ratios_param{
-        aspect_ratios.data(), static_cast<int>(aspect_ratios.size()), nullptr};
-    xpu::VectorParam<float> min_sizes_param{
-        min_sizes.data(), static_cast<int>(min_sizes.size()),
nullptr};
-    xpu::VectorParam<float> max_sizes_param{
-        max_sizes.data(), static_cast<int>(max_sizes.size()), nullptr};
-
-    int ret = xpu::gen_prior_box(dev_ctx.x_context(),
-                                 boxes_data,
-                                 aspect_ratios_param,
-                                 min_sizes_param,
-                                 max_sizes_param,
-                                 feature_height,
-                                 feature_width,
-                                 img_height,
-                                 img_width,
-                                 offset,
-                                 step_height,
-                                 step_width,
-                                 clip,
-                                 min_max_aspect_ratios_order);
-    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gen_prior_box");
-
-    int box_num = feature_height * feature_width * num_priors;
-    int vlen = variances.size();
-    std::vector<K> var_cpu(vlen * box_num);
-    for (int i = 0; i < box_num; ++i) {
-      std::copy(variances.begin(), variances.end(), var_cpu.begin() + i * vlen);
-    }
-    ret = xpu_memcpy(vars_data,
-                     var_cpu.data(),
-                     var_cpu.size() * sizeof(K),
-                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-    PADDLE_ENFORCE_XPU_SUCCESS(ret);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(prior_box, ops::PriorBoxOpXPUKernel<float, float>);
-
-#endif
diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc
deleted file mode 100644
index 9c415a5f4291b40450277f1a05c1586dbdac5abf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/softmax_op_xpu.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/axis_utils.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-
-template <typename DeviceContext, typename T>
-class SoftmaxXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
-    const int rank = x->dims().size();
-    int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
-
-    // allocate memory on device.
-    out->mutable_data<T>(context.GetPlace());
-
-    std::vector<int> x_dims;
-    for (int i = 0; i < rank; i++) {
-      x_dims.push_back(x->dims()[i]);
-    }
-    if (axis < 0) {
-      axis += rank;
-    }
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    int r = XPU_SUCCESS;
-    auto version = platform::get_xpu_version(context.GetPlace().GetDeviceId());
-    if (version == phi::backends::xpu::XPUVersion::XPU1) {
-      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-      XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm<XPUType>(x->numel());
-      r = xpu::clip_v2(dev_ctx.x_context(),
-                       reinterpret_cast<const XPUType*>(x->data<T>()),
-                       clip_x_data_l3,
-                       x->numel(),
-                       static_cast<XPUType>(-1e20),
-                       static_cast<XPUType>(1e20));
-      PADDLE_ENFORCE_EQ(r,
-                        XPU_SUCCESS,
-                        platform::errors::External(
-                            "XPU API(clip_v2) return wrong value[%d %s]",
-                            r,
-                            XPUAPIErrorMsg[r]));
-      r = xpu::softmax<XPUType>(dev_ctx.x_context(),
-                                clip_x_data_l3,
-                                reinterpret_cast<XPUType*>(out->data<T>()),
-                                x_dims,
-                                axis);
-      PADDLE_ENFORCE_EQ(
-          r,
-          XPU_SUCCESS,
-          platform::errors::External("XPU API(softmax2d_forward) return wrong "
-                                     "value[%d %s]",
-                                     r,
-                                     XPUAPIErrorMsg[r]));
-    } else {
-      r = xpu::softmax<XPUType>(dev_ctx.x_context(),
-                                reinterpret_cast<const XPUType*>(x->data<T>()),
-                                reinterpret_cast<XPUType*>(out->data<T>()),
-                                x_dims,
-                                axis);
-      PADDLE_ENFORCE_EQ(
-          r,
-          XPU_SUCCESS,
-          platform::errors::External("XPU API(softmax2d_forward) return wrong "
-                                     "value[%d %s]",
-                                     r,
XPUAPIErrorMsg[r]));
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SoftmaxGradXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Input<Tensor>("Out");
-    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    const int rank = dx->dims().size();
-    int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
-
-    // allocate memory on device.
-    dx->mutable_data<T>(context.GetPlace());
-
-    std::vector<int> x_dims;
-    for (int i = 0; i < rank; i++) {
-      x_dims.push_back(dx->dims()[i]);
-    }
-    if (axis < 0) {
-      axis += rank;
-    }
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    int r = xpu::softmax_grad<XPUType>(
-        dev_ctx.x_context(),
-        reinterpret_cast<const XPUType*>(out->data<T>()),
-        reinterpret_cast<const XPUType*>(dout->data<T>()),
-        reinterpret_cast<XPUType*>(dx->data<T>()),
-        x_dims,
-        axis);
-    PADDLE_ENFORCE_EQ(
-        r,
-        XPU_SUCCESS,
-        platform::errors::External("XPU API(softmax2d_backward) return wrong "
-                                   "value[%d %s]",
-                                   r,
-                                   XPUAPIErrorMsg[r]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_XPU_KERNEL(
-    softmax,
-    ops::SoftmaxXPUKernel<paddle::platform::XPUDeviceContext, float>,
-    ops::SoftmaxXPUKernel<paddle::platform::XPUDeviceContext,
-                          paddle::platform::float16>);
-REGISTER_OP_XPU_KERNEL(
-    softmax_grad,
-    ops::SoftmaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
-    ops::SoftmaxGradXPUKernel<paddle::platform::XPUDeviceContext,
-                              paddle::platform::float16>);
-
-#endif  // PADDLE_WITH_XPU
diff --git a/paddle/phi/kernels/xpu/prior_box_kernel.cc b/paddle/phi/kernels/xpu/prior_box_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c6413a2e49bf326c8874dbabb04d7d325b11949
--- /dev/null
+++ b/paddle/phi/kernels/xpu/prior_box_kernel.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/prior_box_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PriorBoxKernel(const Context& ctx,
+                    const DenseTensor& input,
+                    const DenseTensor& image,
+                    const std::vector<float>& min_sizes,
+                    const std::vector<float>& aspect_ratios,
+                    const std::vector<float>& variances,
+                    const std::vector<float>& max_sizes,
+                    bool flip,
+                    bool clip,
+                    float step_w,
+                    float step_h,
+                    float offset,
+                    bool min_max_aspect_ratios_order,
+                    DenseTensor* out,
+                    DenseTensor* var) {
+  std::vector<float> new_aspect_ratios;
+  ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
+
+  T new_step_w = static_cast<T>(step_w);
+  T new_step_h = static_cast<T>(step_h);
+  T new_offset = static_cast<T>(offset);
+
+  auto img_width = image.dims()[3];
+  auto img_height = image.dims()[2];
+
+  auto feature_width = input.dims()[3];
+  auto feature_height = input.dims()[2];
+
+  T step_width, step_height;
+  if (new_step_w == 0 || new_step_h == 0) {
+    step_width = static_cast<T>(img_width) / feature_width;
+    step_height = static_cast<T>(img_height) / feature_height;
+  } else {
+    step_width = new_step_w;
+    step_height = new_step_h;
+  }
+
+  int num_priors = new_aspect_ratios.size() * min_sizes.size();
+  if (max_sizes.size() > 0) {
+    num_priors += max_sizes.size();
+  }
+
+  ctx.template Alloc<T>(out);
+  ctx.template Alloc<T>(var);
+
+  auto boxes_data = out->data<T>();
+  auto var_data = var->data<T>();
+  xpu::VectorParam<float> aspect_ratios_param{
+      new_aspect_ratios.data(),
static_cast<int>(new_aspect_ratios.size()),
+      nullptr};
+  xpu::VectorParam<float> min_sizes_param{
+      min_sizes.data(), static_cast<int>(min_sizes.size()), nullptr};
+  xpu::VectorParam<float> max_sizes_param{
+      max_sizes.data(), static_cast<int>(max_sizes.size()), nullptr};
+
+  int ret = xpu::gen_prior_box(ctx.x_context(),
+                               boxes_data,
+                               aspect_ratios_param,
+                               min_sizes_param,
+                               max_sizes_param,
+                               feature_height,
+                               feature_width,
+                               img_height,
+                               img_width,
+                               new_offset,
+                               step_height,
+                               step_width,
+                               clip,
+                               min_max_aspect_ratios_order);
+  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gen_prior_box");
+
+  int box_num = feature_height * feature_width * num_priors;
+  int vlen = variances.size();
+  std::vector<T> var_cpu(vlen * box_num);
+  for (int i = 0; i < box_num; ++i) {
+    std::copy(variances.begin(), variances.end(), var_cpu.begin() + i * vlen);
+  }
+  ctx.Wait();
+  ret = xpu_memcpy(var_data,
+                   var_cpu.data(),
+                   var_cpu.size() * sizeof(T),
+                   XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  PADDLE_ENFORCE_XPU_SUCCESS(ret);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(prior_box, XPU, ALL_LAYOUT, phi::PriorBoxKernel, float) {}
diff --git a/paddle/phi/kernels/xpu/softmax_grad_kernel.cc b/paddle/phi/kernels/xpu/softmax_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d5d855e52aa94d44c7b2de283d3ffbc10999d68
--- /dev/null
+++ b/paddle/phi/kernels/xpu/softmax_grad_kernel.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/phi/kernels/softmax_grad_kernel.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void SoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + using XPUType = typename XPUTypeTrait::Type; + const int rank = x_grad->dims().size(); + const int calc_axis = phi::funcs::CanonicalAxis(axis, rank); + + // allocate memory on device. + dev_ctx.template Alloc(x_grad); + if (x_grad->numel() == 0) { + return; + } + + std::vector x_dims; + for (int i = 0; i < rank; i++) { + x_dims.push_back(x_grad->dims()[i]); + } + + int r = xpu::softmax_grad( + dev_ctx.x_context(), + reinterpret_cast(out.data()), + reinterpret_cast(out_grad.data()), + reinterpret_cast(x_grad->data()), + x_dims, + calc_axis); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad"); +} + +} // namespace phi + +PD_REGISTER_KERNEL(softmax_grad, + XPU, + ALL_LAYOUT, + phi::SoftmaxGradKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/softmax_kernel.cc b/paddle/phi/kernels/xpu/softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..60b1c52ca5047f7d8e6cfd0266d14fe6902a7374 --- /dev/null +++ b/paddle/phi/kernels/xpu/softmax_kernel.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/softmax_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/axis_utils.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SoftmaxKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   int axis,
+                   DenseTensor* out) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  const int rank = x.dims().size();
+  const int calc_axis = phi::funcs::CanonicalAxis(axis, rank);
+
+  // allocate memory on device.
+  dev_ctx.template Alloc<T>(out);
+  if (out->numel() == 0) {
+    return;
+  }
+
+  std::vector<int> x_dims;
+  for (int i = 0; i < rank; i++) {
+    x_dims.push_back(x.dims()[i]);
+  }
+
+  int r = XPU_SUCCESS;
+  auto version =
+      phi::backends::xpu::get_xpu_version(dev_ctx.GetPlace().GetDeviceId());
+  if (version == phi::backends::xpu::XPUVersion::XPU1) {
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm<XPUType>(x.numel());
+    r = xpu::clip_v2(dev_ctx.x_context(),
+                     reinterpret_cast<const XPUType*>(x.data<T>()),
+                     clip_x_data_l3,
+                     x.numel(),
+                     static_cast<XPUType>(-1e20),
+                     static_cast<XPUType>(1e20));
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
+    r = xpu::softmax<XPUType>(dev_ctx.x_context(),
+                              clip_x_data_l3,
+                              reinterpret_cast<XPUType*>(out->data<T>()),
+                              x_dims,
+                              calc_axis);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");
+  } else {
+    r = xpu::softmax<XPUType>(dev_ctx.x_context(),
+                              reinterpret_cast<const XPUType*>(x.data<T>()),
+                              reinterpret_cast<XPUType*>(out->data<T>()),
+                              x_dims,
+                              calc_axis);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    softmax, XPU, ALL_LAYOUT, phi::SoftmaxKernel, float, phi::dtype::float16) {}