migrate softmax_with_cross_entropy and topk kernels to phi, test=kunlun (#45650)

0b9d4c56 · ykkk2333 · GitHub · 3b9b4c34 · 3b9b4c34 · 3b9b4c34
5 changed files
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef PADDLE_WITH_XPU
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/device_wrapper.h"
-#include "paddle/phi/kernels/funcs/axis_utils.h"
-#include "xpu/refactor/math.h"
-#include "xpu/refactor/nn.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-template <typename T>
-class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_xpu_place(context.GetPlace()),
-        true,
-        platform::errors::PreconditionNotMet("This kernel only runs on XPU."));
-    const Tensor* logits = context.Input<Tensor>("Logits");
-    const Tensor* labels = context.Input<Tensor>("Label");
-    Tensor* softmax = context.Output<Tensor>("Softmax");
-    Tensor* loss = context.Output<Tensor>("Loss");
-    const int rank = logits->dims().size();
-    const int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
-    softmax->mutable_data<T>(context.GetPlace());
-    loss->mutable_data<T>(context.GetPlace());
-    const int n = phi::funcs::SizeToAxis(axis, logits->dims());
-    const int d = phi::funcs::SizeFromAxis(axis, logits->dims());
-    std::vector<int> logits_dims = phi::vectorize<int>(logits->dims());
-    const bool soft_label = context.Attr<bool>("soft_label");
-    int t = logits_dims[axis];
-    auto logits_data = reinterpret_cast<const XPUType*>(logits->data<T>());
-    auto softmax_data = reinterpret_cast<XPUType*>(softmax->data<T>());
-    auto loss_data = reinterpret_cast<XPUType*>(loss->data<T>());
-    // softmax
-    auto& dev_ctx =
-        context.template device_context<platform::XPUDeviceContext>();
-    int r = XPU_SUCCESS;
-    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-    if (platform::get_xpu_version(context.GetPlace().GetDeviceId()) ==
-            phi::backends::xpu::XPUVersion::XPU2 &&
-        soft_label && axis == rank - 1) {
-      auto labels_data = reinterpret_cast<const XPUType*>(labels->data<T>());
-      r = xpu::soft_softmax_with_cross_entropy<XPUType>(dev_ctx.x_context(),
-                                                        logits_data,
-                                                        labels_data,
-                                                        softmax_data,
-                                                        loss_data,
-                                                        n,
-                                                        d);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy");
-      return;
-    }
-    int len = logits->numel();
-    T* clip_logits = RAII_GUARD.alloc_l3_or_gm<T>(len);
-    PADDLE_ENFORCE_XDNN_NOT_NULL(clip_logits);
-    XPUType* clip_logits_data = reinterpret_cast<XPUType*>(clip_logits);
-    float max_val = 1e20;
-    float min_val = -1e20;
-    if (std::is_same<T, platform::float16>::value) {
-      max_val = 65504;
-      min_val = -65504;
-    }
-    r = xpu::clip_v2<XPUType>(dev_ctx.x_context(),
-                              logits_data,
-                              clip_logits_data,
-                              len,
-                              static_cast<XPUType>(min_val),
-                              static_cast<XPUType>(max_val));
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
-    r = xpu::softmax<XPUType>(
-        dev_ctx.x_context(), clip_logits_data, softmax_data, logits_dims, axis);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");
-    // cross_entropy
-    if (axis != rank - 1) {
-      XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(n * d);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
-      r = xpu::transpose(dev_ctx.x_context(),
-                         softmax_data,
-                         trans_softmax,
-                         {n, t, d / t},
-                         {0, 2, 1});
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
-      softmax_data = trans_softmax;
-    }
-    if (soft_label) {
-      auto labels_data = reinterpret_cast<const XPUType*>(labels->data<T>());
-      if (axis != rank - 1) {
-        XPUType* trans_label = RAII_GUARD.alloc_l3_or_gm<XPUType>(n * d);
-        PADDLE_ENFORCE_XDNN_NOT_NULL(trans_label);
-        r = xpu::transpose(dev_ctx.x_context(),
-                           labels_data,
-                           trans_label,
-                           {n, t, d / t},
-                           {0, 2, 1});
-        PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
-        labels_data = trans_label;
-      }
-      r = xpu::soft_cross_entropy<XPUType>(dev_ctx.x_context(),
-                                           softmax_data,
-                                           labels_data,
-                                           loss_data,
-                                           axis == rank - 1 ? n : n * d / t,
-                                           axis == rank - 1 ? d : t);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_cross_entropy");
-    } else {
-      auto ignore_index = context.Attr<int>("ignore_index");
-      Tensor labels_int32;
-      labels_int32.mutable_data<int32_t>(context.GetPlace(),
-                                         labels->numel() * sizeof(int32_t));
-      r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
-                                         labels->data<int64_t>(),
-                                         labels_int32.data<int32_t>(),
-                                         labels->numel());
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
-      r = xpu::hard_cross_entropy<XPUType, int32_t>(
-          dev_ctx.x_context(),
-          softmax_data,
-          labels_int32.data<int32_t>(),
-          loss_data,
-          nullptr,
-          axis == rank - 1 ? n : n * d / t,
-          axis == rank - 1 ? d : t,
-          ignore_index);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_cross_entropy");
-    }
-  }
-};
-template <typename T>
-class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* out_grad =
-        context.Input<Tensor>(framework::GradVarName("Loss"));
-    const Tensor* labels = context.Input<Tensor>("Label");
-    Tensor* logit_grad =
-        context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->mutable_data<T>(context.GetPlace());
-    const Tensor* softmax = context.Input<Tensor>("Softmax");
-    const bool use_softmax = context.Attr<bool>("use_softmax");
-    const bool soft_label = context.Attr<bool>("soft_label");
-    auto ignore_index = context.Attr<int>("ignore_index");
-    const int rank = logit_grad->dims().size();
-    const int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
-    const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims());
-    const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims());
-    auto& dev_ctx =
-        context.template device_context<platform::XPUDeviceContext>();
-    int r = XPU_SUCCESS;
-    if (axis == rank - 1) {
-      if (soft_label) {
-        r = xpu::soft_softmax_with_cross_entropy_grad<XPUType>(
-            dev_ctx.x_context(),
-            reinterpret_cast<const XPUType*>(out_grad->data<T>()),
-            reinterpret_cast<const XPUType*>(labels->data<T>()),
-            reinterpret_cast<const XPUType*>(softmax->data<T>()),
-            reinterpret_cast<XPUType*>(logit_grad->data<T>()),
-            use_softmax,
-            n,
-            d);
-        PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
-      } else {
-        xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-        int* labels_int_ptr_l3 =
-            RAII_GUARD.alloc_l3_or_gm<int32_t>(labels->numel());
-        PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
-        r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
-                                           labels->data<int64_t>(),
-                                           labels_int_ptr_l3,
-                                           labels->numel());
-        PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
-        r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
-            dev_ctx.x_context(),
-            reinterpret_cast<const XPUType*>(out_grad->data<T>()),
-            labels_int_ptr_l3,
-            reinterpret_cast<const XPUType*>(softmax->data<T>()),
-            reinterpret_cast<XPUType*>(logit_grad->data<T>()),
-            ignore_index,
-            use_softmax,
-            n,
-            d);
-        PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
-      }
-    } else {
-      int t = logit_grad->dims()[axis];
-      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-      int len = softmax->numel();
-      XPUType* trans_logit = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_logit);
-      XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
-      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
-      r = xpu::transpose(dev_ctx.x_context(),
-                         reinterpret_cast<const XPUType*>(softmax->data<T>()),
-                         trans_softmax,
-                         {n, t, d / t},
-                         {0, 2, 1});
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
-      if (soft_label) {
-        XPUType* trans_labels = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
-        PADDLE_ENFORCE_XDNN_NOT_NULL(trans_labels);
-        r = xpu::transpose(dev_ctx.x_context(),
-                           reinterpret_cast<const XPUType*>(labels->data<T>()),
-                           trans_labels,
-                           {n, t, d / t},
-                           {0, 2, 1});
-        PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
-        r = xpu::soft_softmax_with_cross_entropy_grad<XPUType>(
-            dev_ctx.x_context(),
-            reinterpret_cast<const XPUType*>(out_grad->data<T>()),
-            trans_labels,
-            trans_softmax,
-            trans_logit,
-            use_softmax,
-            n * d / t,
-            t);
-        PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
-      } else {
-        int* labels_int_ptr_l3 =
-            RAII_GUARD.alloc_l3_or_gm<int32_t>(labels->numel());
-        PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
-        r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
-                                           labels->data<int64_t>(),
-                                           labels_int_ptr_l3,
-                                           labels->numel());
-        PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
-        r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
-            dev_ctx.x_context(),
-            reinterpret_cast<const XPUType*>(out_grad->data<T>()),
-            labels_int_ptr_l3,
-            trans_softmax,
-            trans_logit,
-            ignore_index,
-            use_softmax,
-            n * d / t,
-            t);
-        PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
-      }
-      r = xpu::transpose<XPUType>(
-          dev_ctx.x_context(),
-          trans_logit,
-          reinterpret_cast<XPUType*>(logit_grad->data<T>()),
-          {n, d / t, t},
-          {0, 2, 1});
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    softmax_with_cross_entropy,
-    ops::SoftmaxWithCrossEntropyXPUKernel<float>,
-    ops::SoftmaxWithCrossEntropyXPUKernel<paddle::platform::float16>);
-REGISTER_OP_XPU_KERNEL(
-    softmax_with_cross_entropy_grad,
-    ops::SoftmaxWithCrossEntropyGradXPUKernel<float>,
-    ops::SoftmaxWithCrossEntropyGradXPUKernel<paddle::platform::float16>);
-#endif
--- a/paddle/fluid/operators/top_k_v2_op_xpu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef PADDLE_WITH_XPU
-#include <memory>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/transpose_op.h"
-#include "xpu/refactor/math.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-template <typename T>
-class TopkV2XPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    const T* in_data = input->data<T>();
-    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    const auto& out_dims = output->dims();
-    const auto& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
-    const auto& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
-    PADDLE_ENFORCE_EQ(
-        sorted,
-        true,
-        platform::errors::External(
-            "XPU API does not support unsorted topk operation currently."
-            " Operator will be supported in future update."));
-    PADDLE_ENFORCE_EQ(
-        largest,
-        true,
-        platform::errors::External(
-            "XPU API does not support smallest topk operation currently."
-            " Operator will be supported in future update."));
-    int axis = static_cast<int>(ctx.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-    size_t k = static_cast<int>(ctx.Attr<int>("k"));
-    auto* k_t = ctx.Input<Tensor>("K");
-    if (k_t) {
-      k = k_t->data<int>()[0];
-      framework::DDim output_dims = output->dims();
-      output_dims[axis] = k;
-      output->Resize(output_dims);
-      indices->Resize(output_dims);
-    }
-    if (axis + 1 == in_dims.size()) {
-      auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
-      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-      int32_t* indices_int_data =
-          RAII_GUARD.alloc_l3_or_gm<int32_t>(indices->numel());
-      const size_t row =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const size_t col = in_dims[in_dims.size() - 1];
-      int r = xpu::sorted_topk<T>(dev_ctx.x_context(),
-                                  in_data,
-                                  output_data,
-                                  indices_int_data,
-                                  row,
-                                  col,
-                                  k);
-      PADDLE_ENFORCE_EQ(
-          r,
-          XPU_SUCCESS,
-          platform::errors::External(
-              "XPU API return wrong value[%d %s] in call kernel name "
-              "[%s], please check "
-              "where Baidu Kunlun Card is properly installed.",
-              r,
-              XPUAPIErrorMsg[r],
-              "sorted_topk"));
-      r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
-                                         (const int32_t*)indices_int_data,
-                                         indices_data,
-                                         indices->numel());
-      PADDLE_ENFORCE_EQ(
-          r,
-          XPU_SUCCESS,
-          platform::errors::External(
-              "XPU API return wrong value[%d %s] in call kernel name "
-              "[%s], please check "
-              "where Baidu Kunlun Card is properly installed.",
-              r,
-              XPUAPIErrorMsg[r],
-              "cast_v2"));
-    } else {
-      // do transpose if axis is not the last dim of input
-      std::vector<int> trans_axes;
-      for (int i = 0; i < axis; i++) {
-        trans_axes.emplace_back(i);
-      }
-      for (int i = axis + 1; i < in_dims.size(); i++) {
-        trans_axes.emplace_back(i);
-      }
-      trans_axes.emplace_back(axis);
-      // Get input and output dims for transpose
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(output->dims());
-      for (size_t i = 0; i < trans_axes.size(); i++) {
-        trans_dims[i] = in_dims[trans_axes[i]];
-        trans_out_dims[i] = out_dims[trans_axes[i]];
-      }
-      std::vector<int> x_shape_host(in_dims.size(), 0);
-      for (int i = 0; i < in_dims.size(); ++i) {
-        x_shape_host[i] = in_dims[i];
-      }
-      auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
-      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-      T* trans_in_data = RAII_GUARD.alloc_l3_or_gm<T>(input->numel());
-      // Transpose and save interval output to trans_in
-      int r = xpu::transpose<T>(dev_ctx.x_context(),
-                                in_data,
-                                trans_in_data,
-                                x_shape_host,
-                                trans_axes);
-      PADDLE_ENFORCE_EQ(
-          r,
-          xpu::Error_t::SUCCESS,
-          platform::errors::External("XPU API 1st Transpose kernel"
-                                     " returns wrong value[%d %s]!",
-                                     r,
-                                     XPUAPIErrorMsg[r]));
-      T* trans_out_data = RAII_GUARD.alloc_l3_or_gm<T>(output->numel());
-      int64_t* trans_idx_data =
-          RAII_GUARD.alloc_l3_or_gm<int64_t>(output->numel());
-      int32_t* trans_idx_int32_data =
-          RAII_GUARD.alloc_l3_or_gm<int32_t>(output->numel());
-      const size_t row =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const size_t col = trans_dims[trans_dims.size() - 1];
-      // Do top k on transposed input
-      r = xpu::sorted_topk<T>(dev_ctx.x_context(),
-                              trans_in_data,
-                              trans_out_data,
-                              trans_idx_int32_data,
-                              row,
-                              col,
-                              k);
-      PADDLE_ENFORCE_EQ(
-          r,
-          XPU_SUCCESS,
-          platform::errors::External(
-              "XPU API return wrong value[%d %s] in call kernel name "
-              "[%s], please check "
-              "where Baidu Kunlun Card is properly installed.",
-              r,
-              XPUAPIErrorMsg[r],
-              "sorted_topk"));
-      r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
-                                         (const int32_t*)trans_idx_int32_data,
-                                         trans_idx_data,
-                                         indices->numel());
-      PADDLE_ENFORCE_EQ(
-          r,
-          XPU_SUCCESS,
-          platform::errors::External(
-              "XPU API return wrong value[%d %s in call kernel name "
-              "[%s], please check "
-              "where Baidu Kunlun Card is properly installed.",
-              r,
-              XPUAPIErrorMsg[r],
-              "cast_v2"));
-      // Transpose back to original dims
-      std::vector<int> trans_back_axes;
-      for (int i = 0; i < axis; i++) {
-        trans_axes.emplace_back(i);
-      }
-      trans_axes.emplace_back(trans_out_dims.size() - 1);
-      for (int i = axis; i < trans_out_dims.size() - 1; i++) {
-        trans_axes.emplace_back(i);
-      }
-      std::vector<int> trans_out_shape_host(trans_back_axes.size(), 0);
-      for (size_t i = 0; i < trans_back_axes.size(); ++i) {
-        trans_out_shape_host[i] = trans_out_dims[i];
-      }
-      r = xpu::transpose<T>(dev_ctx.x_context(),
-                            trans_out_data,
-                            output_data,
-                            trans_out_shape_host,
-                            trans_back_axes);
-      PADDLE_ENFORCE_EQ(
-          r,
-          xpu::Error_t::SUCCESS,
-          platform::errors::External("XPU API 2nd Transpose kernel"
-                                     " returns wrong value[%d %s]",
-                                     r,
-                                     XPUAPIErrorMsg[r]));
-      r = xpu::transpose<int64_t>(dev_ctx.x_context(),
-                                  trans_idx_data,
-                                  indices_data,
-                                  trans_out_shape_host,
-                                  trans_back_axes);
-      PADDLE_ENFORCE_EQ(
-          r,
-          xpu::Error_t::SUCCESS,
-          platform::errors::External("XPU API 3rd Transpose kernel"
-                                     " returns wrong value[%d %s]",
-                                     r,
-                                     XPUAPIErrorMsg[r]));
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(top_k_v2, ops::TopkV2XPUKernel<float>);
-#endif
--- a/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/kernels/cross_entropy_grad_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/axis_utils.h"
+namespace phi {
+// XPU kernel for the backward pass of softmax-with-cross-entropy: fills
+// `logit_grad` from `loss_grad`, `softmax` and `labels` by calling the XDNN
+// soft_/hard_softmax_with_cross_entropy_grad primitives.
+//
+// Parameters:
+//   dev_ctx             device context; supplies the XDNN context and is used
+//                       to allocate `logit_grad`.
+//   labels              read as T when `soft_label` is true, otherwise read
+//                       as int64 and cast to int32 before use.
+//   softmax             softmax tensor, read as T.
+//   loss_grad           upstream gradient, read as T.
+//   soft_label          selects the soft-label or the hard-label primitive.
+//   use_softmax         forwarded unchanged to the XDNN primitive.
+//   numeric_stable_mode not used by this implementation.
+//   ignore_index        forwarded to the hard-label primitive only.
+//   axis_in             class axis; canonicalized with CanonicalAxis against
+//                       the rank of `logit_grad`.
+//   logit_grad          output tensor; allocated here as T.
+//
+// Every XDNN call is checked with PADDLE_ENFORCE_XDNN_SUCCESS and every
+// scratch allocation with PADDLE_ENFORCE_XDNN_NOT_NULL.
+template <typename T, typename Context>
+void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx,
+                                       const DenseTensor& labels,
+                                       const DenseTensor& softmax,
+                                       const DenseTensor& loss_grad,
+                                       bool soft_label,
+                                       bool use_softmax,
+                                       bool numeric_stable_mode,
+                                       int ignore_index,
+                                       int axis_in,
+                                       DenseTensor* logit_grad) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  dev_ctx.template Alloc<T>(logit_grad);
+  const int rank = logit_grad->dims().size();
+  const int axis = phi::funcs::CanonicalAxis(axis_in, rank);
+  // n: product of the dims before `axis`; d: product of the dims from `axis`.
+  const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims());
+  const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims());
+  // The T-typed buffers are needed on every path, so view them as XPUType
+  // once. `labels` is deliberately not hoisted: it is read as T only for soft
+  // labels and as int64 otherwise.
+  const XPUType* loss_grad_data =
+      reinterpret_cast<const XPUType*>(loss_grad.data<T>());
+  const XPUType* softmax_data =
+      reinterpret_cast<const XPUType*>(softmax.data<T>());
+  XPUType* logit_grad_data = reinterpret_cast<XPUType*>(logit_grad->data<T>());
+  int r = XPU_SUCCESS;
+  if (axis == rank - 1) {
+    // Class axis is innermost: the buffers are already n rows of d entries.
+    if (soft_label) {
+      r = xpu::soft_softmax_with_cross_entropy_grad<XPUType>(
+          dev_ctx.x_context(),
+          loss_grad_data,
+          reinterpret_cast<const XPUType*>(labels.data<T>()),
+          softmax_data,
+          logit_grad_data,
+          use_softmax,
+          n,
+          d);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
+    } else {
+      // The hard-label primitive is instantiated with int labels, so the
+      // int64 labels are first cast into a scratch int32 buffer.
+      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+      int* labels_int_ptr_l3 =
+          RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
+      PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
+      r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+                                         labels.data<int64_t>(),
+                                         labels_int_ptr_l3,
+                                         labels.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+      r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
+          dev_ctx.x_context(),
+          loss_grad_data,
+          labels_int_ptr_l3,
+          softmax_data,
+          logit_grad_data,
+          ignore_index,
+          use_softmax,
+          n,
+          d);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
+    }
+  } else {
+    // Class axis is not innermost: view the data as {n, t, d / t}, move the
+    // class axis last ({n, d / t, t}), compute the gradient on (n * d / t)
+    // rows of t entries, then transpose the result back into `logit_grad`.
+    const int t = logit_grad->dims()[axis];
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    const int len = softmax.numel();
+    XPUType* trans_logit = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_logit);
+    XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
+    r = xpu::transpose(dev_ctx.x_context(),
+                       softmax_data,
+                       trans_softmax,
+                       {n, t, d / t},
+                       {0, 2, 1});
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+    if (soft_label) {
+      // Soft labels are transposed the same way as the softmax tensor.
+      XPUType* trans_labels = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);
+      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_labels);
+      r = xpu::transpose(dev_ctx.x_context(),
+                         reinterpret_cast<const XPUType*>(labels.data<T>()),
+                         trans_labels,
+                         {n, t, d / t},
+                         {0, 2, 1});
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+      r = xpu::soft_softmax_with_cross_entropy_grad<XPUType>(
+          dev_ctx.x_context(),
+          loss_grad_data,
+          trans_labels,
+          trans_softmax,
+          trans_logit,
+          use_softmax,
+          n * d / t,
+          t);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy_grad");
+    } else {
+      // Hard labels are only cast to int32; they are passed untransposed.
+      int* labels_int_ptr_l3 =
+          RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
+      PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
+      r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+                                         labels.data<int64_t>(),
+                                         labels_int_ptr_l3,
+                                         labels.numel());
+      // Report the primitive that actually ran (this used to say "clip_v2").
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+      r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
+          dev_ctx.x_context(),
+          loss_grad_data,
+          labels_int_ptr_l3,
+          trans_softmax,
+          trans_logit,
+          ignore_index,
+          use_softmax,
+          n * d / t,
+          t);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_softmax_with_cross_entropy_grad");
+    }
+    // Undo the earlier permutation: {n, d / t, t} -> {n, t, d / t}.
+    r = xpu::transpose<XPUType>(dev_ctx.x_context(),
+                                trans_logit,
+                                logit_grad_data,
+                                {n, d / t, t},
+                                {0, 2, 1});
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,  // name the kernel is registered under
+                   XPU,  // backend
+                   ALL_LAYOUT,  // layout
+                   phi::CrossEntropyWithSoftmaxGradKernel,  // kernel function template defined in this file
+                   float,  // registered for T = float ...
+                   phi::dtype::float16) {}  // ... and T = float16; the trailing body is left empty
--- a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc
+++ b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/kernels/cross_entropy_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/axis_utils.h"
+namespace phi {
+template <typename T, typename Context>
+void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx,
+                                   const DenseTensor& logits,
+                                   const DenseTensor& labels,
+                                   bool soft_label,
+                                   bool use_softmax,
+                                   bool numeric_stable_mode,
+                                   int ignore_index,
+                                   int axis_in,
+                                   DenseTensor* softmax,
+                                   DenseTensor* loss) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  PADDLE_ENFORCE_EQ(
+      logits.place().GetType() == phi::AllocationType::XPU,
+      true,
+      errors::PreconditionNotMet("This kernel only runs on XPU."));
+  const int rank = logits.dims().size();
+  const int axis = phi::funcs::CanonicalAxis(axis_in, rank);
+  dev_ctx.template Alloc<T>(softmax);
+  dev_ctx.template Alloc<T>(loss);
+  const int n = phi::funcs::SizeToAxis(axis, logits.dims());
+  const int d = phi::funcs::SizeFromAxis(axis, logits.dims());
+  std::vector<int> logits_dims = phi::vectorize<int>(logits.dims());
+  int t = logits_dims[axis];
+  auto logits_data = reinterpret_cast<const XPUType*>(logits.data<T>());
+  auto softmax_data = reinterpret_cast<XPUType*>(softmax->data<T>());
+  auto loss_data = reinterpret_cast<XPUType*>(loss->data<T>());
+  // softmax
+  int r = XPU_SUCCESS;
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  if (phi::backends::xpu::get_xpu_version(dev_ctx.GetPlace().GetDeviceId()) ==
+          phi::backends::xpu::XPUVersion::XPU2 &&
+      soft_label && axis == rank - 1) {
+    auto labels_data = reinterpret_cast<const XPUType*>(labels.data<T>());
+    r = xpu::soft_softmax_with_cross_entropy<XPUType>(dev_ctx.x_context(),
+                                                      logits_data,
+                                                      labels_data,
+                                                      softmax_data,
+                                                      loss_data,
+                                                      n,
+                                                      d);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_softmax_with_cross_entropy");
+    return;
+  }
+  int len = logits.numel();
+  T* clip_logits = RAII_GUARD.alloc_l3_or_gm<T>(len);
+  PADDLE_ENFORCE_XDNN_NOT_NULL(clip_logits);
+  XPUType* clip_logits_data = reinterpret_cast<XPUType*>(clip_logits);
+  float max_val = 1e20;
+  float min_val = -1e20;
+  if (std::is_same<T, dtype::float16>::value) {
+    max_val = 65504;
+    min_val = -65504;
+  }
+  r = xpu::clip_v2<XPUType>(dev_ctx.x_context(),
+                            logits_data,
+                            clip_logits_data,
+                            len,
+                            static_cast<XPUType>(min_val),
+                            static_cast<XPUType>(max_val));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
+  r = xpu::softmax<XPUType>(
+      dev_ctx.x_context(), clip_logits_data, softmax_data, logits_dims, axis);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");
+  // cross_entropy
+  if (axis != rank - 1) {
+    XPUType* trans_softmax = RAII_GUARD.alloc_l3_or_gm<XPUType>(n * d);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_softmax);
+    r = xpu::transpose(dev_ctx.x_context(),
+                       softmax_data,
+                       trans_softmax,
+                       {n, t, d / t},
+                       {0, 2, 1});
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+    softmax_data = trans_softmax;
+  }
+  if (soft_label) {
+    auto labels_data = reinterpret_cast<const XPUType*>(labels.data<T>());
+    if (axis != rank - 1) {
+      XPUType* trans_label = RAII_GUARD.alloc_l3_or_gm<XPUType>(n * d);
+      PADDLE_ENFORCE_XDNN_NOT_NULL(trans_label);
+      r = xpu::transpose(dev_ctx.x_context(),
+                         labels_data,
+                         trans_label,
+                         {n, t, d / t},
+                         {0, 2, 1});
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+      labels_data = trans_label;
+    }
+    r = xpu::soft_cross_entropy<XPUType>(dev_ctx.x_context(),
+                                         softmax_data,
+                                         labels_data,
+                                         loss_data,
+                                         axis == rank - 1 ? n : n * d / t,
+                                         axis == rank - 1 ? d : t);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "soft_cross_entropy");
+  } else {
+    DenseTensor labels_int32;
+    int* labels_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
+    r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+                                       labels.data<int64_t>(),
+                                       labels_int_ptr_l3,
+                                       labels.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
+    r = xpu::hard_cross_entropy<XPUType, int32_t>(
+        dev_ctx.x_context(),
+        softmax_data,
+        labels_int_ptr_l3,
+        loss_data,
+        nullptr,
+        axis == rank - 1 ? n : n * d / t,
+        axis == rank - 1 ? d : t,
+        ignore_index);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_cross_entropy");
+  }
+}
+}  // namespace phi
+// Registers phi::CrossEntropyWithSoftmaxKernel under the kernel name
+// `cross_entropy_with_softmax` for the XPU backend and ALL_LAYOUT, with
+// float and phi::dtype::float16 as the element types. The trailing braces
+// are left empty here, so no extra per-kernel setup is done in this file.
+PD_REGISTER_KERNEL(cross_entropy_with_softmax,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::CrossEntropyWithSoftmaxKernel,
+                   float,
+                   phi::dtype::float16) {}
--- a/paddle/phi/kernels/xpu/top_k_kernel.cc
+++ b/paddle/phi/kernels/xpu/top_k_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/top_k_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+namespace phi {
+// XPU implementation of top-k: runs xpu::sorted_topk over `x` along `axis`
+// and writes the selected values to `out` and their int64 indices to
+// `indices`.
+//
+// Parameters:
+//   dev_ctx   device context; supplies the XPU handle and allocates outputs.
+//   x         input tensor with elements of type T.
+//   k_scalar  number of entries to keep; read with to<int>().
+//   axis      dimension to select along; a negative value is offset by the
+//             rank of `x`.
+//   largest   must be true, otherwise the kernel raises an External error.
+//   sorted    must be true, otherwise the kernel raises an External error.
+//   out       output values, allocated here as T.
+//   indices   output indices, allocated here as int64_t.
+//
+// xpu::sorted_topk is called on a (row, col) view and produces int32
+// indices, so the indices are widened to int64 with xpu::cast_v2. When
+// `axis` is not the last dimension the input is first transposed so that
+// `axis` becomes the innermost dimension, and both results are transposed
+// back afterwards.
+template <typename T, typename Context>
+void TopkKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const Scalar& k_scalar,
+                int axis,
+                bool largest,
+                bool sorted,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  const auto& in_dims = x.dims();
+  const T* in_data = x.data<T>();
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  const auto& out_dims = out->dims();
+  PADDLE_ENFORCE_EQ(
+      sorted,
+      true,
+      errors::External(
+          "XPU API does not support unsorted topk operation currently."
+          " Operator will be supported in future update."));
+  PADDLE_ENFORCE_EQ(
+      largest,
+      true,
+      errors::External(
+          "XPU API does not support smallest topk operation currently."
+          " Operator will be supported in future update."));
+  if (axis < 0) axis += in_dims.size();
+  size_t k = k_scalar.to<int>();
+  if (axis + 1 == in_dims.size()) {
+    // Fast path: `axis` is already the innermost dimension.
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    int32_t* indices_int_data =
+        RAII_GUARD.alloc_l3_or_gm<int32_t>(indices->numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int_data);
+    const size_t row =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const size_t col = in_dims[in_dims.size() - 1];
+    int r = xpu::sorted_topk<T>(dev_ctx.x_context(),
+                                in_data,
+                                output_data,
+                                indices_int_data,
+                                row,
+                                col,
+                                k);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
+    // Widen the int32 indices to the int64 output buffer.
+    r = xpu::cast_v2<int32_t, int64_t>(
+        dev_ctx.x_context(),
+        static_cast<const int32_t*>(indices_int_data),
+        indices_data,
+        indices->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+  } else {
+    // do transpose if axis is not the last dim of input
+    // Forward permutation: every dim except `axis` in order, then `axis`.
+    std::vector<int> trans_axes;
+    for (int i = 0; i < axis; i++) {
+      trans_axes.emplace_back(i);
+    }
+    for (int i = axis + 1; i < in_dims.size(); i++) {
+      trans_axes.emplace_back(i);
+    }
+    trans_axes.emplace_back(axis);
+    // Get input and output dims for transpose
+    DDim trans_dims(in_dims);
+    DDim trans_out_dims(out->dims());
+    for (size_t i = 0; i < trans_axes.size(); i++) {
+      trans_dims[i] = in_dims[trans_axes[i]];
+      trans_out_dims[i] = out_dims[trans_axes[i]];
+    }
+    std::vector<int> x_shape_host(in_dims.size(), 0);
+    for (int i = 0; i < in_dims.size(); ++i) {
+      x_shape_host[i] = in_dims[i];
+    }
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    T* trans_in_data = RAII_GUARD.alloc_l3_or_gm<T>(x.numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_in_data);
+    // Transpose and save interval output to trans_in
+    int r = xpu::transpose<T>(
+        dev_ctx.x_context(), in_data, trans_in_data, x_shape_host, trans_axes);
+    PADDLE_ENFORCE_EQ(r,
+                      xpu::Error_t::SUCCESS,
+                      errors::External("XPU API 1st Transpose kernel"
+                                       " returns wrong value[%d %s]!",
+                                       r,
+                                       XPUAPIErrorMsg[r]));
+    T* trans_out_data = RAII_GUARD.alloc_l3_or_gm<T>(out->numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_out_data);
+    int64_t* trans_idx_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(out->numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_idx_data);
+    int32_t* trans_idx_int32_data =
+        RAII_GUARD.alloc_l3_or_gm<int32_t>(out->numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(trans_idx_int32_data);
+    const size_t row =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const size_t col = trans_dims[trans_dims.size() - 1];
+    // Do top k on transposed input
+    r = xpu::sorted_topk<T>(dev_ctx.x_context(),
+                            trans_in_data,
+                            trans_out_data,
+                            trans_idx_int32_data,
+                            row,
+                            col,
+                            k);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
+    r = xpu::cast_v2<int32_t, int64_t>(
+        dev_ctx.x_context(),
+        static_cast<const int32_t*>(trans_idx_int32_data),
+        trans_idx_data,
+        indices->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    // Transpose back to original dims
+    // Inverse permutation: leading dims stay, the innermost (top-k) dim
+    // returns to position `axis`, and the remaining dims shift right by one.
+    // These entries must go into trans_back_axes (not trans_axes), otherwise
+    // the shape and permutation passed to the final transposes are empty.
+    std::vector<int> trans_back_axes;
+    for (int i = 0; i < axis; i++) {
+      trans_back_axes.emplace_back(i);
+    }
+    trans_back_axes.emplace_back(trans_out_dims.size() - 1);
+    for (int i = axis; i < trans_out_dims.size() - 1; i++) {
+      trans_back_axes.emplace_back(i);
+    }
+    std::vector<int> trans_out_shape_host(trans_back_axes.size(), 0);
+    for (size_t i = 0; i < trans_back_axes.size(); ++i) {
+      trans_out_shape_host[i] = trans_out_dims[i];
+    }
+    r = xpu::transpose<T>(dev_ctx.x_context(),
+                          trans_out_data,
+                          output_data,
+                          trans_out_shape_host,
+                          trans_back_axes);
+    PADDLE_ENFORCE_EQ(r,
+                      xpu::Error_t::SUCCESS,
+                      errors::External("XPU API 2nd Transpose kernel"
+                                       " returns wrong value[%d %s]",
+                                       r,
+                                       XPUAPIErrorMsg[r]));
+    r = xpu::transpose<int64_t>(dev_ctx.x_context(),
+                                trans_idx_data,
+                                indices_data,
+                                trans_out_shape_host,
+                                trans_back_axes);
+    PADDLE_ENFORCE_EQ(r,
+                      xpu::Error_t::SUCCESS,
+                      errors::External("XPU API 3rd Transpose kernel"
+                                       " returns wrong value[%d %s]",
+                                       r,
+                                       XPUAPIErrorMsg[r]));
+  }
+}
+}  // namespace phi
+// Registers phi::TopkKernel under the kernel name `top_k` for the XPU
+// backend and ALL_LAYOUT; float is the only element type registered here.
+PD_REGISTER_KERNEL(top_k, XPU, ALL_LAYOUT, phi::TopkKernel, float) {}