[Phi] Migrate mode_op and mode_grad_op into Phi (#40571)

* [Phi] Migrate mode_op and mode_grad_op into Phi * fix omp * add ifdef * migrate infershape * modify according reviewer

[Phi] Migrate mode_op and mode_grad_op into Phi (#40571)
* [Phi] Migrate mode_op and mode_grad_op into Phi * fix omp * add ifdef * migrate infershape * modify according reviewer
00183a93 · Aurelius84 · GitHub · 7004f65c · 00183a93 · 7004f65c
13 changed file
--- a/paddle/fluid/operators/mode_op.cc
+++ b/paddle/fluid/operators/mode_op.cc
@@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/mode_op.h"
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 namespace paddle {
 namespace operators {
@@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode");
-    auto input_dims = ctx->GetInputDim("X");
-    const int& dim_size = input_dims.size();
-    int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
-    PADDLE_ENFORCE_EQ(
-        (axis < dim_size) && (axis >= (-1 * dim_size)), true,
-        paddle::platform::errors::InvalidArgument(
-            "the axis of ModeOp must be [-%d, %d), but you set axis is %d",
-            dim_size, dim_size, axis));
-    PADDLE_ENFORCE_GE(input_dims.size(), 1,
-                      paddle::platform::errors::InvalidArgument(
-                          "input of ModeOp must have >= 1d shape"));
-    if (axis < 0) axis += dim_size;
-    bool keepdim = ctx->Attrs().Get<bool>("keepdim");
-    std::vector<int64_t> dimvec;
-    for (int64_t i = 0; i < axis; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    if (keepdim) {
-      dimvec.emplace_back(static_cast<int64_t>(1));
-    }
-    for (int64_t i = axis + 1; i < dim_size; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    framework::DDim dims = phi::make_ddim(dimvec);
-    PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument(
-                                                "input shape should >= 1d"));
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor,
+                            PD_INFER_META(phi::ModeInferMeta));
 REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker,
                  ops::ModeGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ModeGradOpMaker<paddle::imperative::OpBase>);
+                  ops::ModeGradOpMaker<paddle::imperative::OpBase>,
-REGISTER_OP_CPU_KERNEL(mode,
+                  ModeInferShapeFunctor);
-                       ops::ModeCPUKernel<paddle::platform::CPUPlace, float>,
-                       ops::ModeCPUKernel<paddle::platform::CPUPlace, double>,
-                       ops::ModeCPUKernel<paddle::platform::CPUPlace, int32_t>,
-                       ops::ModeCPUKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    mode_grad, ops::ModeGradCPUKernel<paddle::platform::CPUPlace, float>,
-    ops::ModeGradCPUKernel<paddle::platform::CPUPlace, double>,
-    ops::ModeGradCPUKernel<paddle::platform::CPUPlace, int32_t>,
-    ops::ModeGradCPUKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/fluid/operators/mode_op.cu
+++ b/paddle/fluid/operators/mode_op.cu
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <thrust/device_vector.h>
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/inner_product.h>
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/mode_op.h"
-#include "paddle/fluid/operators/top_k_function_cuda.h"
-namespace paddle {
-namespace operators {
-int ComputeBlockSize(int col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256 && col <= 512)
-    return 512;
-  else if (col > 128 && col <= 256)
-    return 256;
-  else if (col > 64 && col <= 128)
-    return 128;
-  else
-    return 64;
-}
-template <typename T>
-void getModebySort(const platform::CUDADeviceContext& ctx,
-                   const framework::Tensor* input_tensor,
-                   const int64_t num_cols, const int64_t num_rows,
-                   T* out_tensor, int64_t* indices_tensor) {
-  framework::Tensor input_tmp;
-  framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp);
-  T* input_tmp_data = input_tmp.mutable_data<T>(ctx.GetPlace());
-  input_tmp.Resize(phi::make_ddim({num_rows, num_cols}));
-  thrust::device_ptr<T> out_tensor_ptr(out_tensor);
-  thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);
-  for (int64_t i = 0; i < num_rows; ++i) {
-    T* begin = input_tmp_data + num_cols * i;
-    T* end = input_tmp_data + num_cols * (i + 1);
-    thrust::device_vector<int64_t> indices_data(num_cols);
-    thrust::sequence(thrust::device, indices_data.begin(),
-                     indices_data.begin() + num_cols);
-    thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
-    int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1,
-                                           begin + 1, 0, thrust::plus<int>(),
-                                           thrust::not_equal_to<T>());
-    thrust::device_vector<T> keys_data(unique);
-    thrust::device_vector<int64_t> cnts_data(unique);
-    thrust::reduce_by_key(thrust::device, begin, end,
-                          thrust::constant_iterator<int>(1), keys_data.begin(),
-                          cnts_data.begin());
-    auto it = thrust::max_element(thrust::device, cnts_data.begin(),
-                                  cnts_data.begin() + unique);
-    T mode = keys_data[it - cnts_data.begin()];
-    int64_t counts = cnts_data[it - cnts_data.begin()];
-    auto pos = thrust::find(thrust::device, begin, end, mode);
-    int64_t index = indices_data[pos - begin + counts - 1];
-    out_tensor_ptr[i] = static_cast<T>(mode);
-    indices_tensor_ptr[i] = static_cast<int64_t>(index);
-  }
-}
-template <typename DeviceContext, typename T>
-class ModeOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* input = ctx.Input<framework::Tensor>("X");
-    auto* output = ctx.Output<framework::Tensor>("Out");
-    auto* indices = ctx.Output<framework::Tensor>("Indices");
-    int axis = static_cast<int>(ctx.Attr<int>("axis"));
-    bool keepdim = static_cast<bool>(ctx.Attr<bool>("keepdim"));
-    // get the input dims
-    const auto& in_dims = input->dims();
-    // calcluate the real axis
-    if (axis < 0) axis += in_dims.size();
-    auto out_dims = output->dims();
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-    if (axis == in_dims.size() - 1) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      const auto& dev_ctx = ctx.cuda_device_context();
-      getModebySort<T>(dev_ctx, input, input_width, input_height, output_data,
-                       indices_data);
-    } else {
-      std::vector<int> trans_axis;
-      for (int i = 0; i < axis; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(axis);
-      if (!keepdim) {
-        std::vector<int> tmp_out_shape;
-        for (int i = 0; i < axis; i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        tmp_out_shape.emplace_back(1);
-        for (int i = axis + 1; i < in_dims.size(); i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
-        output->Resize(tmp_out_dim);
-        indices->Resize(tmp_out_dim);
-      }
-      framework::DDim trans_shape(in_dims);
-      framework::DDim trans_out_shape(in_dims);
-      for (int i = 0; i < trans_axis.size(); i++) {
-        trans_shape[i] = in_dims[trans_axis[i]];
-        trans_out_shape[i] = in_dims[trans_axis[i]];
-      }
-      trans_out_shape[in_dims.size() - 1] = 1;
-      // second step, tranpose the input
-      framework::Tensor trans_input;
-      trans_input.mutable_data<T>(trans_shape, ctx.GetPlace());
-      int ndims = trans_axis.size();
-      const auto& dev_ctx = ctx.cuda_device_context();
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
-                                                   &trans_input, trans_axis);
-      framework::Tensor trans_ind;
-      int64_t* trans_ind_data =
-          trans_ind.mutable_data<int64_t>(trans_out_shape, ctx.GetPlace());
-      framework::Tensor trans_out;
-      T* trans_out_data =
-          trans_out.mutable_data<T>(trans_out_shape, ctx.GetPlace());
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
-      const int64_t input_width = trans_shape[trans_shape.size() - 1];
-      getModebySort<T>(dev_ctx, &trans_input, input_width, input_height,
-                       trans_out_data, trans_ind_data);
-      // last step, tranpose back the indices and output
-      TransCompute<platform::CUDADeviceContext, int64_t>(
-          ndims, dev_ctx, trans_ind, indices, trans_axis);
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
-                                                   output, trans_axis);
-      if (!keepdim) {
-        output->Resize(out_dims);
-        indices->Resize(out_dims);
-      }
-    }
-  }
-};
-template <typename DeviceContext, typename T>
-class ModeOpGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(context.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<framework::Tensor>("Indices");
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int axis = context.Attr<int>("axis");
-    const auto& in_dims = x->dims();
-    auto out_dims = indices->dims();
-    if (axis < 0) axis += in_dims.size();
-    // allocate the cuda memory for the x_grad
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    const T* out_grad_data = out_grad->data<T>();
-    const int64_t* indices_data = indices->data<int64_t>();
-    int pre, n, post;
-    GetDims(in_dims, axis, &pre, &n, &post);
-    // calcluate the block and grid num
-    auto& dev_ctx = context.cuda_device_context();
-    int block_size = ComputeBlockSize(post);
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
-    int grid_size = std::min(max_blocks, pre);
-    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
-        out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    mode, ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    mode_grad,
-    ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/mode_op.h
+++ b/paddle/fluid/operators/mode_op.h
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/transpose_op.h"
-namespace paddle {
-namespace operators {
-template <typename T, typename Type>
-static void getMode(Type input_height, Type input_width, int input_dim,
-                    const framework::Tensor* input, T* t_out, Type* t_indices) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    std::sort(col_vec.begin(), col_vec.end(),
-              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-                return (!std::isnan(static_cast<double>(l.first)) &&
-                        std::isnan(static_cast<double>(r.first))) ||
-                       (l.first < r.first);
-              });
-    T mode = 0;
-    int64_t indice = 0;
-    int64_t cur_freq = 0;
-    int64_t max_freq = 0;
-    for (int64_t i = 0; i < input_width; ++i) {
-      ++cur_freq;
-      if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) {
-        if (cur_freq > max_freq) {
-          max_freq = cur_freq;
-          mode = col_vec[i].first;
-          indice = col_vec[i].second;
-        }
-        cur_freq = 0;
-      }
-    }
-    t_out[i] = mode;
-    t_indices[i] = indice;
-  }
-}
-template <typename T, typename Type>
-static void ModeAssign(const Type& input_height, const Type& input_width,
-                       const int& input_dim, const framework::Tensor* input,
-                       const framework::Tensor* indices, T* output_data) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
-      output_data[i * input_width + e_indices(0)] = e_input(0);
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices =
-          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
-    }
-  }
-}
-template <typename DeviceContext, typename T>
-class ModeCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<framework::Tensor>("X");
-    auto* output = context.Output<framework::Tensor>("Out");
-    auto* indices = context.Output<framework::Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
-    // axis < 0, cacluate the real axis
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    auto out_dims = output->dims();
-    // if axis is not the last dim, transpose it to the last dim, do the
-    // calculation,
-    // then tranpose it back to orginal axis.
-    if (axis == in_dims.size() - 1) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      getMode<T, int64_t>(input_height, input_width, in_dims.size(), input,
-                          output_data, indices_data);
-    } else {
-      std::vector<int> trans_axis;
-      for (int i = 0; i < axis; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(axis);
-      if (!keepdim) {
-        std::vector<int> tmp_out_shape;
-        for (int i = 0; i < axis; i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        tmp_out_shape.emplace_back(1);
-        for (int i = axis + 1; i < in_dims.size(); i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
-        output->Resize(tmp_out_dim);
-        indices->Resize(tmp_out_dim);
-      }
-      // get the trans input_dims, out_dims
-      framework::DDim trans_shape(in_dims);
-      framework::DDim trans_out_shape(in_dims);
-      for (size_t i = 0; i < trans_axis.size(); i++) {
-        trans_shape[i] = in_dims[trans_axis[i]];
-        trans_out_shape[i] = in_dims[trans_axis[i]];
-      }
-      trans_out_shape[in_dims.size() - 1] = 1;
-      framework::Tensor trans_input;
-      trans_input.mutable_data<T>(trans_shape, context.GetPlace());
-      int ndims = trans_axis.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-      // transpose the input value
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
-                                                  &trans_input, trans_axis);
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
-      const int64_t input_width = trans_shape[trans_shape.size() - 1];
-      framework::Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_out_shape, context.GetPlace());
-      framework::Tensor tmp_indices;
-      auto* t_ind = tmp_indices.mutable_data<int64_t>(trans_out_shape,
-                                                      context.GetPlace());
-      getMode<T, int64_t>(input_height, input_width, in_dims.size(),
-                          &trans_input, t_out, t_ind);
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, tmp_indices, indices, trans_axis);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  output, trans_axis);
-      if (!keepdim) {
-        output->Resize(out_dims);
-        indices->Resize(out_dims);
-      }
-    }
-  }
-};
-template <typename DeviceContext, typename T>
-class ModeGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<framework::Tensor>("Indices");
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
-    auto in_dims = x->dims();
-    auto out_dims = indices->dims();
-    // axis < 0, get the real axis
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    if (!keepdim) {
-      std::vector<int> tmp_out_shape;
-      for (int i = 0; i < axis; i++) {
-        tmp_out_shape.emplace_back(out_dims[i]);
-      }
-      tmp_out_shape.emplace_back(1);
-      for (int i = axis + 1; i < in_dims.size(); i++) {
-        tmp_out_shape.emplace_back(out_dims[i - 1]);
-      }
-      out_dims = phi::make_ddim(tmp_out_shape);
-    }
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    if (axis == in_dims.size() - 1) {
-      // allocate the memory for the input_grad
-      // assign the out_grad to input_grad directly
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-      // init the output grad with 0, because some input elements has no grad
-      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
-      // Assign the output_grad to input_grad
-      if (keepdim) {
-        ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices,
-                   x_grad_data);
-      } else {
-        auto& dev_context =
-            context.template device_context<platform::CPUDeviceContext>();
-        framework::Tensor out_grad_tmp;
-        framework::Tensor indices_tmp;
-        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
-        indices_tmp.mutable_data<int64_t>(indices->dims(),
-                                          dev_context.GetPlace());
-        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
-                              &out_grad_tmp);
-        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
-                              &indices_tmp);
-        out_grad_tmp.Resize(out_dims);
-        indices_tmp.Resize(out_dims);
-        ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp,
-                   &indices_tmp, x_grad_data);
-      }
-    } else {
-      // can not assign grad to input_grad, must do the transpose
-      std::vector<int> trans_axis;
-      for (int i = 0; i < axis; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(out_dims.size() - 1);
-      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(axis);
-      framework::DDim trans_shape(out_dims);
-      framework::DDim trans_in_shape(in_dims);
-      for (size_t i = 0; i < trans_axis.size(); i++) {
-        trans_shape[i] = out_dims[trans_axis[i]];
-        trans_in_shape[i] = in_dims[trans_axis[i]];
-      }
-      // transpose the out_grad, indices
-      framework::Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_shape, context.GetPlace());
-      framework::Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_shape, context.GetPlace());
-      int ndims = trans_axis.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-      if (keepdim) {
-        // Do transpose
-        TransCompute<platform::CPUDeviceContext, T>(
-            ndims, dev_context, *out_grad, &trans_dO, trans_axis);
-        TransCompute<platform::CPUDeviceContext, int64_t>(
-            ndims, dev_context, *indices, &trans_ind, trans_axis);
-      } else {
-        framework::Tensor out_grad_tmp;
-        framework::Tensor indices_tmp;
-        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
-        indices_tmp.mutable_data<int64_t>(indices->dims(),
-                                          dev_context.GetPlace());
-        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
-                              &out_grad_tmp);
-        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
-                              &indices_tmp);
-        out_grad_tmp.Resize(out_dims);
-        indices_tmp.Resize(out_dims);
-        // Do transpose
-        TransCompute<platform::CPUDeviceContext, T>(
-            ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis);
-        TransCompute<platform::CPUDeviceContext, int64_t>(
-            ndims, dev_context, indices_tmp, &trans_ind, trans_axis);
-      }
-      const int64_t input_height = phi::product(
-          phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1));
-      const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1];
-      // Assign the out_grad to tranpose input_grad
-      framework::Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_in_shape, context.GetPlace());
-      memset(t_out, 0, x_grad->numel() * sizeof(T));
-      ModeAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                             &trans_dO, &trans_ind, t_out);
-      // Transpose back
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  x_grad, trans_axis);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -648,6 +648,49 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
  mask->set_dtype(paddle::experimental::CppTypeToDataType<int>::Type());
 }
+void ModeInferMeta(const MetaTensor& x,
+                   int axis,
+                   bool keepdim,
+                   MetaTensor* out,
+                   MetaTensor* indices) {
+  auto input_dims = x.dims();
+  const int& dim_size = input_dims.size();
+  PADDLE_ENFORCE_EQ(
+      (axis < dim_size) && (axis >= (-1 * dim_size)),
+      true,
+      errors::InvalidArgument(
+          "the axis of ModeOp must be [-%d, %d), but you set axis is %d",
+          dim_size,
+          dim_size,
+          axis));
+  PADDLE_ENFORCE_GE(
+      input_dims.size(),
+      1,
+      errors::InvalidArgument("input of ModeOp must have >= 1d shape"));
+  if (axis < 0) axis += dim_size;
+  std::vector<int64_t> dimvec;
+  for (int64_t i = 0; i < axis; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  if (keepdim) {
+    dimvec.emplace_back(static_cast<int64_t>(1));
+  }
+  for (int64_t i = axis + 1; i < dim_size; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  DDim dims = phi::make_ddim(dimvec);
+  PADDLE_ENFORCE_GE(input_dims.size(),
+                    1,
+                    errors::InvalidArgument("input shape should >= 1d"));
+  out->set_dims(dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+  indices->set_dims(dims);
+  indices->share_lod(x);
+  indices->set_dtype(x.dtype());
+}
 void MultinomialInferMeta(const MetaTensor& x,
                          int num_samples,
                          bool replacement,

--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -112,6 +112,12 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
                               MetaTensor* mask,
                               MetaConfig config = MetaConfig());
+void ModeInferMeta(const MetaTensor& x,
+                   int axis,
+                   bool keepdim,
+                   MetaTensor* out,
+                   MetaTensor* indices);
 void MultinomialInferMeta(const MetaTensor& x,
                          int num_samples,
                          bool replacement,

--- a/paddle/phi/kernels/cpu/mode_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/mode_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+namespace phi {
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad) {
+  auto in_dims = x.dims();
+  auto out_dims = indices.dims();
+  // axis < 0, get the real axis
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+  if (!keepdim) {
+    std::vector<int> tmp_out_shape;
+    for (int i = 0; i < axis; i++) {
+      tmp_out_shape.emplace_back(out_dims[i]);
+    }
+    tmp_out_shape.emplace_back(1);
+    for (int i = axis + 1; i < in_dims.size(); i++) {
+      tmp_out_shape.emplace_back(out_dims[i - 1]);
+    }
+    out_dims = phi::make_ddim(tmp_out_shape);
+  }
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+  if (axis == in_dims.size() - 1) {
+    // allocate the memory for the input_grad
+    // assign the out_grad to input_grad directly
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+    // init the output grad with 0, because some input elements has no grad
+    memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
+    // Assign the output_grad to input_grad
+    if (keepdim) {
+      funcs::ModeAssign(input_height,
+                        input_width,
+                        in_dims.size(),
+                        &out_grad,
+                        &indices,
+                        x_grad_data);
+    } else {
+      DenseTensor out_grad_tmp;
+      dev_ctx.template Alloc<T>(&out_grad_tmp);
+      DenseTensor indices_tmp;
+      dev_ctx.template Alloc<int64_t>(&indices_tmp);
+      phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp);
+      phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
+      out_grad_tmp.Resize(out_dims);
+      indices_tmp.Resize(out_dims);
+      funcs::ModeAssign(input_height,
+                        input_width,
+                        in_dims.size(),
+                        &out_grad_tmp,
+                        &indices_tmp,
+                        x_grad_data);
+    }
+  } else {
+    // can not assign grad to input_grad, must do the transpose
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(out_dims.size() - 1);
+    for (int i = axis + 1; i < out_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+    DDim trans_shape(out_dims);
+    DDim trans_in_shape(in_dims);
+    for (size_t i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = out_dims[trans_axis[i]];
+      trans_in_shape[i] = in_dims[trans_axis[i]];
+    }
+    // transpose the out_grad, indices
+    DenseTensor trans_dO;
+    trans_dO.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_dO);
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_shape);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+    int ndims = trans_axis.size();
+    if (keepdim) {
+      // Do transpose
+      funcs::TransCompute<CPUContext, T>(
+          ndims, dev_ctx, out_grad, &trans_dO, trans_axis);
+      funcs::TransCompute<CPUContext, int64_t>(
+          ndims, dev_ctx, indices, &trans_ind, trans_axis);
+    } else {
+      DenseTensor out_grad_tmp;
+      dev_ctx.template Alloc<T>(&out_grad_tmp);
+      DenseTensor indices_tmp;
+      dev_ctx.template Alloc<int64_t>(&indices_tmp);
+      phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp);
+      phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
+      out_grad_tmp.Resize(out_dims);
+      indices_tmp.Resize(out_dims);
+      // Do transpose
+      funcs::TransCompute<CPUContext, T>(
+          ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis);
+      funcs::TransCompute<CPUContext, int64_t>(
+          ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis);
+    }
+    const int64_t input_height = phi::product(
+        phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1));
+    const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1];
+    // Assign the out_grad to tranpose input_grad
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_in_shape);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+    memset(t_out, 0, x_grad->numel() * sizeof(T));
+    funcs::ModeAssign<T, int64_t>(input_height,
+                                  input_width,
+                                  in_dims.size(),
+                                  &trans_dO,
+                                  &trans_ind,
+                                  t_out);
+    // Transpose back
+    funcs::TransCompute<CPUContext, T>(
+        ndims, dev_ctx, tmp_out, x_grad, trans_axis);
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(mode_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ModeGradKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t) {}
--- a/paddle/phi/kernels/cpu/mode_kernel.cc
+++ b/paddle/phi/kernels/cpu/mode_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/mode_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+namespace phi {
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  const auto& in_dims = x.dims();
+  auto out_dims = out->dims();
+  // axis < 0, cacluate the real axis
+  if (axis < 0) axis += in_dims.size();
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+  // if axis is not the last dim, transpose it to the last dim, do the
+  // calculation, then tranpose it back to original axis.
+  if (axis == in_dims.size() - 1) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    funcs::GetMode<T, int64_t>(input_height,
+                               input_width,
+                               in_dims.size(),
+                               &x,
+                               output_data,
+                               indices_data);
+  } else {
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.push_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+    if (!keepdim) {
+      std::vector<int> tmp_out_shape;
+      for (int i = 0; i < axis; i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      tmp_out_shape.emplace_back(1);
+      for (int i = axis + 1; i < in_dims.size(); i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
+      out->Resize(tmp_out_dim);
+      indices->Resize(tmp_out_dim);
+    }
+    // get the trans input_dims, out_dims
+    DDim trans_shape(in_dims);
+    DDim trans_out_shape(in_dims);
+    for (size_t i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = in_dims[trans_axis[i]];
+      trans_out_shape[i] = in_dims[trans_axis[i]];
+    }
+    trans_out_shape[in_dims.size() - 1] = 1;
+    DenseTensor trans_input;
+    trans_input.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_input);
+    int ndims = trans_axis.size();
+    // transpose the input value
+    funcs::TransCompute<CPUContext, T>(
+        ndims, dev_ctx, x, &trans_input, trans_axis);
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
+    const int64_t input_width = trans_shape[trans_shape.size() - 1];
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_out_shape);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+    DenseTensor tmp_indices;
+    tmp_indices.Resize(trans_out_shape);
+    int64_t* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
+    funcs::GetMode<T, int64_t>(
+        input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind);
+    // transpose back
+    funcs::TransCompute<CPUContext, int64_t>(
+        ndims, dev_ctx, tmp_indices, indices, trans_axis);
+    funcs::TransCompute<CPUContext, T>(
+        ndims, dev_ctx, tmp_out, out, trans_axis);
+    if (!keepdim) {
+      out->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(
+    mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {}
--- a/paddle/phi/kernels/funcs/mode.h
+++ b/paddle/phi/kernels/funcs/mode.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/extrema.h>
+#include <thrust/functional.h>
+#include <thrust/inner_product.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#endif
+#include <algorithm>
+#include <cmath>
+#include <utility>
+#include <vector>
+#ifdef PADDLE_WITH_MKLML
+#include <omp.h>
+#endif
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+namespace phi {
+namespace funcs {
+static int ComputeBlockSize(int col) {
+  if (col > 512)
+    return 1024;
+  else if (col > 256 && col <= 512)
+    return 512;
+  else if (col > 128 && col <= 256)
+    return 256;
+  else if (col > 64 && col <= 128)
+    return 128;
+  else
+    return 64;
+}
+static inline void GetDims(
+    const phi::DDim& dim, int axis, int* pre, int* n, int* post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+template <typename T, typename Type>
+static void GetMode(Type input_height,
+                    Type input_width,
+                    int input_dim,
+                    const DenseTensor* input,
+                    T* t_out,
+                    Type* t_indices) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    std::vector<std::pair<T, Type>> col_vec;
+    col_vec.reserve(input_width);
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
+      }
+    }
+    std::sort(col_vec.begin(),
+              col_vec.end(),
+              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+                return (!std::isnan(static_cast<double>(l.first)) &&
+                        std::isnan(static_cast<double>(r.first))) ||
+                       (l.first < r.first);
+              });
+    T mode = 0;
+    int64_t indice = 0;
+    int64_t cur_freq = 0;
+    int64_t max_freq = 0;
+    for (int64_t i = 0; i < input_width; ++i) {
+      ++cur_freq;
+      if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) {
+        if (cur_freq > max_freq) {
+          max_freq = cur_freq;
+          mode = col_vec[i].first;
+          indice = col_vec[i].second;
+        }
+        cur_freq = 0;
+      }
+    }
+    t_out[i] = mode;
+    t_indices[i] = indice;
+  }
+}
+template <typename T, typename Type>
+static void ModeAssign(const Type& input_height,
+                       const Type& input_width,
+                       const int& input_dim,
+                       const DenseTensor* input,
+                       const DenseTensor* indices,
+                       T* output_data) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      output_data[i * input_width + e_indices(0)] = e_input(0);
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
+    }
+  }
+}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename T>
+static void GetModebySort(const phi::GPUContext& dev_ctx,
+                          const DenseTensor* input_tensor,
+                          const int64_t num_cols,
+                          const int64_t num_rows,
+                          T* out_tensor,
+                          int64_t* indices_tensor) {
+  DenseTensor input_tmp;
+  input_tmp.Resize(phi::make_ddim({num_rows, num_cols}));
+  T* input_tmp_data = dev_ctx.Alloc<T>(&input_tmp);
+  phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp);
+  thrust::device_ptr<T> out_tensor_ptr(out_tensor);
+  thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);
+  for (int64_t i = 0; i < num_rows; ++i) {
+    T* begin = input_tmp_data + num_cols * i;
+    T* end = input_tmp_data + num_cols * (i + 1);
+    thrust::device_vector<int64_t> indices_data(num_cols);
+    thrust::sequence(
+        thrust::device, indices_data.begin(), indices_data.begin() + num_cols);
+    thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
+    int unique = 1 + thrust::inner_product(thrust::device,
+                                           begin,
+                                           end - 1,
+                                           begin + 1,
+                                           0,
+                                           thrust::plus<int>(),
+                                           thrust::not_equal_to<T>());
+    thrust::device_vector<T> keys_data(unique);
+    thrust::device_vector<int64_t> cnts_data(unique);
+    thrust::reduce_by_key(thrust::device,
+                          begin,
+                          end,
+                          thrust::constant_iterator<int>(1),
+                          keys_data.begin(),
+                          cnts_data.begin());
+    auto it = thrust::max_element(
+        thrust::device, cnts_data.begin(), cnts_data.begin() + unique);
+    T mode = keys_data[it - cnts_data.begin()];
+    int64_t counts = cnts_data[it - cnts_data.begin()];
+    auto pos = thrust::find(thrust::device, begin, end, mode);
+    int64_t index = indices_data[pos - begin + counts - 1];
+    out_tensor_ptr[i] = static_cast<T>(mode);
+    indices_tensor_ptr[i] = static_cast<int64_t>(index);
+  }
+}
+#endif
+}  // namespace funcs
+}  // namespace phi
--- a/paddle/phi/kernels/gpu/mode_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/mode_grad_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+namespace phi {
+template <typename T>
+__global__ void AssignGradWithAxis(const T* grad_out,
+                                   const int64_t* indices,
+                                   T* grad_in,
+                                   int pre,
+                                   int post,
+                                   int raw_height,
+                                   int k) {
+  // raw_height is the length of topk axis
+  for (int i = blockIdx.x; i < pre; i += gridDim.x) {
+    int base_index = i * post * k;
+    int base_grad = i * post * raw_height;
+    for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
+      grad_in[base_grad + j] = static_cast<T>(0);
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
+      int64_t idx_ij = indices[base_index + j];
+      int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
+      grad_in[in_ij] = grad_out[base_index + j];
+    }
+  }
+}
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad) {
+  const auto& in_dims = x.dims();
+  auto out_dims = indices.dims();
+  if (axis < 0) axis += in_dims.size();
+  // allocate the cuda memory for the x_grad
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+  const T* out_grad_data = out_grad.data<T>();
+  const int64_t* indices_data = indices.data<int64_t>();
+  int pre, n, post;
+  funcs::GetDims(in_dims, axis, &pre, &n, &post);
+  // calcluate the block and grid num
+  int block_size = funcs::ComputeBlockSize(post);
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+  int grid_size = std::min(max_blocks, pre);
+  AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+      out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(mode_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ModeGradKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t) {}
--- a/paddle/phi/kernels/gpu/mode_kernel.cu
+++ b/paddle/phi/kernels/gpu/mode_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/mode_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+namespace phi {
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  // get the input dims
+  const auto& in_dims = x.dims();
+  // calcluate the real axis
+  if (axis < 0) axis += in_dims.size();
+  auto out_dims = out->dims();
+  const T* input_data = x.data<T>();
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+  if (axis == in_dims.size() - 1) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    funcs::GetModebySort<T>(
+        dev_ctx, &x, input_width, input_height, output_data, indices_data);
+  } else {
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+    if (!keepdim) {
+      std::vector<int> tmp_out_shape;
+      for (int i = 0; i < axis; i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      tmp_out_shape.emplace_back(1);
+      for (int i = axis + 1; i < in_dims.size(); i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
+      out->Resize(tmp_out_dim);
+      indices->Resize(tmp_out_dim);
+    }
+    DDim trans_shape(in_dims);
+    DDim trans_out_shape(in_dims);
+    for (int i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = in_dims[trans_axis[i]];
+      trans_out_shape[i] = in_dims[trans_axis[i]];
+    }
+    trans_out_shape[in_dims.size() - 1] = 1;
+    // second step, tranpose the input
+    DenseTensor trans_input;
+    trans_input.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_input);
+    int ndims = trans_axis.size();
+    funcs::TransCompute<Context, T>(
+        ndims, dev_ctx, x, &trans_input, trans_axis);
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_out_shape);
+    int64_t* trans_ind_data = dev_ctx.template Alloc<int64_t>(&trans_ind);
+    DenseTensor trans_out;
+    trans_out.Resize(trans_out_shape);
+    T* trans_out_data = dev_ctx.template Alloc<T>(&trans_out);
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
+    const int64_t input_width = trans_shape[trans_shape.size() - 1];
+    funcs::GetModebySort<T>(dev_ctx,
+                            &trans_input,
+                            input_width,
+                            input_height,
+                            trans_out_data,
+                            trans_ind_data);
+    // last step, tranpose back the indices and output
+    funcs::TransCompute<Context, int64_t>(
+        ndims, dev_ctx, trans_ind, indices, trans_axis);
+    funcs::TransCompute<Context, T>(ndims, dev_ctx, trans_out, out, trans_axis);
+    if (!keepdim) {
+      out->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(
+    mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {}
--- a/paddle/phi/kernels/mode_grad_kernel.h
+++ b/paddle/phi/kernels/mode_grad_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad);
+}  // namespace phi
--- a/paddle/phi/kernels/mode_kernel.h
+++ b/paddle/phi/kernels/mode_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices);
+}  // namespace phi
--- a/paddle/phi/ops/compat/mode_sig.cc
+++ b/paddle/phi/ops/compat/mode_sig.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/core/compat/op_utils.h"
+namespace phi {
+KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"});
+}
+KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("mode_grad",
+                         {"X", "Indices", GradVarName("Out")},
+                         {"axis", "keepdim"},
+                         {GradVarName("X")});
+}
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping);