Unverified commit d7ccd6bf authored by xiongkun, committed by GitHub

[phi] transfer kthvalue from fluid to phi (#40676)

* transfer kthvalue from fluid to phi

* transfer infershape
Parent 8c713223
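For orientation before the diff: every kernel moved in this commit performs the same core operation, namely selecting, for each row of a 2-D view of the input, the k-th smallest value along the last axis together with its index. Below is a minimal standalone C++ sketch of that selection using std::nth_element, as the CPU kernels in this diff do; the shapes, values, and the helper name KthValueLastAxis are illustrative only and not part of the commit.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Returns {value, index} of the k-th smallest element (1-based k) in each row
// of a row-major rows x cols matrix, mirroring the getKthvalue helper below.
std::vector<std::pair<float, int64_t>> KthValueLastAxis(
    const std::vector<float>& data, int64_t rows, int64_t cols, int k) {
  std::vector<std::pair<float, int64_t>> out;
  out.reserve(rows);
  for (int64_t i = 0; i < rows; ++i) {
    std::vector<std::pair<float, int64_t>> col_vec;
    col_vec.reserve(cols);
    for (int64_t j = 0; j < cols; ++j) {
      col_vec.emplace_back(data[i * cols + j], j);
    }
    // Partially order the row so the element at position k-1 is the k-th smallest.
    std::nth_element(
        col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
        [](const auto& l, const auto& r) { return l.first < r.first; });
    out.push_back(col_vec[k - 1]);
  }
  return out;
}

int main() {
  // One 2 x 4 row-major input; k = 2 picks the second-smallest value per row.
  std::vector<float> x = {3.f, 1.f, 4.f, 1.5f,
                          9.f, 2.f, 6.f, 5.f};
  for (const auto& [value, index] : KthValueLastAxis(x, 2, 4, 2)) {
    std::cout << "value=" << value << " index=" << index << "\n";
  }
  return 0;
}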
......@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/kthvalue_op.h"
#include <memory>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue");
OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue");
auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size();
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_LT(axis, dim_size,
paddle::platform::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size, dim_size, axis));
PADDLE_ENFORCE_GE(axis, -dim_size,
paddle::platform::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size, dim_size, axis));
if (axis < 0) axis += dim_size;
int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_GE(
k, 1, paddle::platform::errors::InvalidArgument(
"the k in the kthvalue must >= 1, but received %d .", k));
PADDLE_ENFORCE_GE(input_dims.size(), 1,
paddle::platform::errors::InvalidArgument(
"input of kthvalue must have >= 1d shape"));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
input_dims[axis], k,
paddle::platform::errors::InvalidArgument(
"input of kthvalue must have >= %d columns in axis of %d", k,
axis));
}
bool keepdim = ctx->Attrs().Get<bool>("keepdim");
std::vector<int64_t> dimvec;
for (int64_t i = 0; i < axis; i++) {
dimvec.emplace_back(input_dims[i]);
}
if (keepdim) {
dimvec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < dim_size; i++) {
dimvec.emplace_back(input_dims[i]);
}
framework::DDim dims = phi::make_ddim(dimvec);
ctx->SetOutputDim("Out", dims);
ctx->SetOutputDim("Indices", dims);
ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace operators
} // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor,
PD_INFER_META(phi::KthvalueInferMeta));
namespace ops = paddle::operators;
REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker,
ops::KthvalueGradOpMaker<paddle::framework::OpDesc>,
ops::KthvalueGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
kthvalue, ops::KthvalueCPUKernel<paddle::platform::CPUPlace, float>,
ops::KthvalueCPUKernel<paddle::platform::CPUPlace, double>,
ops::KthvalueCPUKernel<paddle::platform::CPUPlace, int32_t>,
ops::KthvalueCPUKernel<paddle::platform::CPUPlace, int64_t>);
ops::KthvalueGradOpMaker<paddle::imperative::OpBase>,
KthvalueInferShapeFunctor);
REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad);
REGISTER_OP_CPU_KERNEL(
kthvalue_grad,
ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, float>,
ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, double>,
ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, int32_t>,
ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, int64_t>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/kthvalue_op.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#endif
namespace paddle {
namespace operators {
int getBlockSize(int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
}
template <typename T>
bool SortKthvalue(const platform::CUDADeviceContext& ctx,
const framework::Tensor* input_tensor, const int64_t num_cols,
const int64_t num_rows, const int k,
framework::Tensor* out_tensor,
framework::Tensor* indices_tensor) {
auto cu_stream = ctx.stream();
framework::Tensor input_indices;
const std::vector<int64_t> dims = {num_rows, num_cols};
auto dim = phi::make_ddim(dims);
input_indices.Resize(dim);
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
int block_size = getBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
input_indices.data<int64_t>(), num_rows, num_cols);
cub::CountingInputIterator<int64_t> counting_iter(0);
cub::TransformInputIterator<int64_t, SegmentOffsetIter,
cub::CountingInputIterator<int64_t>>
segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
T* sorted_values_ptr;
int64_t* sorted_indices_ptr;
framework::Tensor temp_values, temp_indices;
const T* input = input_tensor->data<T>();
T* values = out_tensor->data<T>();
int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
temp_values.Resize(dim);
temp_indices.Resize(dim);
sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
auto err = cub::DeviceSegmentedRadixSort::SortPairs(
nullptr, temp_storage_bytes, input, sorted_values_ptr,
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
#ifdef __HIPCC__
if (err != hipSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
<< hipGetErrorString(err);
return false;
}
#else
if (err != cudaSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs, status: "
<< cudaGetErrorString(err);
return false;
}
#endif
framework::Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
err = cub::DeviceSegmentedRadixSort::SortPairs(
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
0, sizeof(T) * 8, cu_stream);
#ifdef __HIPCC__
if (err != hipSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"hipcub::DeviceSegmentedRadixSort::SortPairs, "
<< temp_storage_bytes << ", status: " << hipGetErrorString(err);
return false;
}
#else
if (err != cudaSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs, "
<< temp_storage_bytes << ", status: " << cudaGetErrorString(err);
return false;
}
#endif
auto& dev = *ctx.eigen_device();
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
auto e_indices = framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(
static_cast<const framework::Tensor>(temp_indices));
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
dim = phi::make_ddim(odims);
auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
auto e_tmp_values = framework::EigenMatrix<T>::From(
static_cast<const framework::Tensor>(temp_values));
EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
dev, e_values, e_tmp_values, slice_indices, slice_sizes);
return true;
}
template <typename DeviceContext, typename T>
class KthvalueOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument(
"It must use CUDAPlace, you must check your device set."));
auto* input = ctx.Input<framework::Tensor>("X");
auto* output = ctx.Output<framework::Tensor>("Out");
auto* indices = ctx.Output<framework::Tensor>("Indices");
int k = static_cast<int>(ctx.Attr<int>("k"));
int axis = static_cast<int>(ctx.Attr<int>("axis"));
bool keepdim = static_cast<bool>(ctx.Attr<bool>("keepdim"));
const auto& in_dims = input->dims();
if (axis < 0) axis += in_dims.size();
auto out_dims = output->dims();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
const auto& dev_ctx = ctx.cuda_device_context();
PADDLE_ENFORCE_EQ(SortKthvalue<T>(dev_ctx, input, input_width,
input_height, k, output, indices),
true, platform::errors::External(
"KthvalueOP: Error when use cub sorting"));
return;
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dims);
indices->Resize(tmp_out_dims);
}
framework::DDim trans_dims(in_dims);
framework::DDim trans_out_dims(in_dims);
for (int i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = in_dims[trans[i]];
}
trans_out_dims[in_dims.size() - 1] = 1;
framework::Tensor trans_input;
trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
int ndims = trans.size();
const auto& dev_ctx = ctx.cuda_device_context();
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
&trans_input, trans);
framework::Tensor trans_ind, trans_out;
trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
const int64_t input_height =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
PADDLE_ENFORCE_EQ(
SortKthvalue<T>(dev_ctx, &trans_input, input_width, input_height, k,
&trans_out, &trans_ind),
true,
platform::errors::External("KthvalueOP: Error when use cub sorting"));
TransCompute<platform::CUDADeviceContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans);
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
output, trans);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
};
template <typename DeviceContext, typename T>
class KthvalueOpGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(context.GetPlace()), true,
platform::errors::InvalidArgument(
"It must use CUDAPlace, you must check your device set."));
auto* x = context.Input<framework::Tensor>("X");
auto* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<framework::Tensor>("Indices");
auto* x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
int axis = context.Attr<int>("axis");
int k = static_cast<int>(context.Attr<int>("k"));
const auto& in_dims = x->dims();
auto out_dims = indices->dims();
if (axis < 0) axis += in_dims.size();
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const int64_t* indices_data = indices->data<int64_t>();
int pre, n, post;
GetDims(in_dims, axis, &pre, &n, &post);
auto& dev_ctx = context.cuda_device_context();
int block_size = getBlockSize(post * k);
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
int grid_size = std::min(max_blocks, pre);
AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
kthvalue,
ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
kthvalue_grad,
ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/transpose_op.h"
namespace paddle {
namespace operators {
template <typename T, typename Type>
static void getKthvalue(Type input_height, Type input_width, int input_dim,
const framework::Tensor* input, T* t_out,
Type* t_indices, const int& k) {
bool partial_sort_flag = (k * 64) < input_width;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
auto e_input = framework::EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
}
if (partial_sort_flag) {
std::partial_sort(
col_vec.begin(), col_vec.begin() + k, col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
} else {
std::nth_element(
col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
}
t_out[i] = col_vec[k - 1].first;
t_indices[i] = col_vec[k - 1].second;
}
}
template <typename T, typename Type>
static void kthvalueAssign(const Type& input_height, const Type& input_width,
const int& input_dim, const framework::Tensor* input,
const framework::Tensor* indices, T* output_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
auto e_input = framework::EigenVector<T>::Flatten(*input);
auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
output_data[i * input_width + e_indices(0)] = e_input(0);
} else {
auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
auto e_indices =
framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
}
}
}
template <typename DeviceContext, typename T>
class KthvalueCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<framework::Tensor>("X");
auto* output = context.Output<framework::Tensor>("Out");
auto* indices = context.Output<framework::Tensor>("Indices");
const auto& in_dims = input->dims();
int k = static_cast<int>(context.Attr<int>("k"));
bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
int axis = static_cast<int>(context.Attr<int>("axis"));
if (axis < 0) axis += in_dims.size();
T* output_data = output->mutable_data<T>(context.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
auto out_dims = output->dims();
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(), input,
output_data, indices_data, k);
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dims);
indices->Resize(tmp_out_dims);
}
framework::DDim trans_dims(in_dims);
framework::DDim trans_out_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = in_dims[trans[i]];
}
trans_out_dims[in_dims.size() - 1] = 1;
framework::Tensor trans_inp;
trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
int ndims = trans.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
&trans_inp, trans);
const int64_t input_height =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
framework::Tensor tmp_out, tmp_indices;
T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
auto* t_ind =
tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_inp, t_out, t_ind, k);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, tmp_indices, indices, trans);
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
output, trans);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
};
template <typename DeviceContext, typename T>
class KthvalueGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<framework::Tensor>("X");
auto* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<framework::Tensor>("Indices");
auto* x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
int axis = static_cast<int>(context.Attr<int>("axis"));
bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
auto in_dims = x->dims();
auto out_dims = indices->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(out_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(out_dims[i - 1]);
}
out_dims = phi::make_ddim(tmp_out_shape);
}
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
if (axis == in_dims.size() - 1) {
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
if (keepdim) {
kthvalueAssign(input_height, input_width, in_dims.size(), out_grad,
indices, x_grad_data);
} else {
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
framework::Tensor out_grad_tmp, indices_tmp;
out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
indices_tmp.mutable_data<int64_t>(indices->dims(),
dev_context.GetPlace());
framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
&out_grad_tmp);
framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
&indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp,
&indices_tmp, x_grad_data);
}
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(out_dims.size() - 1);
for (int i = axis + 1; i < out_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
framework::DDim trans_dims(out_dims);
framework::DDim trans_in_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = out_dims[trans[i]];
trans_in_dims[i] = in_dims[trans[i]];
}
framework::Tensor trans_dO, trans_ind;
trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
int ndims = trans.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
if (keepdim) {
TransCompute<platform::CPUDeviceContext, T>(
ndims, dev_context, *out_grad, &trans_dO, trans);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, *indices, &trans_ind, trans);
} else {
framework::Tensor out_grad_tmp, indices_tmp;
out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
indices_tmp.mutable_data<int64_t>(indices->dims(),
dev_context.GetPlace());
framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
&out_grad_tmp);
framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
&indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
TransCompute<platform::CPUDeviceContext, T>(
ndims, dev_context, out_grad_tmp, &trans_dO, trans);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, indices_tmp, &trans_ind, trans);
}
const int64_t input_height = phi::product(
phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
framework::Tensor tmp_out;
T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
memset(t_out, 0, x_grad->numel() * sizeof(T));
kthvalueAssign<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_dO, &trans_ind, t_out);
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
x_grad, trans);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -554,6 +554,67 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) {
out->set_dtype(DataType::BOOL);
}
void KthvalueInferMeta(const MetaTensor& x,
int k,
int axis,
bool keepdim,
MetaTensor* out,
MetaTensor* indices,
MetaConfig config) {
auto input_dims = x.dims();
const int& dim_size = input_dims.size();
PADDLE_ENFORCE_LT(axis,
dim_size,
phi::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size,
dim_size,
axis));
PADDLE_ENFORCE_GE(axis,
-dim_size,
phi::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size,
dim_size,
axis));
if (axis < 0) axis += dim_size;
PADDLE_ENFORCE_GE(
k,
1,
phi::errors::InvalidArgument(
"the k in the kthvalue must >= 1, but received %d .", k));
PADDLE_ENFORCE_GE(
input_dims.size(),
1,
phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape"));
if (config.is_runtime) {
PADDLE_ENFORCE_GE(
input_dims[axis],
k,
phi::errors::InvalidArgument(
"input of kthvalue must have >= %d columns in axis of %d",
k,
axis));
}
std::vector<int64_t> dimvec;
for (int64_t i = 0; i < axis; i++) {
dimvec.emplace_back(input_dims[i]);
}
if (keepdim) {
dimvec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < dim_size; i++) {
dimvec.emplace_back(input_dims[i]);
}
DDim dims = phi::make_ddim(dimvec);
out->set_dims(dims);
out->share_lod(x);
out->set_dtype(x.dtype());
indices->set_dims(dims);
indices->share_lod(x);
indices->set_dtype(x.dtype());
}
void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) {
auto dims = x.dims();
auto n_dim = dims.size();
......
......@@ -100,6 +100,14 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out);
void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out);
void KthvalueInferMeta(const MetaTensor& x,
int k,
int axis,
bool keepdim,
MetaTensor* out,
MetaTensor* indices,
MetaConfig = MetaConfig());
void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out);
void MaxPoolWithIndexInferMeta(const MetaTensor& x,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Type>
static void kthvalueAssign(const Type& input_height,
const Type& input_width,
const int& input_dim,
const DenseTensor* input,
const DenseTensor* indices,
T* output_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
auto e_indices = EigenVector<Type>::Flatten(*indices);
output_data[i * input_width + e_indices(0)] = e_input(0);
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
}
}
}
template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const DenseTensor& x,
const DenseTensor& indices,
int k,
int axis,
bool keepdim,
DenseTensor* d_x) {
auto in_dims = x.dims();
auto out_dims = indices.dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(out_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(out_dims[i - 1]);
}
out_dims = phi::make_ddim(tmp_out_shape);
}
T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
if (axis == in_dims.size() - 1) {
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
memset(x_grad_data, 0, d_x->numel() * sizeof(T));
if (keepdim) {
kthvalueAssign(input_height,
input_width,
in_dims.size(),
&d_out,
&indices,
x_grad_data);
} else {
DenseTensor out_grad_tmp, indices_tmp;
out_grad_tmp.Resize(d_out.dims());
indices_tmp.Resize(indices.dims());
dev_ctx.template Alloc<T>(&out_grad_tmp);
dev_ctx.template Alloc<int64_t>(&indices_tmp);
Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp);
Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
kthvalueAssign(input_height,
input_width,
in_dims.size(),
&out_grad_tmp,
&indices_tmp,
x_grad_data);
}
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(out_dims.size() - 1);
for (int i = axis + 1; i < out_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
DDim trans_dims(out_dims);
DDim trans_in_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = out_dims[trans[i]];
trans_in_dims[i] = in_dims[trans[i]];
}
DenseTensor trans_dO, trans_ind;
trans_dO.Resize(trans_dims);
trans_ind.Resize(trans_dims);
dev_ctx.template Alloc<T>(&trans_dO);
dev_ctx.template Alloc<int64_t>(&trans_ind);
int ndims = trans.size();
if (keepdim) {
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, d_out, &trans_dO, trans);
funcs::TransCompute<phi::CPUContext, int64_t>(
ndims, dev_ctx, indices, &trans_ind, trans);
} else {
DenseTensor out_grad_tmp, indices_tmp;
out_grad_tmp.Resize(d_out.dims());
indices_tmp.Resize(indices.dims());
dev_ctx.template Alloc<T>(&out_grad_tmp);
dev_ctx.template Alloc<int64_t>(&indices_tmp);
Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp);
Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, out_grad_tmp, &trans_dO, trans);
funcs::TransCompute<phi::CPUContext, int64_t>(
ndims, dev_ctx, indices_tmp, &trans_ind, trans);
}
const int64_t input_height = phi::product(
phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
DenseTensor tmp_out;
tmp_out.Resize(trans_in_dims);
T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
memset(t_out, 0, d_x->numel() * sizeof(T));
kthvalueAssign<T, int64_t>(input_height,
input_width,
in_dims.size(),
&trans_dO,
&trans_ind,
t_out);
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, tmp_out, d_x, trans);
}
}
} // namespace phi
PD_REGISTER_KERNEL(kthvalue_grad,
CPU,
ALL_LAYOUT,
phi::KthvalueGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/kthvalue_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Type>
static void getKthvalue(Type input_height,
Type input_width,
int input_dim,
const DenseTensor* input,
T* t_out,
Type* t_indices,
const int& k) {
bool partial_sort_flag = (k * 64) < input_width;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
}
if (partial_sort_flag) {
std::partial_sort(
col_vec.begin(),
col_vec.begin() + k,
col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
} else {
std::nth_element(
col_vec.begin(),
col_vec.begin() + k - 1,
col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
}
t_out[i] = col_vec[k - 1].first;
t_indices[i] = col_vec[k - 1].second;
}
}
template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
const DenseTensor& x,
int k,
int axis,
bool keepdim,
DenseTensor* output,
DenseTensor* indices) {
const auto& in_dims = x.dims();
if (axis < 0) axis += in_dims.size();
T* output_data = dev_ctx.template Alloc<T>(output);
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
auto out_dims = output->dims();
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
getKthvalue<T, int64_t>(input_height,
input_width,
in_dims.size(),
&x,
output_data,
indices_data,
k);
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dims);
indices->Resize(tmp_out_dims);
}
DDim trans_dims(in_dims);
DDim trans_out_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = in_dims[trans[i]];
}
trans_out_dims[in_dims.size() - 1] = 1;
DenseTensor trans_inp;
trans_inp.Resize(trans_dims);
dev_ctx.template Alloc<T>(&trans_inp);
int ndims = trans.size();
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, x, &trans_inp, trans);
const int64_t input_height =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
DenseTensor tmp_out, tmp_indices;
tmp_out.Resize(trans_out_dims);
T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
tmp_indices.Resize(trans_out_dims);
int64_t* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
getKthvalue<T, int64_t>(
input_height, input_width, in_dims.size(), &trans_inp, t_out, t_ind, k);
funcs::TransCompute<phi::CPUContext, int64_t>(
ndims, dev_ctx, tmp_indices, indices, trans);
funcs::TransCompute<phi::CPUContext, T>(
ndims, dev_ctx, tmp_out, output, trans);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(kthvalue,
CPU,
ALL_LAYOUT,
phi::KthvalueKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
static int getBlockSize(int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
}
template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const DenseTensor& x,
const DenseTensor& indices,
int k,
int axis,
bool keepdim,
DenseTensor* d_x) {
const auto& in_dims = x.dims();
auto out_dims = indices.dims();
if (axis < 0) axis += in_dims.size();
T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
const T* out_grad_data = d_out.data<T>();
const int64_t* indices_data = indices.data<int64_t>();
int pre, n, post;
paddle::operators::GetDims(in_dims, axis, &pre, &n, &post);
int block_size = getBlockSize(post * k);
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
int grid_size = std::min(max_blocks, pre);
paddle::operators::AssignGradWithAxis<
T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
}
} // namespace phi
PD_REGISTER_KERNEL(kthvalue_grad,
GPU,
ALL_LAYOUT,
phi::KthvalueGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/kthvalue_kernel.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
inline int getBlockSize(int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
}
template <typename T>
bool SortKthvalue(const phi::GPUContext& dev_ctx,
const DenseTensor* input_tensor,
const int64_t num_cols,
const int64_t num_rows,
const int k,
DenseTensor* out_tensor,
DenseTensor* indices_tensor) {
auto cu_stream = dev_ctx.stream();
DenseTensor input_indices;
const std::vector<int64_t> dims = {num_rows, num_cols};
auto dim = phi::make_ddim(dims);
input_indices.Resize(dim);
dev_ctx.template Alloc<int64_t>(&input_indices);
size_t temp_storage_bytes = -1;
int block_size = getBlockSize(num_cols);
unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0];
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
paddle::operators::InitIndex<
int64_t><<<grid_size, block_size, 0, cu_stream>>>(
input_indices.data<int64_t>(), num_rows, num_cols);
cub::CountingInputIterator<int64_t> counting_iter(0);
cub::TransformInputIterator<int64_t,
paddle::operators::SegmentOffsetIter,
cub::CountingInputIterator<int64_t>>
segment_offsets_t(counting_iter,
paddle::operators::SegmentOffsetIter(num_cols));
T* sorted_values_ptr;
int64_t* sorted_indices_ptr;
DenseTensor temp_values, temp_indices;
const T* input = input_tensor->data<T>();
T* values = out_tensor->data<T>();
int64_t* indices = indices_tensor->mutable_data<int64_t>(dev_ctx.GetPlace());
temp_values.Resize(dim);
temp_indices.Resize(dim);
sorted_values_ptr = dev_ctx.template Alloc<T>(&temp_values);
sorted_indices_ptr = dev_ctx.template Alloc<int64_t>(&temp_indices);
auto err =
cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
temp_storage_bytes,
input,
sorted_values_ptr,
input_indices.data<int64_t>(),
sorted_indices_ptr,
num_cols * num_rows,
num_rows,
segment_offsets_t,
segment_offsets_t + 1,
0,
sizeof(T) * 8,
cu_stream);
#ifdef __HIPCC__
if (err != hipSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
<< hipGetErrorString(err);
return false;
}
#else
if (err != cudaSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs, status: "
<< cudaGetErrorString(err);
return false;
}
#endif
DenseTensor temp_storage;
temp_storage.Resize({static_cast<int>(temp_storage_bytes / sizeof(uint8_t))});
uint8_t* temp_storage_data = dev_ctx.template Alloc<uint8_t>(&temp_storage);
err = cub::DeviceSegmentedRadixSort::SortPairs(temp_storage_data,
temp_storage_bytes,
input,
sorted_values_ptr,
input_indices.data<int64_t>(),
sorted_indices_ptr,
num_cols * num_rows,
num_rows,
segment_offsets_t,
segment_offsets_t + 1,
0,
sizeof(T) * 8,
cu_stream);
#ifdef __HIPCC__
if (err != hipSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"hipcub::DeviceSegmentedRadixSort::SortPairs, "
<< temp_storage_bytes << ", status: " << hipGetErrorString(err);
return false;
}
#else
if (err != cudaSuccess) {
LOG(ERROR) << "KthvalueOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs, "
<< temp_storage_bytes << ", status: " << cudaGetErrorString(err);
return false;
}
#endif
auto& dev = *dev_ctx.eigen_device();
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
auto e_tmp_indices =
EigenMatrix<int64_t>::From(static_cast<const DenseTensor>(temp_indices));
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
dim = phi::make_ddim(odims);
auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
auto e_tmp_values =
EigenMatrix<T>::From(static_cast<const DenseTensor>(temp_values));
funcs::EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
funcs::EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
dev, e_values, e_tmp_values, slice_indices, slice_sizes);
return true;
}
template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
const DenseTensor& x,
int k,
int axis,
bool keepdim,
DenseTensor* output,
DenseTensor* indices) {
const auto& in_dims = x.dims();
if (axis < 0) axis += in_dims.size();
auto out_dims = output->dims();
const T* input_data = x.data<T>();
T* output_data = dev_ctx.template Alloc<T>(output);
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
PADDLE_ENFORCE_EQ(
SortKthvalue<T>(
dev_ctx, &x, input_width, input_height, k, output, indices),
true,
phi::errors::External("KthvalueOP: Error when use cub sorting"));
return;
} else {
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dims);
indices->Resize(tmp_out_dims);
}
DDim trans_dims(in_dims);
DDim trans_out_dims(in_dims);
for (int i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = in_dims[trans[i]];
}
trans_out_dims[in_dims.size() - 1] = 1;
DenseTensor trans_input;
trans_input.mutable_data<T>(trans_dims, dev_ctx.GetPlace());
int ndims = trans.size();
funcs::TransCompute<phi::GPUContext, T>(
ndims, dev_ctx, x, &trans_input, trans);
DenseTensor trans_ind, trans_out;
trans_ind.mutable_data<int64_t>(trans_out_dims, dev_ctx.GetPlace());
trans_out.mutable_data<T>(trans_out_dims, dev_ctx.GetPlace());
const int64_t input_height =
phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
PADDLE_ENFORCE_EQ(
SortKthvalue<T>(dev_ctx,
&trans_input,
input_width,
input_height,
k,
&trans_out,
&trans_ind),
true,
phi::errors::External("KthvalueOP: Error when use cub sorting"));
funcs::TransCompute<phi::GPUContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans);
funcs::TransCompute<phi::GPUContext, T>(
ndims, dev_ctx, trans_out, output, trans);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(kthvalue,
GPU,
ALL_LAYOUT,
phi::KthvalueKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const DenseTensor& x,
const DenseTensor& indices,
int k,
int axis,
bool keepdim,
DenseTensor* d_x);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
const DenseTensor& x,
int k,
int axis,
bool keepdim,
DenseTensor* out,
DenseTensor* indices);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature KthvalueGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("kthvalue_grad",
{GradVarName("Out"), "X", "Indices"},
{"k", "axis", "keepdim"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping);