From 74894cd7e9fec3fdead32545ff546d6bc12d8b00 Mon Sep 17 00:00:00 2001
From: csy0225 <78470701+csy0225@users.noreply.github.com>
Date: Thu, 31 Mar 2022 17:50:43 +0800
Subject: [PATCH] fix conflict (#40851)

---
 paddle/fluid/operators/range_op.cc            |  57 +-
 paddle/fluid/operators/range_op.cu            |  61 --
 paddle/fluid/operators/range_op_npu_test.cc   |   2 +-
 paddle/fluid/operators/stack_op.cc            |  70 +-
 paddle/fluid/operators/stack_op.cu            | 207 ------
 paddle/fluid/operators/stack_op.h             | 120 ----
 paddle/fluid/operators/stack_op_npu.cc        |   2 +-
 paddle/fluid/operators/stack_op_xpu.cc        |   2 +-
 paddle/fluid/operators/unique_op.cc           | 113 ++--
 paddle/fluid/operators/unique_op.cu           | 474 --------------
 paddle/fluid/operators/unstack_op.cc          |  59 +-
 paddle/fluid/operators/unstack_op.cu          |  32 -
 paddle/fluid/operators/unstack_op.h           | 174 -----
 paddle/fluid/operators/unstack_op_npu.cc      |   2 +-
 paddle/phi/core/utils/data_type.h             |  21 +
 paddle/phi/infermeta/multiary.cc              |  46 ++
 paddle/phi/infermeta/multiary.h               |   4 +
 paddle/phi/infermeta/ternary.cc               |  50 ++
 paddle/phi/infermeta/ternary.h                |   5 +
 paddle/phi/infermeta/unary.cc                 | 126 ++++
 paddle/phi/infermeta/unary.h                  |  28 +
 paddle/phi/kernels/cpu/range_kernel.cc        |  45 ++
 paddle/phi/kernels/cpu/stack_grad_kernel.cc   |  59 ++
 paddle/phi/kernels/cpu/stack_kernel.cc        |  62 ++
 paddle/phi/kernels/cpu/unique_kernel.cc       | 131 ++++
 paddle/phi/kernels/cpu/unstack_grad_kernel.cc |  27 +
 paddle/phi/kernels/cpu/unstack_kernel.cc      |  22 +
 paddle/phi/kernels/funcs/range_function.h     |  49 ++
 paddle/phi/kernels/funcs/stack_functor.h      |  83 +++
 paddle/phi/kernels/funcs/unique_functor.h     | 426 ++++++++++++
 paddle/phi/kernels/gpu/range_kernel.cu        |  57 ++
 paddle/phi/kernels/gpu/stack_grad_kernel.cu   | 143 ++++
 paddle/phi/kernels/gpu/stack_kernel.cu        | 113 ++++
 paddle/phi/kernels/gpu/unique_kernel.cu       | 615 ++++++++++++++++++
 paddle/phi/kernels/gpu/unstack_grad_kernel.cu |  29 +
 paddle/phi/kernels/gpu/unstack_kernel.cu      |  29 +
 .../kernels/impl/unstack_grad_kernel_impl.h   |  70 ++
 paddle/phi/kernels/impl/unstack_kernel_impl.h |  62 ++
 paddle/phi/kernels/range_kernel.h             |  28 +
 paddle/phi/kernels/stack_grad_kernel.h        |  27 +
 paddle/phi/kernels/stack_kernel.h             |  27 +
 paddle/phi/kernels/unique_kernel.h            |  48 ++
 paddle/phi/kernels/unstack_grad_kernel.h      |  27 +
 paddle/phi/kernels/unstack_kernel.h           |  28 +
 paddle/phi/ops/compat/stack_sig.cc            |  23 +
 paddle/phi/ops/compat/unique_sig.cc           |  42 ++
 paddle/phi/ops/compat/unstack_sig.cc          |  24 +
 47 files changed, 2641 insertions(+), 1310 deletions(-)
 delete mode 100644 paddle/fluid/operators/range_op.cu
 delete mode 100644 paddle/fluid/operators/stack_op.cu
 delete mode 100644 paddle/fluid/operators/stack_op.h
 delete mode 100644 paddle/fluid/operators/unique_op.cu
 delete mode 100644 paddle/fluid/operators/unstack_op.cu
 delete mode 100644 paddle/fluid/operators/unstack_op.h
 create mode 100644 paddle/phi/kernels/cpu/range_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/stack_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/stack_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/unique_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/unstack_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/unstack_kernel.cc
 create mode 100644 paddle/phi/kernels/funcs/range_function.h
 create mode 100644 paddle/phi/kernels/funcs/stack_functor.h
 create mode 100644 paddle/phi/kernels/funcs/unique_functor.h
 create mode 100644 paddle/phi/kernels/gpu/range_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/stack_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/stack_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/unique_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/unstack_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/unstack_kernel.cu
 create mode 100644 paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
 create mode 100644 paddle/phi/kernels/impl/unstack_kernel_impl.h
 create mode 100644 paddle/phi/kernels/range_kernel.h
 create mode 100644 paddle/phi/kernels/stack_grad_kernel.h
 create mode 100644 paddle/phi/kernels/stack_kernel.h
 create mode 100644 paddle/phi/kernels/unique_kernel.h
 create mode 100644 paddle/phi/kernels/unstack_grad_kernel.h
 create mode 100644 paddle/phi/kernels/unstack_kernel.h
 create mode 100644 paddle/phi/ops/compat/stack_sig.cc
 create mode 100644 paddle/phi/ops/compat/unique_sig.cc
 create mode 100644 paddle/phi/ops/compat/unstack_sig.cc

diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc
index 3c2fe8b9e5..ddfbdbace0 100644
--- a/paddle/fluid/operators/range_op.cc
+++ b/paddle/fluid/operators/range_op.cc
@@ -14,6 +14,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/range_op.h"
 #include
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,51 +26,6 @@ class RangeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->HasInput("Start")) {
-      auto s_dims = ctx->GetInputDim("Start");
-      PADDLE_ENFORCE_EQ(
-          s_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The dim of the shape of Input(Start) should be 1, but got %d",
-              s_dims.size()));
-
-      PADDLE_ENFORCE_EQ(s_dims[0], 1,
-                        platform::errors::InvalidArgument(
-                            "The first dim of the shape of Input(Start) should "
-                            "be 1, but got %d",
-                            s_dims[0]));
-    }
-    if (ctx->HasInput("End")) {
-      auto e_dims = ctx->GetInputDim("End");
-      PADDLE_ENFORCE_EQ(
-          e_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The dim of the shape of Input(End) should be 1, but got %d",
-              e_dims.size()));
-
-      PADDLE_ENFORCE_EQ(e_dims[0], 1, platform::errors::InvalidArgument(
-                                          "The first dim of the shape of "
-                                          "Input(End) should be 1, but got %d",
-                                          e_dims[0]));
-    }
-    if (ctx->HasInput("Step")) {
-      auto step_dims = ctx->GetInputDim("Step");
-      PADDLE_ENFORCE_EQ(
-          step_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The dim of the shape of Input(Step) should be 1, but got %d",
-              step_dims.size()));
-
-      PADDLE_ENFORCE_EQ(step_dims[0], 1,
-                        platform::errors::InvalidArgument(
-                            "The first dim of the shape of Input(Step) should "
-                            "be 1, but got %d",
-                            step_dims[0]));
-    }
-    ctx->SetOutputDim("Out", {-1});
-  }
-
  protected:
   framework::OpKernelType GetKernelTypeForVar(
       const std::string &var_name, const framework::Tensor &tensor,
@@ -101,7 +60,7 @@ class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);
-REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel,
-                       ops::CPURangeKernel, ops::CPURangeKernel,
-                       ops::CPURangeKernel);
+DECLARE_INFER_SHAPE_FUNCTOR(range, RangeInferMetaFunctor,
+                            PD_INFER_META(phi::RangeInferMeta));
+REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker,
+                             RangeInferMetaFunctor);
diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
deleted file
mode 100644 index 1b1d41ae4c..0000000000 --- a/paddle/fluid/operators/range_op.cu +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/range_op.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -template -__global__ void RangeKernel(T start, T step, int64_t size, T* out) { - CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; } -} - -template -class CUDARangeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* end_t = context.Input("End"); - auto* step_t = context.Input("Step"); - auto* out = context.Output("Out"); - - T start = GetValue(start_t); - T end = GetValue(end_t); - T step = GetValue(step_t); - - int64_t size = 0; - GetSize(start, end, step, &size); - out->Resize(phi::make_ddim({size})); - T* out_data = out->mutable_data(context.GetPlace()); - - auto stream = context.cuda_device_context().stream(); - int block = std::min(size, static_cast(256)); - int grid = (size + block - 1) / block; - RangeKernel<<>>(start, step, size, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel, - ops::CUDARangeKernel, - ops::CUDARangeKernel, - ops::CUDARangeKernel); diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index c7e91ba35d..ac32170d93 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -30,7 +30,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(range); +USE_OP_ITSELF(range); USE_OP_DEVICE_KERNEL(range, NPU); template diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index af03ed668e..a9fa78c4e4 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/stack_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace plat = paddle::platform; namespace ops = paddle::operators; @@ -26,52 +29,6 @@ class StackOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0, - platform::errors::InvalidArgument( - "Number of Inputs(X) must be larger than 0, but" - " received value is:%d.", - ctx->Inputs("X").size())); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true, - platform::errors::InvalidArgument( - "Output(Y) of stack_op should not be null.")); - - auto input_dims = ctx->GetInputsDim("X"); - for (size_t i = 1; i < input_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], - platform::errors::InvalidArgument( - "Dims of all Inputs(X) must be the same, but" - " received input %d dim is:%d not equal to input 0" - " dim:%d.", - i, input_dims[i], input_dims[0])); - } - - // Only lod of X[0] would be shared with Y - ctx->ShareLoD("X", /*->*/ "Y"); - - int axis = ctx->Attrs().Get("axis"); - int rank = input_dims[0].size(); - PADDLE_ENFORCE_GE( - axis, -(rank + 1), - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " - "but received axis is:%d.", - rank, axis)); - - PADDLE_ENFORCE_LT( - axis, rank + 1, - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " - "but received axis is:%d", - rank, axis)); - - if (axis < 0) axis += (rank + 1); - - auto vec = phi::vectorize(input_dims[0]); - vec.insert(vec.begin() + axis, input_dims.size()); - ctx->SetOutputDim("Y", phi::make_ddim(vec)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -168,21 +125,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(stack, StackInferMetaFunctor, + PD_INFER_META(phi::StackInferMeta)); REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, ops::StackGradOpMaker, - ops::StackGradOpMaker); + ops::StackGradOpMaker, + StackInferMetaFunctor); REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); - -REGISTER_OP_CPU_KERNEL( - stack, ops::StackKernel, - ops::StackKernel, - ops::StackKernel, - ops::StackKernel, - ops::StackKernel); - -REGISTER_OP_CPU_KERNEL( - stack_grad, ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu deleted file mode 100644 index a56dd6aef4..0000000000 --- a/paddle/fluid/operators/stack_op.cu +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/fluid/operators/stack_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -namespace paddle { -namespace operators { - -template -__global__ void StackCUDAKernel(T** input_ptrs, int split_size, int rows, - int cols, T* __restrict__ output) { - IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x; - - for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) { - IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y; - - IntType split = grid_x / split_size; - const T* input_ptr = input_ptrs[split]; - IntType col_offset = grid_x % split_size; -#pragma unroll - for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) { - output[grid_y * cols + grid_x] = - input_ptr[grid_y * split_size + col_offset]; - } - } -} - -template -class StackGPUKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto* y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) { - x_datas[i] = x[i]->data(); - } - - auto& dev_ctx = ctx.template device_context(); - auto tmp_x_data = memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); - memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), platform::CPUPlace(), - reinterpret_cast(x_datas.data()), - x_datas.size() * sizeof(T*), dev_ctx.stream()); - - // Split x dim from axis to matrix - int x_row = 1, x_col = 1; - for (int i = 0; i < axis; ++i) { - x_row *= x[0]->dims()[i]; - } - x_col = x[0]->numel() / x_row; - int out_col = x_col * n; - - auto config = GetGpuLaunchConfig2D(dev_ctx, out_col, x_row); - - if (y->numel() < std::numeric_limits::max()) { - StackCUDAKernel<<>>( - reinterpret_cast(tmp_x_data->ptr()), x_col, x_row, out_col, - y_data); - } else { - StackCUDAKernel<<>>( - reinterpret_cast(tmp_x_data->ptr()), x_col, x_row, out_col, - y_data); - } - } -}; - -template -__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, - int pre_dim_size, int split_dim_size, - int suf_dim_size, int num_split, - T** output_ptrs) { - assert(blockDim.y == 1); - assert(blockDim.z == 1); - // In this case they are equal - assert(split_dim_size % num_split == 0); - - IntType size = pre_dim_size * split_dim_size * suf_dim_size; - IntType each_dim_size = split_dim_size / num_split; - - for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size; - offset += blockDim.x * gridDim.x) { - IntType i = offset / (split_dim_size * suf_dim_size); - IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size; - IntType k = offset % suf_dim_size; - - T* output = output_ptrs[j / each_dim_size]; - if (output == nullptr) { - return; - } - IntType output_ind = i * each_dim_size * suf_dim_size + - (j % each_dim_size) * suf_dim_size + k; - *(output + output_ind) = input[offset]; - } -} - -template -class StackGradGPUKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = 
ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - - int n = dy->dims()[axis]; - PADDLE_ENFORCE_EQ(n, dx.size(), - platform::errors::InvalidArgument( - "Output dx size should be equal to n, but" - " received n is:%d dx size is:%d.", - n, dx.size())); - - // dx is output, so save each data address, then copy each dy into dx_data - std::vector outputs(n); - auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); - for (size_t j = 0; j < dx.size(); ++j) { - if (dx[j] == nullptr) { - outputs[j] = nullptr; - } - if (out_var_names[j] != framework::kEmptyVarName && - dx[j]->numel() != 0UL) { - T* ptr = dx[j]->mutable_data(ctx.GetPlace()); - outputs[j] = ptr; - } else { - outputs[j] = nullptr; - } - } - auto dy_data = dy->data(); - // each dx should have same shape - int dy_pre = 1, dy_suf = 1; - auto dy_dims = dy->dims(); - int split_dim = n; - for (int i = 0; i < axis; ++i) { - dy_pre *= dy_dims[i]; - } - dy_suf = dy->numel() / (split_dim * dy_pre); - - auto& dev_ctx = ctx.template device_context(); - auto tmp_out_data = memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); - memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs.data()), - outputs.size() * sizeof(T*), dev_ctx.stream()); - - auto config = GetGpuLaunchConfig1D(dev_ctx, dy_pre * split_dim * dy_suf); - - if (dy->numel() < std::numeric_limits::max()) { - UnStackHelperCUDAKernel< - T, int32_t><<>>( - dy_data, dy_pre, split_dim, dy_suf, split_dim, - reinterpret_cast(tmp_out_data->ptr())); - } else { - UnStackHelperCUDAKernel< - T, int64_t><<>>( - dy_data, dy_pre, split_dim, dy_suf, split_dim, - reinterpret_cast(tmp_out_data->ptr())); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(stack, ops::StackGPUKernel, - ops::StackGPUKernel, ops::StackGPUKernel, - ops::StackGPUKernel, - ops::StackGPUKernel, - ops::StackGPUKernel); - -REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel); diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h deleted file mode 100644 index 03d5324528..0000000000 --- a/paddle/fluid/operators/stack_op.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct StackGradFunctor { - HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) - : dx_(dx), dy_(dy), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; - } - - private: - VecDxType dx_; - const T *dy_; - int n_; - int post_; -}; - -template -static inline void StackGradFunctorForRange(const DeviceContext &ctx, - const VecDxType &dx, const T *dy, - int total_num, int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackGradFunctor(dx, dy, n, post)); -} - -template -class StackKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput("X"); - auto *y = ctx.Output("Y"); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto *y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); - - int pre = 1, post = 1; - auto &dim = x[0]->dims(); - for (auto i = 0; i < axis; ++i) pre *= dim[i]; - for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; - - auto x_data_arr = x_datas.data(); - - size_t x_offset = 0; - size_t y_offset = 0; - for (int i = 0; i < pre; i++) { - for (int j = 0; j < n; j++) { - std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset, - post * sizeof(T)); - y_offset += post; - } - x_offset += post; - } - } -}; - -template -class StackGradKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int n = dy->dims()[axis]; - std::vector dx_datas(n); // NOLINT - - for (int i = 0; i < n; i++) { - if (dx[i] == nullptr) { - dx_datas[i] = nullptr; - } else { - dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); - } - } - auto dy_data = dy->data(); - int pre = 1; - for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; - int total_num = dy->numel(); - int post = total_num / (n * pre); - auto &dev_ctx = ctx.template device_context(); - auto dx_data_arr = dx_datas.data(); - StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 3a6e5b2aca..9d4ef0ffa2 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/stack_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index c5a20ed4d1..baaa2b4884 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -13,9 +13,9 @@ // limitations under the License. #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/stack_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index 9044a881b7..5c103e088b 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unique_op.h" -#include "paddle/fluid/framework/op_version_registry.h" +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,62 +29,54 @@ class UniqueOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); - auto in_dims = ctx->GetInputDim("X"); - if (!ctx->Attrs().Get("is_sorted")) { - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - PADDLE_ENFORCE_EQ(in_dims.size(), 1, - platform::errors::InvalidArgument( - "The Input(X) should be 1-D Tensor, " - "But now the dims of Input(X) is %d.", - in_dims.size())); - - ctx->SetOutputDim("Out", {-1}); - ctx->SetOutputDim("Index", in_dims); - return; - } bool return_index = ctx->Attrs().Get("return_index"); bool return_inverse = ctx->Attrs().Get("return_inverse"); bool return_counts = ctx->Attrs().Get("return_counts"); auto axis_vec = ctx->Attrs().Get>("axis"); + auto data_type = + static_cast(static_cast( + ctx->Attrs().Get("dtype"))); + + // Construct MetaTensor for InferMeta Func + using CompatMetaTensor = framework::CompatMetaTensor; + CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime()); + CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime()); + std::unique_ptr indices(nullptr); + std::unique_ptr index(nullptr); + std::unique_ptr counts(nullptr); if (return_index) { OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); + indices = + std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Indices")[0], ctx->IsRuntime()))); } if (return_inverse) { OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + index = std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); } if (return_counts) { OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); + counts = std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Counts")[0], ctx->IsRuntime()))); } - - if (axis_vec.empty()) { - ctx->SetOutputDim("Out", {-1}); - if (return_inverse) { - ctx->SetOutputDim("Index", {phi::product(in_dims)}); - } + bool is_sorted = ctx->Attrs().Get("is_sorted"); + if (is_sorted) { + 
phi::UniqueInferMeta(x, return_index, return_inverse, return_counts, + axis_vec, data_type, &out, indices.get(), + index.get(), counts.get()); } else { - int axis = axis_vec[0]; - if (axis < 0) { - axis += in_dims.size(); - } - PADDLE_ENFORCE_LT( - axis, in_dims.size(), - platform::errors::InvalidArgument("The axis(%d) should be less than " - "the dimension size(%d) of x.", - axis, in_dims.size())); - auto out_dims = in_dims; - out_dims[axis] = -1; - ctx->SetOutputDim("Out", out_dims); - if (return_inverse) { - ctx->SetOutputDim("Index", {in_dims[axis]}); + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + if (index == nullptr) { + index = + std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); } - } - if (return_index) { - ctx->SetOutputDim("Indices", {-1}); - } - if (return_counts) { - ctx->SetOutputDim("Counts", {-1}); + phi::UniqueRawInferMeta(x, return_index, return_inverse, return_counts, + axis_vec, data_type, is_sorted, &out, + indices.get(), index.get(), counts.get()); } } @@ -152,40 +148,5 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker); -REGISTER_OP_CPU_KERNEL( - unique, ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel); -REGISTER_OP_VERSION(unique) - .AddCheckpoint( - R"ROC( - Upgrade unique, add 2 outputs [Indices, Counts] and 5 attribute - [return_index, return_inverse, return_counts, axis, is_sorted]. - )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewOutput("Indices", - "The indices of the input tensor that result in the " - "unique tensor.") - .NewOutput("Counts", "The counts for each unique element.") - .NewAttr("return_index", - "If True, also return the indices of the input" - " tensor that result in the unique Tensor.", - false) - .NewAttr("return_inverse", - "If True, also return the indices for where elements" - " in the original input ended up in the returned unique " - "tensor.", - false) - .NewAttr("return_counts", - "If True, also return the counts for each unique element.", - false) - .NewAttr("axis", - "The axis to apply unique. If None, the input will be " - "flattened.", - std::vector{}) - .NewAttr("is_sorted", - "If True, the unique elements of X are in ascending order." - "Otherwise, the unique elements are not sorted.", - false)); diff --git a/paddle/fluid/operators/unique_op.cu b/paddle/fluid/operators/unique_op.cu deleted file mode 100644 index 871274c999..0000000000 --- a/paddle/fluid/operators/unique_op.cu +++ /dev/null @@ -1,474 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() -#include "paddle/fluid/operators/unique_op.h" // TransComute() - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Binary function 'less than' -template -struct LessThan { - int col; - const InT* in_trans_data; - - LessThan(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs < rhs) { - return true; - } else if (lhs > rhs) { - return false; - } - } - return false; - } -}; - -// Binary function 'equal_to' -template -struct BinaryEqual { - int64_t col; - const InT* in_trans_data; - - BinaryEqual(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int64_t i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs != rhs) { - return false; - } - } - return true; - } -}; - -// Binary function 'not_equal_to' -template -struct BinaryNotEqual { - int64_t col; - const InT* in_trans_data; - - BinaryNotEqual(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int64_t i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs != rhs) { - return true; - } - } - return false; - } -}; - -// index_select() function for Tensor -template -void IndexSelect(const framework::ExecutionContext& context, - const Tensor& input, const Tensor& index, Tensor* output, - int dim) { - auto input_dim = input.dims(); - auto input_dim_size = input_dim.size(); - auto output_dim = output->dims(); - - auto slice_size = 1; - for (auto i = dim + 1; i < input_dim_size; i++) { - slice_size *= input_dim[i]; - } - - auto input_width = slice_size * input_dim[dim]; - auto output_width = slice_size * output_dim[dim]; - - auto outer_nums = 1; - for (auto i = 0; i < dim; i++) { - outer_nums *= input_dim[i]; - } - - auto index_size = index.dims()[0]; - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - std::vector out_vec(output->numel()); - - for (int i = 0; i < index_size; i++) { - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - input_dim[dim], index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], input_dim[dim], - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. 
Please check input " - "value.", - input_dim[dim], index_vec[i])); - } - - for (auto i = 0; i < outer_nums; i++) { - auto input_start_offset = i * input_width; - auto output_start_offset = i * output_width; - - for (auto j = 0; j < index_size; j++) { - IndexT index_value = index_vec[j]; - for (auto k = 0; k < slice_size; k++) { - out_vec[output_start_offset + j * slice_size + k] = - input_vec[input_start_offset + index_value * slice_size + k]; - } - } - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(output_dim); -} - -// The core logic of computing Unique for a flattend Tensor -template -static void UniqueFlattendCUDATensor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, equal_T equal, - not_equal_T not_equal, int64_t num_input) { - // 0. Prepration - Tensor in_hat; - framework::TensorCopy(in, context.GetPlace(), &in_hat); - auto in_data_hat = in_hat.mutable_data(context.GetPlace()); - - Tensor* sorted_indices = context.Output("Indices"); - sorted_indices->Resize(phi::make_ddim({num_input})); - auto sorted_indices_data = - sorted_indices->mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, sorted_indices_data, - sorted_indices_data + num_input); - thrust::sort_by_key(thrust::device, in_data_hat, in_data_hat + num_input, - sorted_indices_data); - - // 1. Calculate op result: 'out' - Tensor range; - range.Resize(phi::make_ddim({num_input + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, range_data_ptr, - range_data_ptr + num_input + 1); - framework::TensorCopy(in_hat, context.GetPlace(), out); - int num_out; - auto out_data = out->mutable_data(context.GetPlace()); - num_out = thrust::unique_by_key(thrust::device, out_data, - out_data + num_input, range_data_ptr, equal) - .first - - out_data; - out->Resize(phi::make_ddim({num_out})); - - // 3. Calculate inverse index: 'inverse' - if (return_inverse) { - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({num_input})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({num_input})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, in_data_hat, - in_data_hat + num_input, inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault - thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + num_input, inv_loc_data_ptr); - thrust::scatter(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + num_input, sorted_indices_data, - inverse_data); - } - - // 2. Calculate sorted index: 'sorted_indices' - if (return_index) { - Tensor indices; - indices.Resize(phi::make_ddim({num_input})); - auto indices_data_ptr = indices.mutable_data(context.GetPlace()); - thrust::copy(thrust::device, in_data_hat, in_data_hat + num_input, - indices_data_ptr); - thrust::unique_by_key(thrust::device, indices_data_ptr, - indices_data_ptr + num_input, sorted_indices_data, - equal); - sorted_indices->Resize(phi::make_ddim({num_out})); - } - - // 4. 
Calculate 'counts' - if (return_counts) { - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - // init 'count_data' as 0 - thrust::fill(thrust::device, count_data, count_data + num_out, 0); - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = num_input; - thrust::adjacent_difference(thrust::device, range_data_ptr + 1, - range_data_ptr + num_out + 1, count_data); - } -} - -// The logic of compute unique with axis required, it's a little different -// from above function -template -static void ComputeUniqueDims(const framework::ExecutionContext& context, - Tensor* sorted_indices, - IndexT* sorted_indices_data, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, equal_T equal, - not_equal_T not_equal, int64_t row) { - // 1. inverse indices: 'inverse' - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({row})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({row})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, sorted_indices_data, - sorted_indices_data + row, inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; - thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + row, inv_loc_data_ptr); - thrust::scatter(thrust::device, inv_loc_data_ptr, inv_loc_data_ptr + row, - sorted_indices_data, inverse_data); - - // 2. sorted indices - Tensor range; - range.Resize(phi::make_ddim({row + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); - int num_out; - num_out = - thrust::unique_by_key(thrust::device, sorted_indices_data, - sorted_indices_data + row, range_data_ptr, equal) - .first - - sorted_indices_data; - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = row; - sorted_indices->Resize(phi::make_ddim({num_out})); - - // 3. counts: 'counts' - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - thrust::fill(thrust::device, count_data, count_data + row, 0); - thrust::adjacent_difference(thrust::device, range_data_ptr + 1, - range_data_ptr + row + 1, count_data); -} - -// Calculate unique when 'axis' is set -template -static void UniqueDimsCUDATensor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, int axis) { - // 1. Transpose & reshape - // Transpose tensor: eg. 
axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] - std::vector permute(in.dims().size()); - std::iota(permute.begin(), permute.end(), 0); - permute[axis] = 0; - permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); - in_trans_dims_vec[axis] = in.dims()[0]; - in_trans_dims_vec[0] = in.dims()[axis]; - framework::Tensor in_trans; - framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); - in_trans.Resize(in_trans_dims); - in_trans.mutable_data(context.GetPlace()); - auto& dev_ctx = context.cuda_device_context(); - TransCompute(in.dims().size(), // num of dims - dev_ctx, // device - in, // original Tensor - &in_trans, // Tensor after reshape - permute); // index of axis - - // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); - in_trans.Resize(in_trans_flat_dims); - - // now 'in_trans' is 2D - int64_t col = in_trans.dims()[1]; - int64_t row = in_trans.dims()[0]; - const InT* in_trans_data = in_trans.data(); - - Tensor* sorted_indices = context.Output("Indices"); - sorted_indices->Resize(phi::make_ddim({row})); - auto sorted_indices_data = - sorted_indices->mutable_data(context.GetPlace()); - - // 2. Calculate 'sorted_indices', 'inverse', 'counts' - // Init index and sort - thrust::sequence(thrust::device, sorted_indices_data, - sorted_indices_data + row); - thrust::sort(thrust::device, sorted_indices_data, sorted_indices_data + row, - LessThan(col, in_trans_data)); - ComputeUniqueDims( - context, sorted_indices, sorted_indices_data, out, return_index, - return_inverse, return_counts, BinaryEqual(col, in_trans_data), - BinaryNotEqual(col, in_trans_data), row); - - // 3. Select indices and reshape back to get 'out' - Tensor out_trans; - std::vector out_trans_dims_vec = in_trans_dims_vec; - out_trans_dims_vec[0] = sorted_indices->numel(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - out_trans.mutable_data(context.GetPlace()); - - IndexSelect(context, in_trans, *sorted_indices, &out_trans, 0); - - std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); - out->mutable_data(context.GetPlace()); - std::vector out_trans_unbind = Unbind(out_trans); - math::ConcatFunctor concat_functor; - concat_functor(dev_ctx, out_trans_unbind, 0, &out_trans); - TransCompute(out_trans.dims().size(), dev_ctx, out_trans, - out, permute); -} - -// functor for processing a flattend Tensor -template -struct UniqueFlattendCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; - const bool return_index_; - const bool return_inverse_; - const bool return_counts_; - - UniqueFlattendCUDAFunctor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, bool return_index, - bool return_inverse, bool return_counts) - : ctx_(context), - in_(in), - out_(out), - return_index_(return_index), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueFlattendCUDATensor( - ctx_, in_, out_, return_index_, return_inverse_, return_counts_, - thrust::equal_to(), thrust::not_equal_to(), in_.numel()); - } -}; - -// functor for processing a multi-dimentional Tensor -template -struct UniqueDimsCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; - const int axis_; - const bool return_index_; - const bool return_inverse_; - const bool return_counts_; - - UniqueDimsCUDAFunctor(const 
framework::ExecutionContext& context, - const Tensor& in, Tensor* out, const int axis, - bool return_index, bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - axis_(axis), - return_index_(return_index), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueDimsCUDATensor( - ctx_, in_, out_, return_index_, return_inverse_, return_counts_, axis_); - } -}; - -// Unique_op CUDA implementation. -template -class UniqueKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto data_type = static_cast( - context.Attr("dtype")); - if (data_type == framework::proto::VarType::INT32) { - PADDLE_ENFORCE_LE( - x->numel() + 1, INT_MAX, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be less than or " - "equal to INT_MAX, but received num is %d. Please set `dtype` to " - "int64.", - x->numel())); - } - - std::vector axis_vec = context.Attr>("axis"); - bool return_index = context.Attr("return_index"); - bool return_inverse = context.Attr("return_inverse"); - bool return_counts = context.Attr("return_counts"); - - // if 'axis' is not required, flatten the Tensor. - if (axis_vec.empty()) { - framework::VisitDataTypeTiny( - data_type, - UniqueFlattendCUDAFunctor( - context, *x, out, return_index, return_inverse, return_counts)); - } else { - // 'axis' is required. - int axis = axis_vec[0]; - framework::VisitDataTypeTiny( - data_type, UniqueDimsCUDAFunctor( - context, *x, out, axis, return_index, return_inverse, - return_counts)); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unique, ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel); diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc index 96320202b7..8c8684bf4b 100644 --- a/paddle/fluid/operators/unstack_op.cc +++ b/paddle/fluid/operators/unstack_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/unstack_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,43 +27,6 @@ namespace operators { class UnStackOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "UnStack"); - int axis = ctx->Attrs().Get("axis"); - int num = ctx->Attrs().Get("num"); - auto x_dim = ctx->GetInputDim("X"); - int rank = x_dim.size(); - PADDLE_ENFORCE_GE(axis, -rank, - platform::errors::InvalidArgument( - "The attribute axis is out of range, it must be " - "inside [-rank, rank), where rank = %d", - rank)); - PADDLE_ENFORCE_LT(axis, rank, - platform::errors::InvalidArgument( - "The attribute axis is out of range, it must be " - "inside [-rank, rank), where rank = %d", - rank)); - if (axis < 0) axis += rank; - - PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast(num), - platform::errors::InvalidArgument( - "Number of Outputs(Y) is wrong. Got %d , but it must " - "equal to attribute num which is %d.", - ctx->Outputs("Y").size(), static_cast(num))); - if (x_dim[axis] > 0) { - PADDLE_ENFORCE_EQ( - num, x_dim[axis], - platform::errors::InvalidArgument( - "The number of attribute num is not equal to the length of the " - "%d axis of Input(X). Expect %d but got %d.", - axis, x_dim[axis], num)); - } - auto vec = phi::vectorize(x_dim); - vec.erase(vec.begin() + axis); - ctx->SetOutputsDim("Y", std::vector( // NOLINT - x_dim[axis], phi::make_ddim(vec))); - } }; class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { @@ -141,20 +106,12 @@ class UnStackGradOp : public framework::OperatorWithKernel { namespace plat = paddle::platform; namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(unstack, UnStackInferMetaFunctor, + PD_INFER_META(phi::UnStackInferMeta)); + REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker, ops::UnStackGradOpMaker, - ops::UnStackGradOpMaker); + ops::UnStackGradOpMaker, + UnStackInferMetaFunctor); REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp); - -REGISTER_OP_CPU_KERNEL(unstack, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel); - -REGISTER_OP_CPU_KERNEL(unstack_grad, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel); diff --git a/paddle/fluid/operators/unstack_op.cu b/paddle/fluid/operators/unstack_op.cu deleted file mode 100644 index b591898a4d..0000000000 --- a/paddle/fluid/operators/unstack_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/unstack_op.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unstack, ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel); - -REGISTER_OP_CUDA_KERNEL( - unstack_grad, ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel); diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h deleted file mode 100644 index 413470e3db..0000000000 --- a/paddle/fluid/operators/unstack_op.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#include -#endif - -namespace paddle { -namespace operators { - -template -struct StackFunctor { - HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) - : x_(x), y_(y), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - y_[idx] = x_[which_x][x_index]; - } - - private: - VecXType x_; - T *y_; - int n_; - int post_; -}; - -template -struct StackGradFunctor { - HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) - : dx_(dx), dy_(dy), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - dx_[which_x][x_index] = dy_[idx]; - } - - private: - VecDxType dx_; - const T *dy_; - int n_; - int post_; -}; - -template -static inline void StackFunctorForRange(const DeviceContext &ctx, - const VecXType &x, T *y, int total_num, - int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackFunctor(x, y, n, post)); -} - -template -static inline void StackGradFunctorForRange(const DeviceContext &ctx, - const VecDxType &dx, const T *dy, - int total_num, int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackGradFunctor(dx, dy, n, post)); -} - -template -class UnStackGradKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput(framework::GradVarName("Y")); - auto *y = ctx.Output(framework::GradVarName("X")); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto *y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); - - int pre = 1; - int post = 1; - auto &dim = x[0]->dims(); - for (auto i = 0; i < axis; ++i) pre *= dim[i]; - for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; - -#if 
defined(__NVCC__) || defined(__HIPCC__) - int total_num = pre * n * post; - auto &dev_ctx = ctx.template device_context(); - - thrust::device_vector device_x_vec(x_datas); - auto x_data_arr = device_x_vec.data().get(); - - StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); - - // Wait() must be called because device_x_vec may be destructed before - // kernel ends - dev_ctx.Wait(); -#else - auto x_data_arr = x_datas.data(); - - size_t x_offset = 0; - size_t y_offset = 0; - for (int i = 0; i < pre; i++) { - for (int j = 0; j < n; j++) { - std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset, - post * sizeof(T)); - y_offset += post; - } - x_offset += post; - } -#endif - } -}; - -template -class UnStackKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input("X"); - auto dx = ctx.MultiOutput("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - - int n = dy->dims()[axis]; - std::vector dx_datas(n); // NOLINT - for (int i = 0; i < n; i++) { - dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); - } - auto dy_data = dy->data(); - if (dy->numel() == 0) return; - int pre = 1; - for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; - int total_num = dy->numel(); - int post = total_num / (n * pre); - - auto &dev_ctx = ctx.template device_context(); -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector device_dx_vec(dx_datas); - auto dx_data_arr = device_dx_vec.data().get(); -#else - auto dx_data_arr = dx_datas.data(); -#endif - StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); -#if defined(__NVCC__) || defined(__HIPCC__) - // Wait() must be called because device_dx_vec may be destructed before - // kernel ends - dev_ctx.Wait(); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc index fb88566e34..c55ec1fcf9 100644 --- a/paddle/fluid/operators/unstack_op_npu.cc +++ b/paddle/fluid/operators/unstack_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/unstack_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index a190b222f8..9ef8e8a356 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -44,6 +44,10 @@ namespace phi { _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX128); +#define _PhiForEachDataTypeTiny_(callback) \ + _PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64); + template inline void VisitDataType(phi::DataType type, Visitor visitor) { #define PhiVisitDataTypeCallback(cpp_type, data_type) \ @@ -59,4 +63,21 @@ inline void VisitDataType(phi::DataType type, Visitor visitor) { PADDLE_THROW(phi::errors::Unimplemented( "Not supported phi::DataType(%d) as data type.", static_cast(type))); } + +template +inline void VisitDataTypeTiny(phi::DataType type, Visitor visitor) { +#define PhiVisitDataTypeCallbackTiny(cpp_type, data_type) \ + do { \ + if (type == data_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _PhiForEachDataTypeTiny_(PhiVisitDataTypeCallbackTiny); +#undef PhiVisitDataTypeCallbackTiny + PADDLE_THROW(phi::errors::Unimplemented( + "Not supported phi::DataType(%d) as data type.", static_cast(type))); +} + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index f196744c04..c6940492ce 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1167,6 +1167,52 @@ void RnnInferMeta(const MetaTensor& x, } } +void StackInferMeta(const std::vector& x, + int axis, + MetaTensor* out) { + PADDLE_ENFORCE_GT(x.size(), + 0UL, + phi::errors::InvalidArgument( + "Number of Inputs(x) must be larger than 0, but" + " received value is:%d.", + x.size())); + const auto& input_dims = GetMetaTensorsDim(x); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], + input_dims[0], + phi::errors::InvalidArgument( + "Dims of all Inputs(X) must be the same, but" + " received input %d dim is:%d not equal to input 0" + " dim:%d.", + i, + input_dims[i], + input_dims[0])); + } + int rank = input_dims[0].size(); + PADDLE_ENFORCE_GE( + axis, + -(rank + 1), + phi::errors::InvalidArgument( + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " + "but received axis is:%d.", + rank, + axis)); + PADDLE_ENFORCE_LT( + axis, + rank + 1, + phi::errors::InvalidArgument( + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " + "but received axis is:%d", + rank, + axis)); + if (axis < 0) axis += (rank + 1); + auto vec = phi::vectorize(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + out->set_dims(phi::make_ddim(vec)); + out->set_dtype(x.at(0)->dtype()); + out->share_lod(*x.at(0)); +} + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const paddle::optional logits_length, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 6abbf1c0ef..4a8020aefc 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -231,6 +231,10 @@ void RnnInferMeta(const MetaTensor& x, std::vector state, MetaTensor* reserve); +void StackInferMeta(const std::vector& x, + int axis, + MetaTensor* out); + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const 
paddle::optional logits_length, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 0376d4e79e..582dcb0137 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -345,6 +345,56 @@ void PutAlongAxisInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RangeInferMeta(const MetaTensor& start, + const MetaTensor& end, + const MetaTensor& step, + MetaTensor* out) { + auto start_dims = start.dims(); + auto end_dims = end.dims(); + auto step_dims = step.dims(); + PADDLE_ENFORCE_EQ( + start_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(Start) should be 1, but got %d", + start_dims.size())); + + PADDLE_ENFORCE_EQ(start_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dim of the shape of Input(Start) should " + "be 1, but got %d", + start_dims[0])); + PADDLE_ENFORCE_EQ( + end_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(End) should be 1, but got %d", + end_dims.size())); + + PADDLE_ENFORCE_EQ( + end_dims[0], + 1, + phi::errors::InvalidArgument("The first dim of the shape of " + "Input(End) should be 1, but got %d", + end_dims[0])); + PADDLE_ENFORCE_EQ( + step_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(Step) should be 1, but got %d", + step_dims.size())); + + PADDLE_ENFORCE_EQ(step_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dim of the shape of Input(Step) should " + "be 1, but got %d", + step_dims[0])); + out->set_dims({-1}); + out->set_dtype(start.dtype()); +} + void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 30fdb4e612..c18dde42f1 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -81,6 +81,11 @@ void PutAlongAxisInferMeta(const MetaTensor& x, const std::string& reduce, MetaTensor* out); +void RangeInferMeta(const MetaTensor& start, + const MetaTensor& end, + const MetaTensor& step, + MetaTensor* out); + void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index cb6ca34c11..d763b23ef5 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2552,6 +2552,85 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } +void UniqueInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts) { + bool is_sorted = true; + UniqueRawInferMeta(x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +void UniqueRawInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts) { + if (!is_sorted) { + PADDLE_ENFORCE_EQ( + x.dims().size(), + 1, + phi::errors::InvalidArgument("The Input(X) should be 1-D Tensor, " + "But now the dims of Input(X) is %d.", + x.dims().size())); + out->set_dims(phi::make_ddim({-1})); + index->set_dims(x.dims()); + return; + } + + if (axis.empty()) { + out->set_dims(phi::make_ddim({-1})); + if (return_inverse) { + 
index->set_dims(phi::make_ddim({phi::product(x.dims())})); + } + } else { + int axis_value = axis[0]; + if (axis_value < 0) { + axis_value += x.dims().size(); + } + PADDLE_ENFORCE_LT( + axis_value, + x.dims().size(), + phi::errors::InvalidArgument("The axis(%d) should be less than " + "the dimension size(%d) of x.", + axis_value, + x.dims().size())); + auto out_dims = x.dims(); + out_dims[axis_value] = -1; + out->set_dims(out_dims); + if (return_inverse) { + index->set_dims(phi::make_ddim({x.dims()[axis_value]})); + } + } + if (return_index) { + indices->set_dims(phi::make_ddim({-1})); + } + if (return_counts) { + counts->set_dims(phi::make_ddim({-1})); + } +} + void UnsqueezeInferMeta(const MetaTensor& x, const IntArray& axes, MetaTensor* xshape, @@ -2595,6 +2674,53 @@ void UnsqueezeInferMeta(const MetaTensor& x, xshape->set_dtype(x.dtype()); } +void UnStackInferMeta(const MetaTensor& x, + int axis, + int num, + std::vector outs) { + auto x_dim = x.dims(); + int rank = x_dim.size(); + PADDLE_ENFORCE_GE(axis, + -rank, + phi::errors::InvalidArgument( + "The attribute axis is out of range, it must be " + "inside [-rank, rank), where rank = %d", + rank)); + PADDLE_ENFORCE_LT(axis, + rank, + phi::errors::InvalidArgument( + "The attribute axis is out of range, it must be " + "inside [-rank, rank), where rank = %d", + rank)); + if (axis < 0) axis += rank; + + size_t output_count = outs.size(); + PADDLE_ENFORCE_EQ(output_count, + static_cast(num), + phi::errors::InvalidArgument( + "Number of Outputs(Y) is wrong. Got %d , but it must " + "equal to attribute num which is %d.", + output_count, + static_cast(num))); + if (x_dim[axis] > 0) { + PADDLE_ENFORCE_EQ( + num, + x_dim[axis], + phi::errors::InvalidArgument( + "The number of attribute num is not equal to the length of the " + "%d axis of Input(X). Expect %d but got %d.", + axis, + x_dim[axis], + num)); + } + auto vec = phi::vectorize(x_dim); + vec.erase(vec.begin() + axis); + for (size_t i = 0; i < output_count; i++) { + outs[i]->set_dims(phi::make_ddim(vec)); + outs[i]->set_dtype(x.dtype()); + } +} + void OneHotRawInferMeta(const MetaTensor& x, int32_t depth, DataType dtype, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index bef802727c..7ab0f3df2a 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -360,12 +360,40 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void UniqueInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts); + +void UniqueRawInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts); + void UnsqueezeInferMeta(const MetaTensor& x, const IntArray& axes, MetaTensor* xshape, MetaTensor* out, MetaConfig config = MetaConfig()); +void UnStackInferMeta(const MetaTensor& x, + int axis, + int num, + std::vector outs); + void OneHotRawInferMeta(const MetaTensor& x, int32_t depth, DataType dtype, diff --git a/paddle/phi/kernels/cpu/range_kernel.cc b/paddle/phi/kernels/cpu/range_kernel.cc new file mode 100644 index 0000000000..8731696f61 --- /dev/null +++ b/paddle/phi/kernels/cpu/range_kernel.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
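A quick shape check (illustrative only, not part of the patch) for the UnStackInferMeta added above: the unstacked axis is erased from the input shape, so an input of shape [2, 3, 4] unstacked along axis = 1 with num = 3 produces three outputs of shape [2, 4].

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> x_dim = {2, 3, 4};
  const int axis = 1, num = 3;
  std::vector<int64_t> out_dim(x_dim);
  out_dim.erase(out_dim.begin() + axis);  // [2, 3, 4] -> [2, 4]
  for (int i = 0; i < num; ++i) {
    std::cout << "out[" << i << "] = [" << out_dim[0] << ", " << out_dim[1] << "]\n";
  }
}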
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/range_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/range_function.h" + +namespace phi { + +template +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out) { + T start_value = start.data()[0]; + T end_value = end.data()[0]; + T step_value = step.data()[0]; + int64_t size = 0; + phi::funcs::GetSize(start_value, end_value, step_value, &size); + out->Resize(phi::make_ddim({size})); + T* out_data = dev_ctx.template Alloc(out); + T value = start_value; + for (int64_t i = 0; i < size; ++i) { + out_data[i] = value; + value += step_value; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + range, CPU, ALL_LAYOUT, phi::RangeKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/stack_grad_kernel.cc b/paddle/phi/kernels/cpu/stack_grad_kernel.cc new file mode 100644 index 0000000000..018705333e --- /dev/null +++ b/paddle/phi/kernels/cpu/stack_grad_kernel.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
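The CPU range kernel just above reads the start/end/step scalars from their tensors, asks GetSize for the element count, and fills the output with an arithmetic progression. The sketch below mirrors that behavior for integral types in a standalone form; RangeRef is a name used only for this illustration and it omits the sign checks that GetSize enforces.

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>

std::vector<int> RangeRef(int start, int end, int step) {
  // Integral case: size = ceil(|end - start| / |step|).
  int64_t size = (std::abs(end - start) + std::abs(step) - 1) / std::abs(step);
  std::vector<int> out(static_cast<size_t>(size));
  int value = start;
  for (int64_t i = 0; i < size; ++i) {
    out[i] = value;
    value += step;
  }
  return out;
}

int main() {
  for (int v : RangeRef(0, 10, 3)) std::cout << v << ' ';  // 0 3 6 9
  std::cout << '\n';
}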
*/ + +#include "paddle/phi/kernels/stack_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" + +namespace phi { + +template +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector x_grad) { + if (axis < 0) axis += out.dims().size(); + int n = out.dims()[axis]; + std::vector dx_datas(n); // NOLINT + + for (int i = 0; i < n; i++) { + if (x_grad[i] == nullptr) { + dx_datas[i] = nullptr; + } else { + dx_datas[i] = dev_ctx.template Alloc(x_grad[i]); + } + } + auto dy_data = out.data(); + int pre = 1; + for (int i = 0; i < axis; ++i) pre *= out.dims()[i]; + int total_num = out.numel(); + int post = total_num / (n * pre); + auto dx_data_arr = dx_datas.data(); + phi::funcs::StackGradFunctorForRange( + dev_ctx, dx_data_arr, dy_data, total_num, n, post); +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack_grad, + CPU, + ALL_LAYOUT, + phi::StackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/stack_kernel.cc b/paddle/phi/kernels/cpu/stack_kernel.cc new file mode 100644 index 0000000000..5eb1cf061b --- /dev/null +++ b/paddle/phi/kernels/cpu/stack_kernel.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/stack_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void StackKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* out) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + int n = static_cast(x.size()); + T* y_data = dev_ctx.template Alloc(out); + std::vector x_datas(n); + for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto& dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack, + CPU, + ALL_LAYOUT, + phi::StackKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc new file mode 100644 index 0000000000..853b401315 --- /dev/null +++ b/paddle/phi/kernels/cpu/unique_kernel.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unique_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +template +void UniqueKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + bool is_sorted = true; + UniqueRawKernel(context, + x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +template +void UniqueRawKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + if (dtype == phi::DataType::INT32) { + PADDLE_ENFORCE_LE( + x.numel(), + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + if (!is_sorted) { + phi::VisitDataType( + dtype, + phi::funcs::UniqueOpFunctor(context, out, index, &x)); + return; + } + + if (x.numel() == 0) { + context.template Alloc(out); + return; + } + if (axis.empty()) { + phi::VisitDataTypeTiny( + dtype, + phi::funcs::UniqueFlattendTensorFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); + } else { + int axis_value = axis[0]; + phi::VisitDataTypeTiny( + dtype, + phi::funcs::UniqueDimFunctor(context, + x, + out, + indices, + index, + counts, + axis_value, + return_index, + return_inverse, + return_counts)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(unique, + CPU, + ALL_LAYOUT, + phi::UniqueKernel, + float, + double, + int32_t, + int64_t) {} + +PD_REGISTER_KERNEL(unique_raw, + CPU, + ALL_LAYOUT, + phi::UniqueRawKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc new file mode 100644 index 0000000000..9c2dce808d --- /dev/null +++ b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
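The sorted CPU unique path registered above returns the unique values in ascending order together with first-occurrence indices, the inverse mapping, and per-value counts (the functors themselves live in unique_functor.h later in this patch). A standalone sketch of those semantics, with made-up data:

#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
  std::vector<int> x = {2, 3, 3, 1, 5, 3};

  std::map<int, int64_t> first_pos;  // value -> index of first occurrence
  std::map<int, int64_t> count;      // value -> number of occurrences
  for (int64_t i = 0; i < static_cast<int64_t>(x.size()); ++i) {
    first_pos.emplace(x[i], i);      // emplace keeps the earliest index
    ++count[x[i]];
  }

  std::vector<int> out;              // sorted unique values
  std::vector<int64_t> indices, counts;
  std::map<int, int64_t> pos_in_out; // value -> position in `out`
  for (const auto& kv : first_pos) {
    pos_in_out[kv.first] = static_cast<int64_t>(out.size());
    out.push_back(kv.first);         // out:     1 2 3 5
    indices.push_back(kv.second);    // indices: 3 0 1 4
    counts.push_back(count[kv.first]);  // counts: 1 1 3 1
  }
  for (int v : x) std::cout << pos_in_out[v] << ' ';  // inverse: 1 2 2 0 3 2
  std::cout << '\n';
}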
*/ + +#include "paddle/phi/kernels/unstack_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack_grad, + CPU, + ALL_LAYOUT, + phi::UnStackGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/unstack_kernel.cc b/paddle/phi/kernels/cpu/unstack_kernel.cc new file mode 100644 index 0000000000..3d233e9ec4 --- /dev/null +++ b/paddle/phi/kernels/cpu/unstack_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/unstack_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_kernel_impl.h" + +PD_REGISTER_KERNEL( + unstack, CPU, ALL_LAYOUT, phi::UnStackKernel, float, double, int, int64_t) { +} diff --git a/paddle/phi/kernels/funcs/range_function.h b/paddle/phi/kernels/funcs/range_function.h new file mode 100644 index 0000000000..5ace32f46a --- /dev/null +++ b/paddle/phi/kernels/funcs/range_function.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace funcs { + +template +void GetSize(T start, T end, T step, int64_t* size) { + PADDLE_ENFORCE_NE( + step, + 0, + phi::errors::InvalidArgument("The step of range op should not be 0.")); + + if (start < end) { + PADDLE_ENFORCE_GT( + step, + 0, + phi::errors::InvalidArgument( + "The step should be greater than 0 while start < end.")); + } + + if (start > end) { + PADDLE_ENFORCE_LT(step, + 0, + phi::errors::InvalidArgument( + "The step should be less than 0 while start > end.")); + } + + *size = std::is_integral::value + ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step)) + : std::ceil(std::abs((end - start) / step)); +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/stack_functor.h b/paddle/phi/kernels/funcs/stack_functor.h new file mode 100644 index 0000000000..68379c2705 --- /dev/null +++ b/paddle/phi/kernels/funcs/stack_functor.h @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +namespace funcs { + +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; + } + + private: + VecXType x_; + T *y_; + int n_; + int post_; +}; + +template +struct StackGradFunctor { + HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) + : dx_(dx), dy_(dy), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; + } + + private: + VecDxType dx_; + const T *dy_; + int n_; + int post_; +}; + +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, + T *y, + int total_num, + int n, + int post) { + phi::funcs::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + +template +static inline void StackGradFunctorForRange(const DeviceContext &ctx, + const VecDxType &dx, + const T *dy, + int total_num, + int n, + int post) { + phi::funcs::ForRange for_range(ctx, total_num); + for_range(StackGradFunctor(dx, dy, n, post)); +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h new file mode 100644 index 0000000000..2bb51cdab6 --- /dev/null +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -0,0 +1,426 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +template +struct UniqueOpFunctor { + const Context& context_; + DenseTensor* out_; + DenseTensor* index_; + const DenseTensor* in_; + DenseTensor* count_; + + UniqueOpFunctor(const Context& context, + DenseTensor* out, + DenseTensor* index, + const DenseTensor* in, + DenseTensor* count = nullptr) + : context_(context), out_(out), index_(index), in_(in), count_(count) {} + + template + void apply() const { + auto* in_data = in_->data(); + auto* index_data = context_.template Alloc(index_); + + int64_t j = 0; + + // TODO(fangzeyang): Should optimize performance here. + std::unordered_map dict; + std::vector uniq; + + PADDLE_ENFORCE_LT( + in_->numel(), + pow(2, 31), + phi::errors::InvalidArgument( + "The num of Input(X) elements should be less then INT_MAX, " + "but received num is %d.", + in_->numel())); + + for (auto i = 0; i < in_->numel(); i++) { + auto it = dict.find(in_data[i]); + if (it == dict.end()) { + dict.emplace(std::make_pair(in_data[i], j)); + uniq.emplace_back(in_data[i]); + index_data[i] = static_cast(j); + j++; + } else { + index_data[i] = static_cast(it->second); + } + } + + if (count_ != nullptr) { + // Resize the count tensor dims to allocate the memory + count_->Resize(phi::make_ddim({static_cast(uniq.size())})); + IndexT* count_data = context_.template Alloc(count_); + // init count_data to 0 + memset(count_data, 0, uniq.size() * sizeof(IndexT)); + + const auto& index_type = index_->dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + phi::errors::InvalidArgument( + "Index holds the wrong type, it holds %s, " + "but desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT64)))); + + if (index_type == DataType::INT32) { + for (auto i = 0; i < in_->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[static_cast(index)] += static_cast(1); + } + } else { + for (auto i = 0; i < in_->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[static_cast(index)] += static_cast(1); + } + } + } + + out_->Resize(phi::make_ddim({static_cast(uniq.size())})); + auto* out_data = context_.template Alloc(out_); + std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT)); + } +}; + +static std::vector Unbind(const DenseTensor& in) { + int64_t size = in.dims()[0]; + std::vector tensors(size); + for (int64_t i = 0; i < size; ++i) { + tensors[i] = in.Slice(i, i + 1); + } + return tensors; +} + +template +static bool Equal(const DenseTensor& a, const DenseTensor& b) { + if (a.numel() != b.numel()) { + return false; + } + for (int64_t i = 0; i < a.numel(); ++i) { + if (a.data()[i] != b.data()[i]) { + return false; + } + } + return true; +} + +template +static void UniqueFlattendTensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) { + const InT* in_data = in.data(); + std::set unique(in_data, in_data 
+ in.numel()); + out->Resize(phi::make_ddim({static_cast(unique.size())})); + auto* out_data = context.template Alloc(out); + std::copy(unique.begin(), unique.end(), out_data); + + if (return_index) { + indices->Resize(phi::make_ddim({out->numel()})); + auto indices_data = context.template Alloc(indices); + std::unordered_map indices_map; + indices_map.reserve(out->numel()); + for (int64_t i = 0; i < in.numel(); ++i) { + if (indices_map.find(in_data[i]) != indices_map.end()) continue; + indices_map[in_data[i]] = i; + } + for (int64_t i = 0; i < out->numel(); ++i) { + indices_data[i] = indices_map[out_data[i]]; + } + } + + if (return_inverse) { + index->Resize(phi::make_ddim({in.numel()})); + auto inverse_data = context.template Alloc(index); + std::unordered_map inverse_map; + inverse_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + inverse_map[out_data[i]] = i; + } + for (int64_t i = 0; i < in.numel(); ++i) { + inverse_data[i] = inverse_map[in_data[i]]; + } + } + + if (return_counts) { + count->Resize(phi::make_ddim({out->numel()})); + auto count_data = context.template Alloc(count); + std::unordered_map counts_map; + counts_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + counts_map[out_data[i]] = 0; + } + for (int64_t i = 0; i < in.numel(); i++) { + counts_map[in_data[i]] += 1; + } + for (int64_t i = 0; i < out->numel(); i++) { + count_data[i] = counts_map[out_data[i]]; + } + } +} + +template +static ForwardIt UniqueDimImpl(const Context& context, + ForwardIt first, + ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, + std::vector* counts_vec, + std::vector* indices_vec) { + if (first == last) { + return last; + } + + (*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + (*indices_vec)[0] = sorted_indices_vec[0]; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + (*indices_vec)[idx_result] = sorted_indices_vec[idx_first]; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueDim(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts, + int axis) { + // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + DenseTensor in_trans; + phi::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + TransCompute(in.dims().size(), context, in, &in_trans, permute); + // reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2] + phi::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // sort indices + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const InT* in_trans_data = in_trans.data(); + std::sort(sorted_indices_vec.begin(), + sorted_indices_vec.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + // sort tensor according to indices + DenseTensor input_sorted; + input_sorted.Resize(in_trans_dims); + context.template Alloc(&input_sorted); + InT* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + static_cast(sorted_indices_vec[i]) * col, + col * sizeof(InT)); + } + + std::vector input_unbind = Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + std::vector indices_vec(sorted_indices_vec.size(), 0); + auto last = UniqueDimImpl::iterator, InT>( + context, + input_unbind.begin(), + input_unbind.end(), + sorted_indices_vec, + &inverse_vec, + &counts_vec, + &indices_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + indices_vec.erase(indices_vec.begin() + input_unbind.size(), + indices_vec.end()); + + phi::funcs::ConcatFunctor concat_functor; + DenseTensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + concat_functor(context, input_unbind, 0, &out_trans); + TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); + + if (return_inverse) { + paddle::framework::TensorFromVector(inverse_vec, context, index); + } + + if (return_counts) { + paddle::framework::TensorFromVector(counts_vec, context, count); + } + + if (return_index) { + paddle::framework::TensorFromVector(indices_vec, context, indices); + } +} + +template +struct UniqueFlattendTensorFunctor { + const Context& ctx_; /* */ + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* count_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueFlattendTensorFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + count_(count), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueFlattendTensor(ctx_, + in_, + out_, + indices_, + index_, + count_, + return_index_, + return_inverse_, + return_counts_); + } +}; + +template +struct UniqueDimFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* 
indices_; + DenseTensor* index_; + DenseTensor* count_; + const int axis_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueDimFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + const int axis, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + count_(count), + axis_(axis), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueDim(ctx_, + in_, + out_, + indices_, + index_, + count_, + return_index_, + return_inverse_, + return_counts_, + axis_); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/range_kernel.cu b/paddle/phi/kernels/gpu/range_kernel.cu new file mode 100644 index 0000000000..65d9b45efb --- /dev/null +++ b/paddle/phi/kernels/gpu/range_kernel.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/range_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/range_function.h" + +namespace phi { + +template +__global__ void Range(T start, T step, int64_t size, T* out) { + CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; } +} + +template +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out) { + T start_value = start.data()[0]; + T end_value = end.data()[0]; + T step_value = step.data()[0]; + + int64_t size = 0; + phi::funcs::GetSize(start_value, end_value, step_value, &size); + out->Resize(phi::make_ddim({size})); + T* out_data = dev_ctx.template Alloc(out); + + auto stream = dev_ctx.stream(); + int block = std::min(size, static_cast(256)); + int grid = (size + block - 1) / block; + Range<<>>(start_value, step_value, size, out_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + range, GPU, ALL_LAYOUT, phi::RangeKernel, float, double, int64_t, int) { + kernel->InputAt(0).SetBackend(phi::Backend::CPU); + kernel->InputAt(1).SetBackend(phi::Backend::CPU); + kernel->InputAt(2).SetBackend(phi::Backend::CPU); +} diff --git a/paddle/phi/kernels/gpu/stack_grad_kernel.cu b/paddle/phi/kernels/gpu/stack_grad_kernel.cu new file mode 100644 index 0000000000..9b754e2269 --- /dev/null +++ b/paddle/phi/kernels/gpu/stack_grad_kernel.cu @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
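The GPU range kernel just above reads start/end/step on the host (hence the registration pins those inputs to the CPU backend) and launches a one-thread-per-element grid capped at 256 threads per block. The snippet below is a host-side sketch of that launch arithmetic only, not a reimplementation of the kernel.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  int64_t size = 1000;  // number of range elements to produce
  int64_t block = std::min<int64_t>(size, 256);       // threads per block
  int64_t grid = (size + block - 1) / block;          // blocks in the grid
  std::cout << "grid=" << grid << " block=" << block << '\n';  // grid=4 block=256
}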
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/stack_grad_kernel.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, + int pre_dim_size, + int split_dim_size, + int suf_dim_size, + int num_split, + T** output_ptrs) { + assert(blockDim.y == 1); + assert(blockDim.z == 1); + // In this case they are equal + assert(split_dim_size % num_split == 0); + + IntType size = pre_dim_size * split_dim_size * suf_dim_size; + IntType each_dim_size = split_dim_size / num_split; + + for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size; + offset += blockDim.x * gridDim.x) { + IntType i = offset / (split_dim_size * suf_dim_size); + IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size; + IntType k = offset % suf_dim_size; + + T* output = output_ptrs[j / each_dim_size]; + if (output == nullptr) { + return; + } + IntType output_ind = i * each_dim_size * suf_dim_size + + (j % each_dim_size) * suf_dim_size + k; + *(output + output_ind) = input[offset]; + } +} + +template +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector x_grad) { + if (axis < 0) axis += out.dims().size(); + + int n = out.dims()[axis]; + PADDLE_ENFORCE_EQ(n, + x_grad.size(), + phi::errors::InvalidArgument( + "Output x_grad size should be equal to n, but" + " received n is:%d x_grad size is:%d.", + n, + x_grad.size())); + + // x_grad is output, so save each data address, then copy each dy into dx_data + std::vector outputs(n); + for (size_t j = 0; j < x_grad.size(); ++j) { + if (x_grad[j] == nullptr) { + outputs[j] = nullptr; + continue; + } + if (x_grad[j]->numel() != 0UL) { + T* ptr = dev_ctx.template Alloc(x_grad[j]); + outputs[j] = ptr; + } else { + outputs[j] = nullptr; + } + } + auto dy_data = out.data(); + // each x_grad should have same shape + int dy_pre = 1, dy_suf = 1; + auto dy_dims = out.dims(); + int split_dim = n; + for (int i = 0; i < axis; ++i) { + dy_pre *= dy_dims[i]; + } + dy_suf = out.numel() / (split_dim * dy_pre); + + auto tmp_out_data = + paddle::memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_out_data->ptr(), + phi::CPUPlace(), + reinterpret_cast(outputs.data()), + outputs.size() * sizeof(T*), + dev_ctx.stream()); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, dy_pre * split_dim * dy_suf); + + if (out.numel() < std::numeric_limits::max()) { + UnStackHelperCUDAKernel<<>>( + dy_data, + dy_pre, + split_dim, + dy_suf, + split_dim, + reinterpret_cast(tmp_out_data->ptr())); + } else { + UnStackHelperCUDAKernel<<>>( + dy_data, + dy_pre, + split_dim, + dy_suf, + split_dim, + reinterpret_cast(tmp_out_data->ptr())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack_grad, + GPU, + ALL_LAYOUT, + phi::StackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git 
a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu new file mode 100644 index 0000000000..cc7d136c95 --- /dev/null +++ b/paddle/phi/kernels/gpu/stack_kernel.cu @@ -0,0 +1,113 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/stack_kernel.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void StackCUDAKernel(T** input_ptrs, + int split_size, + int rows, + int cols, + T* __restrict__ output) { + IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x; + + for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) { + IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y; + + IntType split = grid_x / split_size; + const T* input_ptr = input_ptrs[split]; + IntType col_offset = grid_x % split_size; +#pragma unroll + for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) { + output[grid_y * cols + grid_x] = + input_ptr[grid_y * split_size + col_offset]; + } + } +} + +template +void StackKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* out) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + T* y_data = dev_ctx.template Alloc(out); + std::vector x_datas(n); + for (int i = 0; i < n; i++) { + x_datas[i] = x[i]->data(); + } + + auto tmp_x_data = paddle::memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_x_data->ptr(), + phi::CPUPlace(), + reinterpret_cast(x_datas.data()), + x_datas.size() * sizeof(T*), + dev_ctx.stream()); + + // Split x dim from axis to matrix + int x_row = 1, x_col = 1; + for (int i = 0; i < axis; ++i) { + x_row *= x[0]->dims()[i]; + } + x_col = x[0]->numel() / x_row; + int out_col = x_col * n; + + auto config = + phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row); + + if (out->numel() < std::numeric_limits::max()) { + StackCUDAKernel<<>>( + reinterpret_cast(tmp_x_data->ptr()), + x_col, + x_row, + out_col, + y_data); + } else { + StackCUDAKernel<<>>( + reinterpret_cast(tmp_x_data->ptr()), + x_col, + x_row, + out_col, + y_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack, + GPU, + ALL_LAYOUT, + phi::StackKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu new file mode 100644 index 0000000000..c09730ba76 --- /dev/null +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -0,0 +1,615 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
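The GPU stack kernel above flattens each input to a [x_row, x_col] matrix (x_row is the product of the dims before the stacking axis) and writes a [x_row, n * x_col] output, where output column c is served by input c / x_col at column c % x_col. A standalone check of that bookkeeping with made-up shapes:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> dims = {2, 3, 4};  // shape of every input
  const int axis = 1, n = 5;              // stacking 5 tensors along axis 1

  int64_t x_row = 1;
  for (int i = 0; i < axis; ++i) x_row *= dims[i];
  int64_t numel = 1;
  for (int64_t d : dims) numel *= d;
  int64_t x_col = numel / x_row;
  int64_t out_col = x_col * n;

  std::cout << "x_row=" << x_row << " x_col=" << x_col
            << " out_col=" << out_col << '\n';  // x_row=2 x_col=12 out_col=60
  // Output column 30 is read from input 30 / 12 = 2 at its column 30 % 12 = 6.
  std::cout << "col 30 -> input " << 30 / x_col << ", offset " << 30 % x_col << '\n';
}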
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unique_kernel.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +// Binary function 'less than' +template +struct LessThan { + int col; + const InT* in_trans_data; + + LessThan(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + } +}; + +// Binary function 'equal_to' +template +struct BinaryEqual { + int64_t col; + const InT* in_trans_data; + + BinaryEqual(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs != rhs) { + return false; + } + } + return true; + } +}; + +// Binary function 'not_equal_to' +template +struct BinaryNotEqual { + int64_t col; + const InT* in_trans_data; + + BinaryNotEqual(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs != rhs) { + return true; + } + } + return false; + } +}; + +// index_select() function for DenseTensor +template +void IndexSelect(const Context& context, + const DenseTensor& input, + const DenseTensor& index, + DenseTensor* output, + int dim) { + auto input_dim = input.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + + std::vector input_vec; + std::vector index_vec; + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + std::vector out_vec(output->numel()); + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + input_dim[dim], + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + input_dim[dim], + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_vec[i])); + } + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_vec[j]; + for (auto k = 0; k < slice_size; k++) { + out_vec[output_start_offset + j * slice_size + k] = + input_vec[input_start_offset + index_value * slice_size + k]; + } + } + } + context.template Alloc(output); + paddle::framework::TensorFromVector(out_vec, context, output); + output->Resize(output_dim); +} + +// The core logic of computing Unique for a flattend DenseTensor +template +static void UniqueFlattendCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t num_input) { + // 0. Prepration + DenseTensor in_hat; + phi::Copy(context, in, context.GetPlace(), false, &in_hat); + auto* in_data_hat = context.template Alloc(&in_hat); + + indices->Resize(phi::make_ddim({num_input})); + auto* indices_data = context.template Alloc(indices); + + thrust::sequence(thrust::device, indices_data, indices_data + num_input); + thrust::sort_by_key( + thrust::device, in_data_hat, in_data_hat + num_input, indices_data); + + // 1. Calculate op result: 'out' + DenseTensor range; + range.Resize(phi::make_ddim({num_input + 1})); + auto* range_data_ptr = context.template Alloc(&range); + thrust::sequence( + thrust::device, range_data_ptr, range_data_ptr + num_input + 1); + phi::Copy(context, in_hat, context.GetPlace(), false, out); + int num_out; + auto out_data = context.template Alloc(out); + num_out = + thrust::unique_by_key( + thrust::device, out_data, out_data + num_input, range_data_ptr, equal) + .first - + out_data; + out->Resize(phi::make_ddim({num_out})); + + // 3. Calculate inverse index: 'inverse' + if (return_inverse) { + index->Resize(phi::make_ddim({num_input})); + auto* inverse_data = context.template Alloc(index); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({num_input})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + in_data_hat, + in_data_hat + num_input, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + indices_data, + inverse_data); + } + + // 2. Calculate sorted index: 'indices' + if (return_index) { + DenseTensor tmp_indices; + tmp_indices.Resize(phi::make_ddim({num_input})); + auto* tmp_indices_data_ptr = context.template Alloc(&tmp_indices); + thrust::copy(thrust::device, + in_data_hat, + in_data_hat + num_input, + tmp_indices_data_ptr); + thrust::unique_by_key(thrust::device, + tmp_indices_data_ptr, + tmp_indices_data_ptr + num_input, + indices_data, + equal); + indices->Resize(phi::make_ddim({num_out})); + } + + // 4. 
Calculate 'counts' + if (return_counts) { + counts->Resize(phi::make_ddim({num_out})); + auto count_data = context.template Alloc(counts); + // init 'count_data' as 0 + thrust::fill(thrust::device, count_data, count_data + num_out, 0); + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = num_input; + thrust::adjacent_difference(thrust::device, + range_data_ptr + 1, + range_data_ptr + num_out + 1, + count_data); + } +} + +// The logic of compute unique with axis required, it's a little different +// from above function +template +static void ComputeUniqueDims(const Context& context, + DenseTensor* sorted_indices, + IndexT* sorted_indices_data, + DenseTensor* out, + DenseTensor* inverse, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t row) { + // 1. inverse indices: 'inverse' + inverse->Resize(phi::make_ddim({row})); + auto* inverse_data = context.template Alloc(inverse); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({row})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + sorted_indices_data, + inverse_data); + + // 2. sorted indices + DenseTensor range; + range.Resize(phi::make_ddim({row + 1})); + auto range_data_ptr = context.template Alloc(&range); + thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); + int num_out; + num_out = thrust::unique_by_key(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + range_data_ptr, + equal) + .first - + sorted_indices_data; + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = row; + sorted_indices->Resize(phi::make_ddim({num_out})); + + // 3. counts: 'counts' + counts->Resize(phi::make_ddim({num_out})); + auto* count_data = context.template Alloc(counts); + thrust::fill(thrust::device, count_data, count_data + row, 0); + thrust::adjacent_difference( + thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); +} + +// Calculate unique when 'axis' is set +template +static void UniqueDimsCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + int axis) { + // 1. Transpose & reshape + // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + DenseTensor in_trans; + auto in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + phi::funcs::TransCompute( + in.dims().size(), // num of dims + context, // device + in, // original DenseTensor + &in_trans, // DenseTensor after reshape + permute); // index of axis + + // Reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2] + auto in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // now 'in_trans' is 2D + int64_t col = in_trans.dims()[1]; + int64_t row = in_trans.dims()[0]; + const InT* in_trans_data = in_trans.data(); + + indices->Resize(phi::make_ddim({row})); + auto* sorted_indices_data = context.template Alloc(indices); + + // 2. Calculate 'indices', 'inverse', 'counts' + // Init index and sort + thrust::sequence( + thrust::device, sorted_indices_data, sorted_indices_data + row); + thrust::sort(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + LessThan(col, in_trans_data)); + ComputeUniqueDims( + context, + indices, + sorted_indices_data, + out, + index, + counts, + return_index, + return_inverse, + return_counts, + BinaryEqual(col, in_trans_data), + BinaryNotEqual(col, in_trans_data), + row); + + // 3. Select indices and reshape back to get 'out' + DenseTensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = indices->numel(); + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + + IndexSelect(context, in_trans, *indices, &out_trans, 0); + + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + std::vector out_trans_unbind = phi::funcs::Unbind(out_trans); + phi::funcs::ConcatFunctor concat_functor; + concat_functor(context, out_trans_unbind, 0, &out_trans); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); +} + +// functor for processing a flattend DenseTensor +template +struct UniqueFlattendCUDAFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* counts_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueFlattendCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + counts_(counts), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueFlattendCUDATensor(ctx_, + in_, + out_, + indices_, + index_, + counts_, + return_index_, + return_inverse_, + return_counts_, + thrust::equal_to(), + thrust::not_equal_to(), + in_.numel()); + } +}; + +// functor for processing a multi-dimentional DenseTensor +template +struct UniqueDimsCUDAFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* counts_; + const int axis_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueDimsCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + const int axis, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + counts_(counts), + axis_(axis), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueDimsCUDATensor(ctx_, + in_, + out_, + indices_, + 
index_, + counts_, + return_index_, + return_inverse_, + return_counts_, + axis_); + } +}; + +template +void UniqueRawKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + if (dtype == phi::DataType::INT32) { + PADDLE_ENFORCE_LE( + x.numel() + 1, + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + // if 'axis' is not required, flatten the DenseTensor. + if (axis.empty()) { + phi::VisitDataTypeTiny( + dtype, + UniqueFlattendCUDAFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); + } else { + // 'axis' is required. + int axis_value = axis[0]; + phi::VisitDataTypeTiny(dtype, + UniqueDimsCUDAFunctor(context, + x, + out, + indices, + index, + counts, + axis_value, + return_index, + return_inverse, + return_counts)); + } +} + +template +void UniqueKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + bool is_sorted = true; + UniqueRawKernel(context, + x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + unique, GPU, ALL_LAYOUT, phi::UniqueKernel, float, double, int64_t, int) {} + +PD_REGISTER_KERNEL(unique_raw, + GPU, + ALL_LAYOUT, + phi::UniqueRawKernel, + float, + double, + int64_t, + int) {} diff --git a/paddle/phi/kernels/gpu/unstack_grad_kernel.cu b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu new file mode 100644 index 0000000000..b7c349de0d --- /dev/null +++ b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unstack_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack_grad, + GPU, + ALL_LAYOUT, + phi::UnStackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/unstack_kernel.cu b/paddle/phi/kernels/gpu/unstack_kernel.cu new file mode 100644 index 0000000000..f147f4c0f0 --- /dev/null +++ b/paddle/phi/kernels/gpu/unstack_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unstack_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack, + GPU, + ALL_LAYOUT, + phi::UnStackKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h new file mode 100644 index 0000000000..0576742e34 --- /dev/null +++ b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include +#endif +namespace phi { + +template +void UnStackGradKernel(const Context &dev_ctx, + const std::vector &x, + int axis, + DenseTensor *x_grad) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto *x_grad_data = dev_ctx.template Alloc(x_grad); + std::vector x_datas(n); + for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); + + int pre = 1; + int post = 1; + auto &dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + +#if defined(__NVCC__) || defined(__HIPCC__) + int total_num = pre * n * post; + + thrust::device_vector device_x_vec(x_datas); + auto x_data_arr = device_x_vec.data().get(); + + phi::funcs::StackFunctorForRange( + dev_ctx, x_data_arr, x_grad_data, total_num, n, post); + + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#else + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + x_grad_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } +#endif +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/unstack_kernel_impl.h b/paddle/phi/kernels/impl/unstack_kernel_impl.h new file mode 100644 index 0000000000..030f4a62c6 --- /dev/null +++ b/paddle/phi/kernels/impl/unstack_kernel_impl.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include +#endif + +namespace phi { + +template +void UnStackKernel(const Context &dev_ctx, + const DenseTensor &x, + int axis, + int num, + std::vector outs) { + auto *dy = &x; + auto dx = outs; + if (axis < 0) axis += dy->dims().size(); + + int n = dy->dims()[axis]; + std::vector dx_datas(n); // NOLINT + for (int i = 0; i < n; i++) { + dx_datas[i] = dev_ctx.template Alloc(dx[i]); + } + auto dy_data = dy->data(); + if (dy->numel() == 0) return; + int pre = 1; + for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; + int total_num = dy->numel(); + int post = total_num / (n * pre); + +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector device_dx_vec(dx_datas); + auto dx_data_arr = device_dx_vec.data().get(); +#else + auto dx_data_arr = dx_datas.data(); +#endif + phi::funcs::StackGradFunctorForRange( + dev_ctx, dx_data_arr, dy_data, total_num, n, post); +#if defined(__NVCC__) || defined(__HIPCC__) + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif +} + +} // namespace phi diff --git a/paddle/phi/kernels/range_kernel.h b/paddle/phi/kernels/range_kernel.h new file mode 100644 index 0000000000..c76308193a --- /dev/null +++ b/paddle/phi/kernels/range_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/stack_grad_kernel.h b/paddle/phi/kernels/stack_grad_kernel.h new file mode 100644 index 0000000000..32451e606f --- /dev/null +++ b/paddle/phi/kernels/stack_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/stack_kernel.h b/paddle/phi/kernels/stack_kernel.h new file mode 100644 index 0000000000..679c740630 --- /dev/null +++ b/paddle/phi/kernels/stack_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void StackKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/unique_kernel.h b/paddle/phi/kernels/unique_kernel.h new file mode 100644 index 0000000000..353570c8e7 --- /dev/null +++ b/paddle/phi/kernels/unique_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void UniqueKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts); + +template +void UniqueRawKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts); + +} // namespace phi diff --git a/paddle/phi/kernels/unstack_grad_kernel.h b/paddle/phi/kernels/unstack_grad_kernel.h new file mode 100644 index 0000000000..de0e3004d8 --- /dev/null +++ b/paddle/phi/kernels/unstack_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void UnStackGradKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/unstack_kernel.h b/paddle/phi/kernels/unstack_kernel.h new file mode 100644 index 0000000000..0494aa6327 --- /dev/null +++ b/paddle/phi/kernels/unstack_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void UnStackKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + int num, + std::vector outs); + +} // namespace phi diff --git a/paddle/phi/ops/compat/stack_sig.cc b/paddle/phi/ops/compat/stack_sig.cc new file mode 100644 index 0000000000..97768eb890 --- /dev/null +++ b/paddle/phi/ops/compat/stack_sig.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature StackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "stack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(stack_grad, phi::StackGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unique_sig.cc b/paddle/phi/ops/compat/unique_sig.cc new file mode 100644 index 0000000000..2a7ba54301 --- /dev/null +++ b/paddle/phi/ops/compat/unique_sig.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature UniqueOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  bool is_sorted = paddle::any_cast<bool>(ctx.Attr("is_sorted"));
+  if (is_sorted) {
+    return KernelSignature(
+        "unique",
+        {"X"},
+        {"return_index", "return_inverse", "return_counts", "axis", "dtype"},
+        {"Out", "Indices", "Index", "Counts"});
+  } else {
+    return KernelSignature("unique_raw",
+                           {"X"},
+                           {"return_index",
+                            "return_inverse",
+                            "return_counts",
+                            "axis",
+                            "dtype",
+                            "is_sorted"},
+                           {"Out", "Indices", "Index", "Counts"});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(unique, phi::UniqueOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/unstack_sig.cc b/paddle/phi/ops/compat/unstack_sig.cc
new file mode 100644
index 0000000000..41d7fc120a
--- /dev/null
+++ b/paddle/phi/ops/compat/unstack_sig.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature UnStackGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "unstack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(unstack_grad, phi::UnStackGradOpArgumentMapping);
--
GitLab
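Note on the unstack kernels earlier in this patch: the CPU branches of UnStackKernel and UnStackGradKernel reduce every copy to a pre x n x post index calculation, where pre is the product of the dimensions before `axis`, n is the length of the unstacked dimension, and post is the product of the dimensions after it. The following is a minimal, self-contained sketch of that splitting loop; the function and buffer names are illustrative only and are not Paddle symbols.

#include <cstring>
#include <vector>

// Reference splitting loop: treat `in` as a row-major [pre, n, post] block and
// scatter it into n outputs of shape [pre, post]. This mirrors the stride
// arithmetic used by the CPU path of the unstack kernels; it assumes T is a
// trivially copyable numeric type, as in the registered kernels.
template <typename T>
void UnstackReference(const T* in, std::vector<T*>* outs,
                      int pre, int n, int post) {
  size_t in_offset = 0;
  std::vector<size_t> out_offsets(n, 0);
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      // Copy one contiguous run of `post` elements into output j.
      std::memcpy((*outs)[j] + out_offsets[j], in + in_offset,
                  post * sizeof(T));
      out_offsets[j] += post;
      in_offset += post;
    }
  }
}

For example, unstacking a [2, 3, 4] tensor along axis 1 gives pre = 2, n = 3, post = 4, so each of the three outputs receives two contiguous runs of four elements.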