Unverified commit 74894cd7, authored by csy0225, committed by GitHub

fix conflict (#40851)

Parent e559fe41
@@ -14,6 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/range_op.h"
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle {
namespace operators {
@@ -22,51 +26,6 @@ class RangeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->HasInput("Start")) {
auto s_dims = ctx->GetInputDim("Start");
PADDLE_ENFORCE_EQ(
s_dims.size(), 1,
platform::errors::InvalidArgument(
"The dim of the shape of Input(Start) should be 1, but got %d",
s_dims.size()));
PADDLE_ENFORCE_EQ(s_dims[0], 1,
platform::errors::InvalidArgument(
"The first dim of the shape of Input(Start) should "
"be 1, but got %d",
s_dims[0]));
}
if (ctx->HasInput("End")) {
auto e_dims = ctx->GetInputDim("End");
PADDLE_ENFORCE_EQ(
e_dims.size(), 1,
platform::errors::InvalidArgument(
"The dim of the shape of Input(End) should be 1, but got %d",
e_dims.size()));
PADDLE_ENFORCE_EQ(e_dims[0], 1, platform::errors::InvalidArgument(
"The first dim of the shape of "
"Input(End) should be 1, but got %d",
e_dims[0]));
}
if (ctx->HasInput("Step")) {
auto step_dims = ctx->GetInputDim("Step");
PADDLE_ENFORCE_EQ(
step_dims.size(), 1,
platform::errors::InvalidArgument(
"The dim of the shape of Input(Step) should be 1, but got %d",
step_dims.size()));
PADDLE_ENFORCE_EQ(step_dims[0], 1,
platform::errors::InvalidArgument(
"The first dim of the shape of Input(Step) should "
"be 1, but got %d",
step_dims[0]));
}
ctx->SetOutputDim("Out", {-1});
}
protected:
framework::OpKernelType GetKernelTypeForVar(
    const std::string &var_name, const framework::Tensor &tensor,
@@ -101,7 +60,7 @@ class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);
REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel<int>,
                       ops::CPURangeKernel<float>, ops::CPURangeKernel<double>,
                       ops::CPURangeKernel<int64_t>);
DECLARE_INFER_SHAPE_FUNCTOR(range, RangeInferMetaFunctor,
                            PD_INFER_META(phi::RangeInferMeta));
REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker,
                             RangeInferMetaFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/range_op.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>
class CUDARangeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* start_t = context.Input<framework::Tensor>("Start");
auto* end_t = context.Input<framework::Tensor>("End");
auto* step_t = context.Input<framework::Tensor>("Step");
auto* out = context.Output<framework::Tensor>("Out");
T start = GetValue<T>(start_t);
T end = GetValue<T>(end_t);
T step = GetValue<T>(step_t);
int64_t size = 0;
GetSize(start, end, step, &size);
out->Resize(phi::make_ddim({size}));
T* out_data = out->mutable_data<T>(context.GetPlace());
auto stream = context.cuda_device_context().stream();
int block = std::min(size, static_cast<int64_t>(256));
int grid = (size + block - 1) / block;
RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel<int>,
ops::CUDARangeKernel<int64_t>,
ops::CUDARangeKernel<float>,
ops::CUDARangeKernel<double>);
@@ -30,7 +30,7 @@ limitations under the License. */
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP(range);
USE_OP_ITSELF(range);
USE_OP_DEVICE_KERNEL(range, NPU);
template <typename T>
...
@@ -12,9 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
@@ -26,52 +29,6 @@ class StackOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0,
platform::errors::InvalidArgument(
"Number of Inputs(X) must be larger than 0, but"
" received value is:%d.",
ctx->Inputs("X").size()));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true,
platform::errors::InvalidArgument(
"Output(Y) of stack_op should not be null."));
auto input_dims = ctx->GetInputsDim("X");
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
platform::errors::InvalidArgument(
"Dims of all Inputs(X) must be the same, but"
" received input %d dim is:%d not equal to input 0"
" dim:%d.",
i, input_dims[i], input_dims[0]));
}
// Only lod of X[0] would be shared with Y
ctx->ShareLoD("X", /*->*/ "Y");
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(
axis, -(rank + 1),
platform::errors::InvalidArgument(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d.",
rank, axis));
PADDLE_ENFORCE_LT(
axis, rank + 1,
platform::errors::InvalidArgument(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d",
rank, axis));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim("Y", phi::make_ddim(vec));
}
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const override {
  auto input_data_type =
@@ -168,21 +125,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
                  ops::StackGradOpMaker<paddle::framework::OpDesc>,
                  ops::StackGradOpMaker<paddle::imperative::OpBase>);
DECLARE_INFER_SHAPE_FUNCTOR(stack, StackInferMetaFunctor,
                            PD_INFER_META(phi::StackInferMeta));
REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
                  ops::StackGradOpMaker<paddle::framework::OpDesc>,
                  ops::StackGradOpMaker<paddle::imperative::OpBase>,
                  StackInferMetaFunctor);
REGISTER_OPERATOR(stack_grad, ops::StackOpGrad);
REGISTER_OP_CPU_KERNEL(
stack, ops::StackKernel<plat::CPUDeviceContext, float>,
ops::StackKernel<plat::CPUDeviceContext, double>,
ops::StackKernel<plat::CPUDeviceContext, int>,
ops::StackKernel<plat::CPUDeviceContext, int64_t>,
ops::StackKernel<plat::CPUDeviceContext, paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
stack_grad, ops::StackGradKernel<plat::CPUDeviceContext, float>,
ops::StackGradKernel<plat::CPUDeviceContext, double>,
ops::StackGradKernel<plat::CPUDeviceContext, int>,
ops::StackGradKernel<plat::CPUDeviceContext, int64_t>,
ops::StackGradKernel<plat::CPUDeviceContext, paddle::platform::bfloat16>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/operators/stack_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
namespace paddle {
namespace operators {
template <typename T, typename IntType>
__global__ void StackCUDAKernel(T** input_ptrs, int split_size, int rows,
int cols, T* __restrict__ output) {
IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
IntType split = grid_x / split_size;
const T* input_ptr = input_ptrs[split];
IntType col_offset = grid_x % split_size;
#pragma unroll
for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
output[grid_y * cols + grid_x] =
input_ptr[grid_y * split_size + col_offset];
}
}
}
template <typename T>
class StackGPUKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto x = ctx.MultiInput<Tensor>("X");
auto* y = ctx.Output<Tensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto* y_data = y->mutable_data<T>(ctx.GetPlace());
std::vector<const T*> x_datas(n);
for (int i = 0; i < n; i++) {
x_datas[i] = x[i]->data<T>();
}
auto& dev_ctx = ctx.template device_context<plat::CUDADeviceContext>();
auto tmp_x_data = memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*));
memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), platform::CPUPlace(),
reinterpret_cast<void*>(x_datas.data()),
x_datas.size() * sizeof(T*), dev_ctx.stream());
// Split x dim from axis to matrix
int x_row = 1, x_col = 1;
for (int i = 0; i < axis; ++i) {
x_row *= x[0]->dims()[i];
}
x_col = x[0]->numel() / x_row;
int out_col = x_col * n;
auto config = GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
if (y->numel() < std::numeric_limits<int32_t>::max()) {
StackCUDAKernel<T,
int32_t><<<config.block_per_grid, config.thread_per_block,
0, dev_ctx.stream()>>>(
reinterpret_cast<T**>(tmp_x_data->ptr()), x_col, x_row, out_col,
y_data);
} else {
StackCUDAKernel<T,
int64_t><<<config.block_per_grid, config.thread_per_block,
0, dev_ctx.stream()>>>(
reinterpret_cast<T**>(tmp_x_data->ptr()), x_col, x_row, out_col,
y_data);
}
}
};
template <typename T, typename IntType>
__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input,
int pre_dim_size, int split_dim_size,
int suf_dim_size, int num_split,
T** output_ptrs) {
assert(blockDim.y == 1);
assert(blockDim.z == 1);
// In this case they are equal
assert(split_dim_size % num_split == 0);
IntType size = pre_dim_size * split_dim_size * suf_dim_size;
IntType each_dim_size = split_dim_size / num_split;
for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size;
offset += blockDim.x * gridDim.x) {
IntType i = offset / (split_dim_size * suf_dim_size);
IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size;
IntType k = offset % suf_dim_size;
T* output = output_ptrs[j / each_dim_size];
if (output == nullptr) {
return;
}
IntType output_ind = i * each_dim_size * suf_dim_size +
(j % each_dim_size) * suf_dim_size + k;
*(output + output_ind) = input[offset];
}
}
template <typename T>
class StackGradGPUKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto dx = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
PADDLE_ENFORCE_EQ(n, dx.size(),
platform::errors::InvalidArgument(
"Output dx size should be equal to n, but"
" received n is:%d dx size is:%d.",
n, dx.size()));
// dx is output, so save each data address, then copy each dy into dx_data
std::vector<T*> outputs(n);
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
for (size_t j = 0; j < dx.size(); ++j) {
if (dx[j] == nullptr) {
outputs[j] = nullptr;
}
if (out_var_names[j] != framework::kEmptyVarName &&
dx[j]->numel() != 0UL) {
T* ptr = dx[j]->mutable_data<T>(ctx.GetPlace());
outputs[j] = ptr;
} else {
outputs[j] = nullptr;
}
}
auto dy_data = dy->data<T>();
// each dx should have same shape
int dy_pre = 1, dy_suf = 1;
auto dy_dims = dy->dims();
int split_dim = n;
for (int i = 0; i < axis; ++i) {
dy_pre *= dy_dims[i];
}
dy_suf = dy->numel() / (split_dim * dy_pre);
auto& dev_ctx = ctx.template device_context<plat::CUDADeviceContext>();
auto tmp_out_data = memory::Alloc(dev_ctx, outputs.size() * sizeof(T*));
memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), platform::CPUPlace(),
reinterpret_cast<void*>(outputs.data()),
outputs.size() * sizeof(T*), dev_ctx.stream());
auto config = GetGpuLaunchConfig1D(dev_ctx, dy_pre * split_dim * dy_suf);
if (dy->numel() < std::numeric_limits<int32_t>::max()) {
UnStackHelperCUDAKernel<
T, int32_t><<<config.block_per_grid.x, config.thread_per_block.x, 0,
dev_ctx.stream()>>>(
dy_data, dy_pre, split_dim, dy_suf, split_dim,
reinterpret_cast<T**>(tmp_out_data->ptr()));
} else {
UnStackHelperCUDAKernel<
T, int64_t><<<config.block_per_grid.x, config.thread_per_block.x, 0,
dev_ctx.stream()>>>(
dy_data, dy_pre, split_dim, dy_suf, split_dim,
reinterpret_cast<T**>(tmp_out_data->ptr()));
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(stack, ops::StackGPUKernel<float>,
ops::StackGPUKernel<double>, ops::StackGPUKernel<int>,
ops::StackGPUKernel<int64_t>,
ops::StackGPUKernel<plat::float16>,
ops::StackGPUKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradGPUKernel<float>,
ops::StackGradGPUKernel<double>,
ops::StackGradGPUKernel<int>,
ops::StackGradGPUKernel<int64_t>,
ops::StackGradGPUKernel<plat::float16>,
ops::StackGradGPUKernel<plat::bfloat16>);
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/stack_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
...
@@ -13,9 +13,9 @@
// limitations under the License.
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/stack_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
...
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/unique_op.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
@@ -25,62 +29,54 @@ class UniqueOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique");
auto in_dims = ctx->GetInputDim("X");
if (!ctx->Attrs().Get<bool>("is_sorted")) {
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
PADDLE_ENFORCE_EQ(in_dims.size(), 1,
platform::errors::InvalidArgument(
"The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d.",
in_dims.size()));
ctx->SetOutputDim("Out", {-1});
ctx->SetOutputDim("Index", in_dims);
return;
}
bool return_index = ctx->Attrs().Get<bool>("return_index");
bool return_inverse = ctx->Attrs().Get<bool>("return_inverse");
bool return_counts = ctx->Attrs().Get<bool>("return_counts");
auto axis_vec = ctx->Attrs().Get<std::vector<int>>("axis");
auto data_type =
    static_cast<phi::DataType>(static_cast<framework::proto::VarType::Type>(
        ctx->Attrs().Get<int>("dtype")));

// Construct MetaTensor for InferMeta Func
using CompatMetaTensor = framework::CompatMetaTensor;
CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime());
CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime());
std::unique_ptr<CompatMetaTensor> indices(nullptr);
std::unique_ptr<CompatMetaTensor> index(nullptr);
std::unique_ptr<CompatMetaTensor> counts(nullptr);
if (return_index) {
  OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique");
  indices =
      std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
          ctx->GetOutputVarPtrs("Indices")[0], ctx->IsRuntime())));
}
if (return_inverse) {
  OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
  index = std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
      ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime())));
}
if (return_counts) {
  OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique");
  counts = std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
      ctx->GetOutputVarPtrs("Counts")[0], ctx->IsRuntime())));
}
if (axis_vec.empty()) {
  ctx->SetOutputDim("Out", {-1});
  if (return_inverse) {
    ctx->SetOutputDim("Index", {phi::product(in_dims)});
  }
} else {
  int axis = axis_vec[0];
  if (axis < 0) {
    axis += in_dims.size();
  }
  PADDLE_ENFORCE_LT(
      axis, in_dims.size(),
      platform::errors::InvalidArgument("The axis(%d) should be less than "
                                        "the dimension size(%d) of x.",
                                        axis, in_dims.size()));
  auto out_dims = in_dims;
  out_dims[axis] = -1;
  ctx->SetOutputDim("Out", out_dims);
  if (return_inverse) {
    ctx->SetOutputDim("Index", {in_dims[axis]});
  }
}
if (return_index) {
  ctx->SetOutputDim("Indices", {-1});
}
if (return_counts) {
  ctx->SetOutputDim("Counts", {-1});
}
bool is_sorted = ctx->Attrs().Get<bool>("is_sorted");
if (is_sorted) {
  phi::UniqueInferMeta(x, return_index, return_inverse, return_counts,
                       axis_vec, data_type, &out, indices.get(),
                       index.get(), counts.get());
} else {
  OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
  if (index == nullptr) {
    index =
        std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
            ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime())));
  }
  phi::UniqueRawInferMeta(x, return_index, return_inverse, return_counts,
                          axis_vec, data_type, is_sorted, &out,
                          indices.get(), index.get(), counts.get());
}
}
@@ -152,40 +148,5 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);
REGISTER_OP_CPU_KERNEL(
unique, ops::UniqueKernel<paddle::platform::CPUDeviceContext, float>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, double>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, int32_t>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_VERSION(unique)
.AddCheckpoint(
R"ROC(
Upgrade unique, add 2 outputs [Indices, Counts] and 5 attribute
[return_index, return_inverse, return_counts, axis, is_sorted].
)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewOutput("Indices",
"The indices of the input tensor that result in the "
"unique tensor.")
.NewOutput("Counts", "The counts for each unique element.")
.NewAttr("return_index",
"If True, also return the indices of the input"
" tensor that result in the unique Tensor.",
false)
.NewAttr("return_inverse",
"If True, also return the indices for where elements"
" in the original input ended up in the returned unique "
"tensor.",
false)
.NewAttr("return_counts",
"If True, also return the counts for each unique element.",
false)
.NewAttr("axis",
"The axis to apply unique. If None, the input will be "
"flattened.",
std::vector<int>{})
.NewAttr("is_sorted",
"If True, the unique elements of X are in ascending order."
"Otherwise, the unique elements are not sorted.",
false));
@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/unstack_op.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
@@ -25,43 +27,6 @@ namespace operators {
class UnStackOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "UnStack");
int axis = ctx->Attrs().Get<int>("axis");
int num = ctx->Attrs().Get<int>("num");
auto x_dim = ctx->GetInputDim("X");
int rank = x_dim.size();
PADDLE_ENFORCE_GE(axis, -rank,
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis, rank,
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d",
rank));
if (axis < 0) axis += rank;
PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast<size_t>(num),
platform::errors::InvalidArgument(
"Number of Outputs(Y) is wrong. Got %d , but it must "
"equal to attribute num which is %d.",
ctx->Outputs("Y").size(), static_cast<size_t>(num)));
if (x_dim[axis] > 0) {
PADDLE_ENFORCE_EQ(
num, x_dim[axis],
platform::errors::InvalidArgument(
"The number of attribute num is not equal to the length of the "
"%d axis of Input(X). Expect %d but got %d.",
axis, x_dim[axis], num));
}
auto vec = phi::vectorize<int>(x_dim);
vec.erase(vec.begin() + axis);
ctx->SetOutputsDim("Y", std::vector<framework::DDim>( // NOLINT
x_dim[axis], phi::make_ddim(vec)));
}
};
class UnStackOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -141,20 +106,12 @@ class UnStackGradOp : public framework::OperatorWithKernel {
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker,
                  ops::UnStackGradOpMaker<paddle::framework::OpDesc>,
                  ops::UnStackGradOpMaker<paddle::imperative::OpBase>);
DECLARE_INFER_SHAPE_FUNCTOR(unstack, UnStackInferMetaFunctor,
                            PD_INFER_META(phi::UnStackInferMeta));
REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker,
                  ops::UnStackGradOpMaker<paddle::framework::OpDesc>,
                  ops::UnStackGradOpMaker<paddle::imperative::OpBase>,
                  UnStackInferMetaFunctor);
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp);
REGISTER_OP_CPU_KERNEL(unstack,
ops::UnStackKernel<plat::CPUDeviceContext, float>,
ops::UnStackKernel<plat::CPUDeviceContext, double>,
ops::UnStackKernel<plat::CPUDeviceContext, int>,
ops::UnStackKernel<plat::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(unstack_grad,
ops::UnStackGradKernel<plat::CPUDeviceContext, float>,
ops::UnStackGradKernel<plat::CPUDeviceContext, double>,
ops::UnStackGradKernel<plat::CPUDeviceContext, int>,
ops::UnStackGradKernel<plat::CPUDeviceContext, int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace paddle {
namespace operators {
template <typename VecXType, typename T>
struct StackFunctor {
HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post)
: x_(x), y_(y), n_(n), post_(post) {}
HOSTDEVICE void operator()(int idx) {
int i = idx / (n_ * post_);
int which_x = idx / post_ - i * n_;
int x_index = i * post_ + idx % post_;
y_[idx] = x_[which_x][x_index];
}
private:
VecXType x_;
T *y_;
int n_;
int post_;
};
template <typename VecDxType, typename T>
struct StackGradFunctor {
HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post)
: dx_(dx), dy_(dy), n_(n), post_(post) {}
HOSTDEVICE void operator()(int idx) {
int i = idx / (n_ * post_);
int which_x = idx / post_ - i * n_;
int x_index = i * post_ + idx % post_;
dx_[which_x][x_index] = dy_[idx];
}
private:
VecDxType dx_;
const T *dy_;
int n_;
int post_;
};
template <typename DeviceContext, typename VecXType, typename T>
static inline void StackFunctorForRange(const DeviceContext &ctx,
const VecXType &x, T *y, int total_num,
int n, int post) {
platform::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackFunctor<VecXType, T>(x, y, n, post));
}
template <typename DeviceContext, typename VecDxType, typename T>
static inline void StackGradFunctorForRange(const DeviceContext &ctx,
const VecDxType &dx, const T *dy,
int total_num, int n, int post) {
platform::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post));
}
template <typename DeviceContext, typename T>
class UnStackGradKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto x = ctx.MultiInput<Tensor>(framework::GradVarName("Y"));
auto *y = ctx.Output<Tensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
std::vector<const T *> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1;
int post = 1;
auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
#if defined(__NVCC__) || defined(__HIPCC__)
int total_num = pre * n * post;
auto &dev_ctx = ctx.template device_context<DeviceContext>();
thrust::device_vector<const T *> device_x_vec(x_datas);
auto x_data_arr = device_x_vec.data().get();
StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#else
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
#endif
}
};
template <typename DeviceContext, typename T>
class UnStackKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *dy = ctx.Input<Tensor>("X");
auto dx = ctx.MultiOutput<Tensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
std::vector<T *> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
}
auto dy_data = dy->data<T>();
if (dy->numel() == 0) return;
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
int total_num = dy->numel();
int post = total_num / (n * pre);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
#if defined(__NVCC__) || defined(__HIPCC__)
thrust::device_vector<T *> device_dx_vec(dx_datas);
auto dx_data_arr = device_dx_vec.data().get();
#else
auto dx_data_arr = dx_datas.data();
#endif
StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
#if defined(__NVCC__) || defined(__HIPCC__)
// Wait() must be called because device_dx_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#endif
}
};
} // namespace operators
} // namespace paddle
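The flat-index arithmetic shared by StackFunctor and StackGradFunctor above maps one output index onto (input tensor, offset) for data laid out as [pre, n, post]. A standalone check of that mapping (illustration only, not part of this commit):

#include <cassert>

int main() {
  // With n = 3 inputs and post = 4, flat index 17 sits in pre-block i = 1,
  // comes from input which_x = 1, and reads that input at offset x_index = 5.
  int n = 3, post = 4, idx = 17;
  int i = idx / (n * post);             // which pre-block
  int which_x = idx / post - i * n;     // which input tensor
  int x_index = i * post + idx % post;  // offset within that input
  assert(i == 1 && which_x == 1 && x_index == 5);
  return 0;
}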
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/unstack_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
...
@@ -44,6 +44,10 @@ namespace phi {
_PhiForEachDataTypeHelper_( \
    callback, ::phi::dtype::complex<double>, DataType::COMPLEX128);
#define _PhiForEachDataTypeTiny_(callback) \
  _PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \
  _PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64);
template <typename Visitor>
inline void VisitDataType(phi::DataType type, Visitor visitor) {
#define PhiVisitDataTypeCallback(cpp_type, data_type) \
@@ -59,4 +63,21 @@ inline void VisitDataType(phi::DataType type, Visitor visitor) {
PADDLE_THROW(phi::errors::Unimplemented(
    "Not supported phi::DataType(%d) as data type.", static_cast<int>(type)));
}
template <typename Visitor>
inline void VisitDataTypeTiny(phi::DataType type, Visitor visitor) {
#define PhiVisitDataTypeCallbackTiny(cpp_type, data_type) \
do { \
if (type == data_type) { \
visitor.template apply<cpp_type>(); \
return; \
} \
} while (0)
_PhiForEachDataTypeTiny_(PhiVisitDataTypeCallbackTiny);
#undef PhiVisitDataTypeCallbackTiny
PADDLE_THROW(phi::errors::Unimplemented(
"Not supported phi::DataType(%d) as data type.", static_cast<int>(type)));
}
} // namespace phi
@@ -1167,6 +1167,52 @@ void RnnInferMeta(const MetaTensor& x,
}
}
void StackInferMeta(const std::vector<MetaTensor*>& x,
int axis,
MetaTensor* out) {
PADDLE_ENFORCE_GT(x.size(),
0UL,
phi::errors::InvalidArgument(
"Number of Inputs(x) must be larger than 0, but"
" received value is:%d.",
x.size()));
const auto& input_dims = GetMetaTensorsDim(x);
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(input_dims[i],
input_dims[0],
phi::errors::InvalidArgument(
"Dims of all Inputs(X) must be the same, but"
" received input %d dim is:%d not equal to input 0"
" dim:%d.",
i,
input_dims[i],
input_dims[0]));
}
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(
axis,
-(rank + 1),
phi::errors::InvalidArgument(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d.",
rank,
axis));
PADDLE_ENFORCE_LT(
axis,
rank + 1,
phi::errors::InvalidArgument(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d",
rank,
axis));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
out->set_dims(phi::make_ddim(vec));
out->set_dtype(x.at(0)->dtype());
out->share_lod(*x.at(0));
}
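A quick sanity check of the shape rule implemented by StackInferMeta above: stacking N same-shaped tensors inserts a new dimension of size N at position axis. The helper below is an illustration only (StackedShape is a hypothetical name, not part of this commit):

#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors the rule in StackInferMeta: normalize a negative axis into
// [0, rank], then insert the number of inputs at that position.
std::vector<int64_t> StackedShape(std::vector<int64_t> dims, int64_t num_inputs,
                                  int axis) {
  if (axis < 0) axis += static_cast<int>(dims.size()) + 1;
  dims.insert(dims.begin() + axis, num_inputs);
  return dims;
}

int main() {
  // Stacking 3 tensors of shape [2, 4] along axis 1 gives [2, 3, 4].
  assert((StackedShape({2, 4}, 3, 1) == std::vector<int64_t>{2, 3, 4}));
  // axis = -1 appends a new last dimension: [2, 4, 3].
  assert((StackedShape({2, 4}, 3, -1) == std::vector<int64_t>{2, 4, 3}));
  return 0;
}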
void WarpctcInferMeta(const MetaTensor& logits,
                      const MetaTensor& label,
                      const paddle::optional<const MetaTensor&> logits_length,
...
@@ -231,6 +231,10 @@ void RnnInferMeta(const MetaTensor& x,
                  std::vector<MetaTensor*> state,
                  MetaTensor* reserve);
void StackInferMeta(const std::vector<MetaTensor*>& x,
int axis,
MetaTensor* out);
void WarpctcInferMeta(const MetaTensor& logits,
                      const MetaTensor& label,
                      const paddle::optional<const MetaTensor&> logits_length,
...
@@ -345,6 +345,56 @@ void PutAlongAxisInferMeta(const MetaTensor& x,
  out->set_dtype(x.dtype());
}
void RangeInferMeta(const MetaTensor& start,
const MetaTensor& end,
const MetaTensor& step,
MetaTensor* out) {
auto start_dims = start.dims();
auto end_dims = end.dims();
auto step_dims = step.dims();
PADDLE_ENFORCE_EQ(
start_dims.size(),
1,
phi::errors::InvalidArgument(
"The dim of the shape of Input(Start) should be 1, but got %d",
start_dims.size()));
PADDLE_ENFORCE_EQ(start_dims[0],
1,
phi::errors::InvalidArgument(
"The first dim of the shape of Input(Start) should "
"be 1, but got %d",
start_dims[0]));
PADDLE_ENFORCE_EQ(
end_dims.size(),
1,
phi::errors::InvalidArgument(
"The dim of the shape of Input(End) should be 1, but got %d",
end_dims.size()));
PADDLE_ENFORCE_EQ(
end_dims[0],
1,
phi::errors::InvalidArgument("The first dim of the shape of "
"Input(End) should be 1, but got %d",
end_dims[0]));
PADDLE_ENFORCE_EQ(
step_dims.size(),
1,
phi::errors::InvalidArgument(
"The dim of the shape of Input(Step) should be 1, but got %d",
step_dims.size()));
PADDLE_ENFORCE_EQ(step_dims[0],
1,
phi::errors::InvalidArgument(
"The first dim of the shape of Input(Step) should "
"be 1, but got %d",
step_dims[0]));
out->set_dims({-1});
out->set_dtype(start.dtype());
}
void RoiAlignInferMeta(const MetaTensor& x,
                       const MetaTensor& boxes,
                       paddle::optional<const MetaTensor&> boxes_num,
...
@@ -81,6 +81,11 @@ void PutAlongAxisInferMeta(const MetaTensor& x,
                           const std::string& reduce,
                           MetaTensor* out);
void RangeInferMeta(const MetaTensor& start,
const MetaTensor& end,
const MetaTensor& step,
MetaTensor* out);
void RoiAlignInferMeta(const MetaTensor& x,
                       const MetaTensor& boxes,
                       paddle::optional<const MetaTensor&> boxes_num,
...
@@ -2552,6 +2552,85 @@ void UnfoldInferMeta(const MetaTensor& x,
  out->set_dims(phi::make_ddim(out_dims));
}
void UniqueInferMeta(const MetaTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
MetaTensor* out,
MetaTensor* indices,
MetaTensor* index,
MetaTensor* counts) {
bool is_sorted = true;
UniqueRawInferMeta(x,
return_index,
return_inverse,
return_counts,
axis,
dtype,
is_sorted,
out,
indices,
index,
counts);
}
void UniqueRawInferMeta(const MetaTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
bool is_sorted,
MetaTensor* out,
MetaTensor* indices,
MetaTensor* index,
MetaTensor* counts) {
if (!is_sorted) {
PADDLE_ENFORCE_EQ(
x.dims().size(),
1,
phi::errors::InvalidArgument("The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d.",
x.dims().size()));
out->set_dims(phi::make_ddim({-1}));
index->set_dims(x.dims());
return;
}
if (axis.empty()) {
out->set_dims(phi::make_ddim({-1}));
if (return_inverse) {
index->set_dims(phi::make_ddim({phi::product(x.dims())}));
}
} else {
int axis_value = axis[0];
if (axis_value < 0) {
axis_value += x.dims().size();
}
PADDLE_ENFORCE_LT(
axis_value,
x.dims().size(),
phi::errors::InvalidArgument("The axis(%d) should be less than "
"the dimension size(%d) of x.",
axis_value,
x.dims().size()));
auto out_dims = x.dims();
out_dims[axis_value] = -1;
out->set_dims(out_dims);
if (return_inverse) {
index->set_dims(phi::make_ddim({x.dims()[axis_value]}));
}
}
if (return_index) {
indices->set_dims(phi::make_ddim({-1}));
}
if (return_counts) {
counts->set_dims(phi::make_ddim({-1}));
}
}
void UnsqueezeInferMeta(const MetaTensor& x,
                        const IntArray& axes,
                        MetaTensor* xshape,
@@ -2595,6 +2674,53 @@ void UnsqueezeInferMeta(const MetaTensor& x,
  xshape->set_dtype(x.dtype());
}
void UnStackInferMeta(const MetaTensor& x,
int axis,
int num,
std::vector<MetaTensor*> outs) {
auto x_dim = x.dims();
int rank = x_dim.size();
PADDLE_ENFORCE_GE(axis,
-rank,
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis,
rank,
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d",
rank));
if (axis < 0) axis += rank;
size_t output_count = outs.size();
PADDLE_ENFORCE_EQ(output_count,
static_cast<size_t>(num),
phi::errors::InvalidArgument(
"Number of Outputs(Y) is wrong. Got %d , but it must "
"equal to attribute num which is %d.",
output_count,
static_cast<size_t>(num)));
if (x_dim[axis] > 0) {
PADDLE_ENFORCE_EQ(
num,
x_dim[axis],
phi::errors::InvalidArgument(
"The number of attribute num is not equal to the length of the "
"%d axis of Input(X). Expect %d but got %d.",
axis,
x_dim[axis],
num));
}
auto vec = phi::vectorize<int>(x_dim);
vec.erase(vec.begin() + axis);
for (size_t i = 0; i < output_count; i++) {
outs[i]->set_dims(phi::make_ddim(vec));
outs[i]->set_dtype(x.dtype());
}
}
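And the inverse check for UnStackInferMeta above: unstacking removes the axis dimension and every output keeps the remaining shape. Illustration only (UnstackedShape is a hypothetical helper, not part of this commit):

#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors UnStackInferMeta: normalize a negative axis, then drop that
// dimension; each of the x_dim[axis] outputs gets the remaining shape.
std::vector<int64_t> UnstackedShape(std::vector<int64_t> dims, int axis) {
  if (axis < 0) axis += static_cast<int>(dims.size());
  dims.erase(dims.begin() + axis);
  return dims;
}

int main() {
  // Unstacking a [3, 2, 4] tensor along axis 0 yields 3 outputs of shape [2, 4].
  assert((UnstackedShape({3, 2, 4}, 0) == std::vector<int64_t>{2, 4}));
  return 0;
}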
void OneHotRawInferMeta(const MetaTensor& x,
                        int32_t depth,
                        DataType dtype,
...
@@ -360,12 +360,40 @@ void UnfoldInferMeta(const MetaTensor& x,
                     MetaTensor* out,
                     MetaConfig config = MetaConfig());
void UniqueInferMeta(const MetaTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
MetaTensor* out,
MetaTensor* indices,
MetaTensor* index,
MetaTensor* counts);
void UniqueRawInferMeta(const MetaTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
bool is_sorted,
MetaTensor* out,
MetaTensor* indices,
MetaTensor* index,
MetaTensor* counts);
void UnsqueezeInferMeta(const MetaTensor& x,
                        const IntArray& axes,
                        MetaTensor* xshape,
                        MetaTensor* out,
                        MetaConfig config = MetaConfig());
void UnStackInferMeta(const MetaTensor& x,
int axis,
int num,
std::vector<MetaTensor*> outs);
void OneHotRawInferMeta(const MetaTensor& x,
                        int32_t depth,
                        DataType dtype,
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/range_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/range_function.h"
namespace phi {
template <typename T, typename Context>
void RangeKernel(const Context& dev_ctx,
const DenseTensor& start,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out) {
T start_value = start.data<T>()[0];
T end_value = end.data<T>()[0];
T step_value = step.data<T>()[0];
int64_t size = 0;
phi::funcs::GetSize(start_value, end_value, step_value, &size);
out->Resize(phi::make_ddim({size}));
T* out_data = dev_ctx.template Alloc<T>(out);
T value = start_value;
for (int64_t i = 0; i < size; ++i) {
out_data[i] = value;
value += step_value;
}
}
} // namespace phi
PD_REGISTER_KERNEL(
range, CPU, ALL_LAYOUT, phi::RangeKernel, float, double, int, int64_t) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/stack_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
namespace phi {
template <typename T, typename Context>
void StackGradKernel(const Context& dev_ctx,
const DenseTensor& out,
int axis,
std::vector<DenseTensor*> x_grad) {
if (axis < 0) axis += out.dims().size();
int n = out.dims()[axis];
std::vector<T*> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
if (x_grad[i] == nullptr) {
dx_datas[i] = nullptr;
} else {
dx_datas[i] = dev_ctx.template Alloc<T>(x_grad[i]);
}
}
auto dy_data = out.data<T>();
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= out.dims()[i];
int total_num = out.numel();
int post = total_num / (n * pre);
auto dx_data_arr = dx_datas.data();
phi::funcs::StackGradFunctorForRange(
dev_ctx, dx_data_arr, dy_data, total_num, n, post);
}
} // namespace phi
PD_REGISTER_KERNEL(stack_grad,
CPU,
ALL_LAYOUT,
phi::StackGradKernel,
float,
double,
int64_t,
int,
phi::dtype::bfloat16) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/stack_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void StackKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* out) {
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
T* y_data = dev_ctx.template Alloc<T>(out);
std::vector<const T*> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1, post = 1;
auto& dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(
y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
}
} // namespace phi
PD_REGISTER_KERNEL(stack,
CPU,
ALL_LAYOUT,
phi::StackKernel,
float,
double,
int,
int64_t,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unique_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/unique_functor.h"
namespace phi {
template <typename T, typename Context>
void UniqueKernel(const Context& context,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts) {
bool is_sorted = true;
UniqueRawKernel<T, Context>(context,
x,
return_index,
return_inverse,
return_counts,
axis,
dtype,
is_sorted,
out,
indices,
index,
counts);
}
template <typename T, typename Context>
void UniqueRawKernel(const Context& context,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
bool is_sorted,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts) {
if (dtype == phi::DataType::INT32) {
PADDLE_ENFORCE_LE(
x.numel(),
INT_MAX,
phi::errors::InvalidArgument(
"The number of elements in Input(X) should be less than or "
"equal to INT_MAX, but received num is %d. Please set `dtype` to "
"int64.",
x.numel()));
}
if (!is_sorted) {
phi::VisitDataType(
dtype,
phi::funcs::UniqueOpFunctor<Context, T>(context, out, index, &x));
return;
}
if (x.numel() == 0) {
context.template Alloc<T>(out);
return;
}
if (axis.empty()) {
phi::VisitDataTypeTiny(
dtype,
phi::funcs::UniqueFlattendTensorFunctor<Context, T>(context,
x,
out,
indices,
index,
counts,
return_index,
return_inverse,
return_counts));
} else {
int axis_value = axis[0];
phi::VisitDataTypeTiny(
dtype,
phi::funcs::UniqueDimFunctor<Context, T>(context,
x,
out,
indices,
index,
counts,
axis_value,
return_index,
return_inverse,
return_counts));
}
}
} // namespace phi
PD_REGISTER_KERNEL(unique,
CPU,
ALL_LAYOUT,
phi::UniqueKernel,
float,
double,
int32_t,
int64_t) {}
PD_REGISTER_KERNEL(unique_raw,
CPU,
ALL_LAYOUT,
phi::UniqueRawKernel,
float,
double,
int32_t,
int64_t) {}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -12,21 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/unstack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    unstack, ops::UnStackKernel<plat::CUDADeviceContext, float>,
    ops::UnStackKernel<plat::CUDADeviceContext, double>,
    ops::UnStackKernel<plat::CUDADeviceContext, int>,
    ops::UnStackKernel<plat::CUDADeviceContext, int64_t>,
    ops::UnStackKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
    unstack_grad, ops::UnStackGradKernel<plat::CUDADeviceContext, float>,
    ops::UnStackGradKernel<plat::CUDADeviceContext, double>,
    ops::UnStackGradKernel<plat::CUDADeviceContext, int>,
    ops::UnStackGradKernel<plat::CUDADeviceContext, int64_t>,
    ops::UnStackGradKernel<plat::CUDADeviceContext, plat::float16>);
#include "paddle/phi/kernels/unstack_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h"
PD_REGISTER_KERNEL(unstack_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::UnStackGradKernel,
                   float,
                   double,
                   int,
                   int64_t) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/unstack_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_kernel_impl.h"
PD_REGISTER_KERNEL(
unstack, CPU, ALL_LAYOUT, phi::UnStackKernel, float, double, int, int64_t) {
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/enforce.h"
namespace phi {
namespace funcs {
template <typename T>
void GetSize(T start, T end, T step, int64_t* size) {
PADDLE_ENFORCE_NE(
step,
0,
phi::errors::InvalidArgument("The step of range op should not be 0."));
if (start < end) {
PADDLE_ENFORCE_GT(
step,
0,
phi::errors::InvalidArgument(
"The step should be greater than 0 while start < end."));
}
if (start > end) {
PADDLE_ENFORCE_LT(step,
0,
phi::errors::InvalidArgument(
"The step should be less than 0 while start > end."));
}
*size = std::is_integral<T>::value
? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step))
: std::ceil(std::abs((end - start) / step));
}
} // namespace funcs
} // namespace phi
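A quick numeric check of the formula in GetSize above, for the integral branch: the element count is (|end - start| + |step| - 1) / |step|, i.e. the ceiling of |end - start| / |step| in integer arithmetic. Illustration only, not part of this commit:

#include <cassert>
#include <cstdint>
#include <cstdlib>

int main() {
  // start = 0, end = 10, step = 3 -> (10 + 3 - 1) / 3 = 4 elements: 0, 3, 6, 9.
  int64_t start = 0, end = 10, step = 3;
  int64_t size =
      (std::llabs(end - start) + std::llabs(step) - 1) / std::llabs(step);
  assert(size == 4);
  return 0;
}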
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -14,12 +14,29 @@
#pragma once
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {

#include "paddle/phi/kernels/funcs/for_range.h"
namespace phi {
namespace funcs {
template <typename VecXType, typename T>
struct StackFunctor {
HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post)
: x_(x), y_(y), n_(n), post_(post) {}
HOSTDEVICE void operator()(int idx) {
int i = idx / (n_ * post_);
int which_x = idx / post_ - i * n_;
int x_index = i * post_ + idx % post_;
y_[idx] = x_[which_x][x_index];
}
private:
VecXType x_;
T *y_;
int n_;
int post_;
};
template <typename VecDxType, typename T> template <typename VecDxType, typename T>
struct StackGradFunctor { struct StackGradFunctor {
...@@ -40,81 +57,27 @@ struct StackGradFunctor { ...@@ -40,81 +57,27 @@ struct StackGradFunctor {
int post_; int post_;
}; };
template <typename DeviceContext, typename VecXType, typename T>
static inline void StackFunctorForRange(const DeviceContext &ctx,
const VecXType &x,
T *y,
int total_num,
int n,
int post) {
phi::funcs::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackFunctor<VecXType, T>(x, y, n, post));
}
template <typename DeviceContext, typename VecDxType, typename T> template <typename DeviceContext, typename VecDxType, typename T>
static inline void StackGradFunctorForRange(const DeviceContext &ctx, static inline void StackGradFunctorForRange(const DeviceContext &ctx,
const VecDxType &dx, const T *dy, const VecDxType &dx,
int total_num, int n, int post) { const T *dy,
platform::ForRange<DeviceContext> for_range(ctx, total_num); int total_num,
int n,
int post) {
phi::funcs::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post)); for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post));
} }
template <typename DeviceContext, typename T> } // namespace funcs
class StackKernel : public framework::OpKernel<T> { } // namespace phi
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto x = ctx.MultiInput<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
std::vector<const T *> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1, post = 1;
auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
}
};
template <typename DeviceContext, typename T>
class StackGradKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto dx = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
std::vector<T *> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
if (dx[i] == nullptr) {
dx_datas[i] = nullptr;
} else {
dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
}
}
auto dy_data = dy->data<T>();
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
int total_num = dy->numel();
int post = total_num / (n * pre);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
auto dx_data_arr = dx_datas.data();
StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace funcs {
template <typename Context, typename InT>
struct UniqueOpFunctor {
const Context& context_;
DenseTensor* out_;
DenseTensor* index_;
const DenseTensor* in_;
DenseTensor* count_;
UniqueOpFunctor(const Context& context,
DenseTensor* out,
DenseTensor* index,
const DenseTensor* in,
DenseTensor* count = nullptr)
: context_(context), out_(out), index_(index), in_(in), count_(count) {}
template <typename IndexT>
void apply() const {
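    // Single pass over the input: `dict` maps each value to the slot where it
    // first appeared, `uniq` keeps values in first-occurrence order, and
    // index_data records, for every element, the slot of its unique value.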
auto* in_data = in_->data<InT>();
auto* index_data = context_.template Alloc<IndexT>(index_);
int64_t j = 0;
// TODO(fangzeyang): Should optimize performance here.
std::unordered_map<InT, int64_t> dict;
std::vector<InT> uniq;
PADDLE_ENFORCE_LT(
in_->numel(),
pow(2, 31),
phi::errors::InvalidArgument(
"The num of Input(X) elements should be less then INT_MAX, "
"but received num is %d.",
in_->numel()));
for (auto i = 0; i < in_->numel(); i++) {
auto it = dict.find(in_data[i]);
if (it == dict.end()) {
dict.emplace(std::make_pair(in_data[i], j));
uniq.emplace_back(in_data[i]);
index_data[i] = static_cast<IndexT>(j);
j++;
} else {
index_data[i] = static_cast<IndexT>(it->second);
}
}
if (count_ != nullptr) {
// Resize the count tensor dims to allocate the memory
count_->Resize(phi::make_ddim({static_cast<int64_t>(uniq.size())}));
IndexT* count_data = context_.template Alloc<IndexT>(count_);
// init count_data to 0
memset(count_data, 0, uniq.size() * sizeof(IndexT));
const auto& index_type = index_->dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
phi::errors::InvalidArgument(
"Index holds the wrong type, it holds %s, "
"but desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT64))));
if (index_type == DataType::INT32) {
for (auto i = 0; i < in_->numel(); ++i) {
const IndexT& index = index_data[i];
count_data[static_cast<int32_t>(index)] += static_cast<IndexT>(1);
}
} else {
for (auto i = 0; i < in_->numel(); ++i) {
const IndexT& index = index_data[i];
count_data[static_cast<int64_t>(index)] += static_cast<IndexT>(1);
}
}
}
out_->Resize(phi::make_ddim({static_cast<int64_t>(uniq.size())}));
auto* out_data = context_.template Alloc<InT>(out_);
std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT));
}
};
static std::vector<DenseTensor> Unbind(const DenseTensor& in) {
int64_t size = in.dims()[0];
std::vector<DenseTensor> tensors(size);
for (int64_t i = 0; i < size; ++i) {
tensors[i] = in.Slice(i, i + 1);
}
return tensors;
}
template <typename T>
static bool Equal(const DenseTensor& a, const DenseTensor& b) {
if (a.numel() != b.numel()) {
return false;
}
for (int64_t i = 0; i < a.numel(); ++i) {
if (a.data<T>()[i] != b.data<T>()[i]) {
return false;
}
}
return true;
}
template <typename Context, typename InT, typename IndexT>
static void UniqueFlattendTensor(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* count,
bool return_index,
bool return_inverse,
bool return_counts) {
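  // std::set orders and deduplicates the flattened input; the optional outputs
  // are rebuilt with hash maps. For illustration, in = [2, 1, 2] yields
  // out = [1, 2], indices (first occurrences) = [1, 0], inverse index =
  // [1, 0, 1], and counts = [1, 2].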
const InT* in_data = in.data<InT>();
std::set<InT> unique(in_data, in_data + in.numel());
out->Resize(phi::make_ddim({static_cast<int64_t>(unique.size())}));
auto* out_data = context.template Alloc<InT>(out);
std::copy(unique.begin(), unique.end(), out_data);
if (return_index) {
indices->Resize(phi::make_ddim({out->numel()}));
auto indices_data = context.template Alloc<IndexT>(indices);
std::unordered_map<InT, IndexT> indices_map;
indices_map.reserve(out->numel());
for (int64_t i = 0; i < in.numel(); ++i) {
if (indices_map.find(in_data[i]) != indices_map.end()) continue;
indices_map[in_data[i]] = i;
}
for (int64_t i = 0; i < out->numel(); ++i) {
indices_data[i] = indices_map[out_data[i]];
}
}
if (return_inverse) {
index->Resize(phi::make_ddim({in.numel()}));
auto inverse_data = context.template Alloc<IndexT>(index);
std::unordered_map<InT, IndexT> inverse_map;
inverse_map.reserve(out->numel());
for (int64_t i = 0; i < out->numel(); ++i) {
inverse_map[out_data[i]] = i;
}
for (int64_t i = 0; i < in.numel(); ++i) {
inverse_data[i] = inverse_map[in_data[i]];
}
}
if (return_counts) {
count->Resize(phi::make_ddim({out->numel()}));
auto count_data = context.template Alloc<IndexT>(count);
std::unordered_map<InT, IndexT> counts_map;
counts_map.reserve(out->numel());
for (int64_t i = 0; i < out->numel(); ++i) {
counts_map[out_data[i]] = 0;
}
for (int64_t i = 0; i < in.numel(); i++) {
counts_map[in_data[i]] += 1;
}
for (int64_t i = 0; i < out->numel(); i++) {
count_data[i] = counts_map[out_data[i]];
}
}
}
template <typename Context, typename ForwardIt, typename InT, typename IndexT>
static ForwardIt UniqueDimImpl(const Context& context,
ForwardIt first,
ForwardIt last,
const std::vector<IndexT>& sorted_indices_vec,
std::vector<IndexT>* inverse_vec,
std::vector<IndexT>* counts_vec,
std::vector<IndexT>* indices_vec) {
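  // Deduplicates the already-sorted row tensors in [first, last) in place,
  // std::unique-style: `result` tracks the last kept row, while inverse_vec,
  // counts_vec, and indices_vec collect the inverse mapping, the per-row
  // counts, and the original position of each kept row.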
if (first == last) {
return last;
}
(*inverse_vec)[sorted_indices_vec[0]] = 0;
(*counts_vec)[0] = 1;
(*indices_vec)[0] = sorted_indices_vec[0];
ForwardIt begin = first;
ForwardIt result = first;
while (++first != last) {
int64_t idx_first = std::distance(begin, first);
int64_t idx_result = std::distance(begin, result);
if (!Equal<InT>(*result, *first)) {
if (++result != first) {
*result = std::move(*first);
}
idx_result += 1;
(*indices_vec)[idx_result] = sorted_indices_vec[idx_first];
}
(*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result;
(*counts_vec)[idx_result] += 1;
}
return ++result;
}
template <typename Context, typename InT, typename IndexT>
static void UniqueDim(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* count,
bool return_index,
bool return_inverse,
bool return_counts,
int axis) {
// transpose tensor: e.g. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
std::vector<int> permute(in.dims().size());
std::iota(permute.begin(), permute.end(), 0);
permute[axis] = 0;
permute[0] = axis;
std::vector<int64_t> in_trans_dims_vec(phi::vectorize(in.dims()));
in_trans_dims_vec[axis] = in.dims()[0];
in_trans_dims_vec[0] = in.dims()[axis];
DenseTensor in_trans;
phi::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec);
in_trans.Resize(in_trans_dims);
context.template Alloc<InT>(&in_trans);
TransCompute<Context, InT>(in.dims().size(), context, in, &in_trans, permute);
// reshape tensor: e.g. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
phi::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1);
in_trans.Resize(in_trans_flat_dims);
// sort indices
std::vector<IndexT> sorted_indices_vec(in_trans.dims()[0]);
std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0);
int64_t col = in_trans.dims()[1];
const InT* in_trans_data = in_trans.data<InT>();
std::sort(sorted_indices_vec.begin(),
sorted_indices_vec.end(),
[&](int64_t a, int64_t b) -> bool {
for (int64_t i = 0; i < col; ++i) {
InT lhs = in_trans_data[i + a * col];
InT rhs = in_trans_data[i + b * col];
if (lhs < rhs) {
return true;
} else if (lhs > rhs) {
return false;
}
}
return false;
});
// sort tensor according to indices
DenseTensor input_sorted;
input_sorted.Resize(in_trans_dims);
context.template Alloc<InT>(&input_sorted);
InT* input_sorted_data = input_sorted.data<InT>();
for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
memcpy(input_sorted_data + i * col,
in_trans_data + static_cast<int64_t>(sorted_indices_vec[i]) * col,
col * sizeof(InT));
}
std::vector<DenseTensor> input_unbind = Unbind(input_sorted);
std::vector<IndexT> inverse_vec(sorted_indices_vec.size(), 0);
std::vector<IndexT> counts_vec(sorted_indices_vec.size(), 0);
std::vector<IndexT> indices_vec(sorted_indices_vec.size(), 0);
auto last = UniqueDimImpl<Context, std::vector<DenseTensor>::iterator, InT>(
context,
input_unbind.begin(),
input_unbind.end(),
sorted_indices_vec,
&inverse_vec,
&counts_vec,
&indices_vec);
input_unbind.erase(last, input_unbind.end());
counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end());
indices_vec.erase(indices_vec.begin() + input_unbind.size(),
indices_vec.end());
phi::funcs::ConcatFunctor<Context, InT> concat_functor;
DenseTensor out_trans;
std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
out_trans_dims_vec[0] = input_unbind.size();
out_trans.Resize(phi::make_ddim(out_trans_dims_vec));
context.template Alloc<InT>(&out_trans);
std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
out->Resize(phi::make_ddim(out_trans_dims_vec));
context.template Alloc<InT>(out);
concat_functor(context, input_unbind, 0, &out_trans);
TransCompute<Context, InT>(
out_trans.dims().size(), context, out_trans, out, permute);
if (return_inverse) {
paddle::framework::TensorFromVector(inverse_vec, context, index);
}
if (return_counts) {
paddle::framework::TensorFromVector(counts_vec, context, count);
}
if (return_index) {
paddle::framework::TensorFromVector(indices_vec, context, indices);
}
}
template <typename Context, typename InT>
struct UniqueFlattendTensorFunctor {
const Context& ctx_;
const DenseTensor& in_;
DenseTensor* out_;
DenseTensor* indices_;
DenseTensor* index_;
DenseTensor* count_;
const bool return_index_;
const bool return_inverse_;
const bool return_counts_;
UniqueFlattendTensorFunctor(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* count,
bool return_index,
bool return_inverse,
bool return_counts)
: ctx_(context),
in_(in),
out_(out),
indices_(indices),
index_(index),
count_(count),
return_index_(return_index),
return_inverse_(return_inverse),
return_counts_(return_counts) {}
template <typename IndexT>
void apply() const {
UniqueFlattendTensor<Context, InT, IndexT>(ctx_,
in_,
out_,
indices_,
index_,
count_,
return_index_,
return_inverse_,
return_counts_);
}
};
template <typename Context, typename InT>
struct UniqueDimFunctor {
const Context& ctx_;
const DenseTensor& in_;
DenseTensor* out_;
DenseTensor* indices_;
DenseTensor* index_;
DenseTensor* count_;
const int axis_;
const bool return_index_;
const bool return_inverse_;
const bool return_counts_;
UniqueDimFunctor(const Context& context,
const DenseTensor& in,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* count,
const int axis,
bool return_index,
bool return_inverse,
bool return_counts)
: ctx_(context),
in_(in),
out_(out),
indices_(indices),
index_(index),
count_(count),
axis_(axis),
return_index_(return_index),
return_inverse_(return_inverse),
return_counts_(return_counts) {}
template <typename IndexT>
void apply() const {
UniqueDim<Context, InT, IndexT>(ctx_,
in_,
out_,
indices_,
index_,
count_,
return_index_,
return_inverse_,
return_counts_,
axis_);
}
};
} // namespace funcs
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/range_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/range_function.h"
namespace phi {
template <typename T>
__global__ void Range(T start, T step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T, typename Context>
void RangeKernel(const Context& dev_ctx,
const DenseTensor& start,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out) {
T start_value = start.data<T>()[0];
T end_value = end.data<T>()[0];
T step_value = step.data<T>()[0];
int64_t size = 0;
phi::funcs::GetSize(start_value, end_value, step_value, &size);
out->Resize(phi::make_ddim({size}));
T* out_data = dev_ctx.template Alloc<T>(out);
auto stream = dev_ctx.stream();
int block = std::min(size, static_cast<int64_t>(256));
int grid = (size + block - 1) / block;
Range<T><<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
}
} // namespace phi
PD_REGISTER_KERNEL(
range, GPU, ALL_LAYOUT, phi::RangeKernel, float, double, int64_t, int) {
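  // start/end/step are read on the host via data<T>()[0] in RangeKernel, so
  // these inputs stay on CPU even for the GPU kernel.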
kernel->InputAt(0).SetBackend(phi::Backend::CPU);
kernel->InputAt(1).SetBackend(phi::Backend::CPU);
kernel->InputAt(2).SetBackend(phi::Backend::CPU);
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/stack_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename IntType>
__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input,
int pre_dim_size,
int split_dim_size,
int suf_dim_size,
int num_split,
T** output_ptrs) {
assert(blockDim.y == 1);
assert(blockDim.z == 1);
// In this case split_dim_size equals num_split, so the division below is exact
assert(split_dim_size % num_split == 0);
IntType size = pre_dim_size * split_dim_size * suf_dim_size;
IntType each_dim_size = split_dim_size / num_split;
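  // A flat offset into the [pre, split, suf] view decomposes into (i, j, k);
  // row j is routed to output_ptrs[j / each_dim_size].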
for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size;
offset += blockDim.x * gridDim.x) {
IntType i = offset / (split_dim_size * suf_dim_size);
IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size;
IntType k = offset % suf_dim_size;
T* output = output_ptrs[j / each_dim_size];
if (output == nullptr) {
return;
}
IntType output_ind = i * each_dim_size * suf_dim_size +
(j % each_dim_size) * suf_dim_size + k;
*(output + output_ind) = input[offset];
}
}
template <typename T, typename Context>
void StackGradKernel(const Context& dev_ctx,
const DenseTensor& out,
int axis,
std::vector<DenseTensor*> x_grad) {
if (axis < 0) axis += out.dims().size();
int n = out.dims()[axis];
PADDLE_ENFORCE_EQ(n,
x_grad.size(),
phi::errors::InvalidArgument(
"Output x_grad size should be equal to n, but"
" received n is:%d x_grad size is:%d.",
n,
x_grad.size()));
// x_grad is the output: collect each output's data pointer first, then copy the corresponding slices of dy into them
std::vector<T*> outputs(n);
for (size_t j = 0; j < x_grad.size(); ++j) {
if (x_grad[j] == nullptr) {
outputs[j] = nullptr;
continue;
}
if (x_grad[j]->numel() != 0UL) {
T* ptr = dev_ctx.template Alloc<T>(x_grad[j]);
outputs[j] = ptr;
} else {
outputs[j] = nullptr;
}
}
auto dy_data = out.data<T>();
// each x_grad should have the same shape
int dy_pre = 1, dy_suf = 1;
auto dy_dims = out.dims();
int split_dim = n;
for (int i = 0; i < axis; ++i) {
dy_pre *= dy_dims[i];
}
dy_suf = out.numel() / (split_dim * dy_pre);
auto tmp_out_data =
paddle::memory::Alloc(dev_ctx, outputs.size() * sizeof(T*));
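  // Stage the host-side array of output pointers in device memory so the
  // kernel can index it by split id.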
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_out_data->ptr(),
phi::CPUPlace(),
reinterpret_cast<void*>(outputs.data()),
outputs.size() * sizeof(T*),
dev_ctx.stream());
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, dy_pre * split_dim * dy_suf);
if (out.numel() < std::numeric_limits<int32_t>::max()) {
UnStackHelperCUDAKernel<T, int32_t><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
dy_data,
dy_pre,
split_dim,
dy_suf,
split_dim,
reinterpret_cast<T**>(tmp_out_data->ptr()));
} else {
UnStackHelperCUDAKernel<T, int64_t><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
dy_data,
dy_pre,
split_dim,
dy_suf,
split_dim,
reinterpret_cast<T**>(tmp_out_data->ptr()));
}
}
} // namespace phi
PD_REGISTER_KERNEL(stack_grad,
GPU,
ALL_LAYOUT,
phi::StackGradKernel,
float,
double,
int64_t,
int,
phi::dtype::float16,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/stack_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename IntType>
__global__ void StackCUDAKernel(T** input_ptrs,
int split_size,
int rows,
int cols,
T* __restrict__ output) {
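  // The output is a [rows, cols] matrix where cols = split_size * number of
  // inputs; column grid_x reads from input_ptrs[grid_x / split_size] at
  // column grid_x % split_size within that input.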
IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x;
for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) {
IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y;
IntType split = grid_x / split_size;
const T* input_ptr = input_ptrs[split];
IntType col_offset = grid_x % split_size;
#pragma unroll
for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) {
output[grid_y * cols + grid_x] =
input_ptr[grid_y * split_size + col_offset];
}
}
}
template <typename T, typename Context>
void StackKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* out) {
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
T* y_data = dev_ctx.template Alloc<T>(out);
std::vector<const T*> x_datas(n);
for (int i = 0; i < n; i++) {
x_datas[i] = x[i]->data<T>();
}
auto tmp_x_data = paddle::memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_x_data->ptr(),
phi::CPUPlace(),
reinterpret_cast<void*>(x_datas.data()),
x_datas.size() * sizeof(T*),
dev_ctx.stream());
// Split the dims of x at `axis` into a [x_row, x_col] matrix
int x_row = 1, x_col = 1;
for (int i = 0; i < axis; ++i) {
x_row *= x[0]->dims()[i];
}
x_col = x[0]->numel() / x_row;
int out_col = x_col * n;
auto config =
phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
if (out->numel() < std::numeric_limits<int32_t>::max()) {
StackCUDAKernel<T, int32_t><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
reinterpret_cast<T**>(tmp_x_data->ptr()),
x_col,
x_row,
out_col,
y_data);
} else {
StackCUDAKernel<T, int64_t><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
reinterpret_cast<T**>(tmp_x_data->ptr()),
x_col,
x_row,
out_col,
y_data);
}
}
} // namespace phi
PD_REGISTER_KERNEL(stack,
GPU,
ALL_LAYOUT,
phi::StackKernel,
float,
double,
int64_t,
int,
phi::dtype::float16,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unstack_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h"
PD_REGISTER_KERNEL(unstack_grad,
GPU,
ALL_LAYOUT,
phi::UnStackGradKernel,
float,
double,
int64_t,
int,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unstack_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_kernel_impl.h"
PD_REGISTER_KERNEL(unstack,
GPU,
ALL_LAYOUT,
phi::UnStackKernel,
float,
double,
int64_t,
int,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace phi {
template <typename T, typename Context>
void UnStackGradKernel(const Context &dev_ctx,
const std::vector<const DenseTensor *> &x,
int axis,
DenseTensor *x_grad) {
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto *x_grad_data = dev_ctx.template Alloc<T>(x_grad);
std::vector<const T *> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1;
int post = 1;
auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
#if defined(__NVCC__) || defined(__HIPCC__)
int total_num = pre * n * post;
thrust::device_vector<const T *> device_x_vec(x_datas);
auto x_data_arr = device_x_vec.data().get();
phi::funcs::StackFunctorForRange(
dev_ctx, x_data_arr, x_grad_data, total_num, n, post);
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#else
auto x_data_arr = x_datas.data();
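  // CPU fallback: interleave the n input slices into the [pre, n, post]
  // layout of x_grad with plain memcpy.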
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(
x_grad_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
#endif
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace phi {
template <typename T, typename Context>
void UnStackKernel(const Context &dev_ctx,
const DenseTensor &x,
int axis,
int num,
std::vector<DenseTensor *> outs) {
auto *dy = &x;
auto dx = outs;
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
std::vector<T *> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
dx_datas[i] = dev_ctx.template Alloc<T>(dx[i]);
}
auto dy_data = dy->data<T>();
if (dy->numel() == 0) return;
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
int total_num = dy->numel();
int post = total_num / (n * pre);
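  // dy is viewed as [pre, n, post]; StackGradFunctorForRange scatters the
  // i-th slice along the stacked axis into dx[i].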
#if defined(__NVCC__) || defined(__HIPCC__)
thrust::device_vector<T *> device_dx_vec(dx_datas);
auto dx_data_arr = device_dx_vec.data().get();
#else
auto dx_data_arr = dx_datas.data();
#endif
phi::funcs::StackGradFunctorForRange(
dev_ctx, dx_data_arr, dy_data, total_num, n, post);
#if defined(__NVCC__) || defined(__HIPCC__)
// Wait() must be called because device_dx_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#endif
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void RangeKernel(const Context& dev_ctx,
const DenseTensor& start,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void StackGradKernel(const Context& dev_ctx,
const DenseTensor& out,
int axis,
std::vector<DenseTensor*> x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void StackKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UniqueKernel(const Context& dev_ctx,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts);
template <typename T, typename Context>
void UniqueRawKernel(const Context& dev_ctx,
const DenseTensor& x,
bool return_index,
bool return_inverse,
bool return_counts,
const std::vector<int>& axis,
DataType dtype,
bool is_sorted,
DenseTensor* out,
DenseTensor* indices,
DenseTensor* index,
DenseTensor* counts);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UnStackGradKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UnStackKernel(const Context& dev_ctx,
const DenseTensor& x,
int axis,
int num,
std::vector<DenseTensor*> outs);
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature StackGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature(
"stack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(stack_grad, phi::StackGradOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature UniqueOpArgumentMapping(const ArgumentMappingContext& ctx) {
bool is_sorted = paddle::any_cast<bool>(ctx.Attr("is_sorted"));
if (is_sorted) {
return KernelSignature(
"unique",
{"X"},
{"return_index", "return_inverse", "return_counts", "axis", "dtype"},
{"Out", "Indices", "Index", "Counts"});
} else {
return KernelSignature("unique_raw",
{"X"},
{"return_index",
"return_inverse",
"return_counts",
"axis",
"dtype",
"is_sorted"},
{"Out", "Indices", "Index", "Counts"});
}
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(unique, phi::UniqueOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature UnStackGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"unstack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(unstack_grad, phi::UnStackGradOpArgumentMapping);