Unverified commit dba694f4, authored by Leo Chen, committed by GitHub

[phi] move unbind to phi (#39789)

* move unbind to phi

* revert infer shape

* add header file

* move concat_and_split to phi
Parent: 1a1a2ce8
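
Reviewer note (not part of the diff): the recurring call-site change in this commit is that the free-function templates phi::ConcatImpl / phi::SplitImpl are replaced by the functors declared in paddle/phi/kernels/funcs/concat_and_split_functor.h. A minimal sketch of the new pattern, assuming an already-initialized phi::CPUContext and tensors prepared by the caller (the wrapper function name is illustrative only):

#include <vector>
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"

template <typename T>
void ConcatWithPhiFunctor(const phi::CPUContext& ctx,
                          const std::vector<phi::DenseTensor>& inputs,
                          int axis,
                          phi::DenseTensor* out) {
  // Before this commit: phi::ConcatImpl<T, platform::CPUDeviceContext>(ctx, inputs, axis, out);
  phi::funcs::ConcatFunctor<phi::CPUContext, T> concat;
  concat(ctx, inputs, axis, out);  // SplitFunctor<Context, T> is invoked the same way.
}
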
...@@ -6,9 +6,9 @@ endif() ...@@ -6,9 +6,9 @@ endif()
# please add new math_library in alphabetical order # please add new math_library in alphabetical order
if (WITH_ASCEND_CL) if (WITH_ASCEND_CL)
math_library(concat_and_split DEPS npu_op_runner) math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
else() else()
math_library(concat_and_split) math_library(concat_and_split DEPS concat_and_split_functor)
endif() endif()
math_library(context_project DEPS im2col math_function) math_library(context_project DEPS im2col math_function)
math_library(cross_entropy) math_library(cross_entropy)
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/phi/kernels/cpu/concat_and_split.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif #endif
...@@ -46,9 +46,8 @@ class ConcatFunctor<platform::CPUDeviceContext, T> { ...@@ -46,9 +46,8 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const std::vector<framework::Tensor>& input, int axis, const std::vector<framework::Tensor>& input, int axis,
framework::Tensor* output) { framework::Tensor* output) {
std::vector<phi::DenseTensor> pt_input{input.begin(), input.end()}; phi::funcs::ConcatFunctor<phi::CPUContext, T> functor;
phi::ConcatImpl<T, platform::CPUDeviceContext>(context, pt_input, axis, functor(context, input, axis, output);
output);
} }
}; };
...@@ -63,11 +62,8 @@ class SplitFunctor<platform::CPUDeviceContext, T> { ...@@ -63,11 +62,8 @@ class SplitFunctor<platform::CPUDeviceContext, T> {
const framework::Tensor& input, const framework::Tensor& input,
const std::vector<const framework::Tensor*>& ref_inputs, const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs) { const int axis, std::vector<framework::Tensor*>* outputs) {
std::vector<const phi::DenseTensor*> pt_ref_inputs{ref_inputs.begin(), phi::funcs::SplitFunctor<phi::CPUContext, T> functor;
ref_inputs.end()}; functor(context, input, ref_inputs, axis, outputs);
std::vector<phi::DenseTensor*> pt_outputs{outputs->begin(), outputs->end()};
phi::SplitImpl<T, platform::CPUDeviceContext>(context, input, pt_ref_inputs,
axis, &pt_outputs);
} }
}; };
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/phi/kernels/gpu/concat_and_split.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -29,10 +29,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -29,10 +29,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const std::vector<framework::Tensor>& input, int axis, const std::vector<framework::Tensor>& input, int axis,
framework::Tensor* output) { framework::Tensor* output) {
std::vector<phi::DenseTensor> pt_input{input.begin(), input.end()}; phi::funcs::ConcatFunctor<phi::GPUContext, T> functor;
functor(context, input, axis, output);
phi::ConcatImpl<T, platform::CUDADeviceContext>(context, pt_input, axis,
output);
} }
}; };
...@@ -43,16 +41,12 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -43,16 +41,12 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
template <typename T> template <typename T>
class SplitFunctor<platform::CUDADeviceContext, T> { class SplitFunctor<platform::CUDADeviceContext, T> {
public: public:
SplitFunctor();
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
const std::vector<const framework::Tensor*>& ref_inputs, const std::vector<const framework::Tensor*>& ref_inputs,
int axis, std::vector<framework::Tensor*>* outputs) { int axis, std::vector<framework::Tensor*>* outputs) {
std::vector<const phi::DenseTensor*> pt_ref_inputs{ref_inputs.begin(), phi::funcs::SplitFunctor<phi::GPUContext, T> functor;
ref_inputs.end()}; functor(context, input, ref_inputs, axis, outputs);
std::vector<phi::DenseTensor*> pt_outputs{outputs->begin(), outputs->end()};
phi::SplitImpl<T, platform::CUDADeviceContext>(
context, input, pt_ref_inputs, axis, &pt_outputs);
} }
}; };
......
...@@ -64,17 +64,3 @@ class SplitFunctor { ...@@ -64,17 +64,3 @@ class SplitFunctor {
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#define FOR_ALL_TYPES(macro) \
macro(int); \
macro(float); \
macro(double); \
macro(bool); \
macro(int64_t); \
macro(int16_t); \
macro(uint8_t); \
macro(int8_t); \
macro(::paddle::platform::float16); \
macro(::paddle::platform::bfloat16); \
macro(::paddle::platform::complex<float>); \
macro(::paddle::platform::complex<double>);
...@@ -14,6 +14,9 @@ limitations under the License. */ ...@@ -14,6 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/unbind_op.h" #include "paddle/fluid/operators/unbind_op.h"
#include <string> #include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -79,11 +82,3 @@ namespace ops = paddle::operators; ...@@ -79,11 +82,3 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(unbind, ops::UnbindOp, ops::UnbindOpMaker, REGISTER_OPERATOR(unbind, ops::UnbindOp, ops::UnbindOpMaker,
ops::UnbindGradMaker<paddle::framework::OpDesc>, ops::UnbindGradMaker<paddle::framework::OpDesc>,
ops::UnbindGradMaker<paddle::imperative::OpBase>); ops::UnbindGradMaker<paddle::imperative::OpBase>);
namespace plat = paddle::platform;
REGISTER_OP_CPU_KERNEL(
unbind, ops::UnbindOpKernel<plat::CPUDeviceContext, double>,
ops::UnbindOpKernel<plat::CPUDeviceContext, float>,
ops::UnbindOpKernel<plat::CPUDeviceContext, int64_t>,
ops::UnbindOpKernel<plat::CPUDeviceContext, int>,
ops::UnbindOpKernel<plat::CPUDeviceContext, plat::float16>,
ops::UnbindOpKernel<plat::CPUDeviceContext, plat::bfloat16>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/unbind_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
unbind, ops::UnbindOpKernel<plat::CUDADeviceContext, double>,
ops::UnbindOpKernel<plat::CUDADeviceContext, float>,
ops::UnbindOpKernel<plat::CUDADeviceContext, int64_t>,
ops::UnbindOpKernel<plat::CUDADeviceContext, int>,
ops::UnbindOpKernel<plat::CUDADeviceContext, plat::float16>,
ops::UnbindOpKernel<plat::CUDADeviceContext, plat::bfloat16>);
...@@ -34,27 +34,6 @@ static inline framework::DDim UnbindOutsDims(const framework::DDim in_dims, ...@@ -34,27 +34,6 @@ static inline framework::DDim UnbindOutsDims(const framework::DDim in_dims,
} }
return phi::make_ddim(out_dims); return phi::make_ddim(out_dims);
} }
template <typename DeviceContext, typename T>
class UnbindOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
int axis = ctx.Attr<int>("axis");
auto in_dims = in->dims();
axis = axis < 0 ? in_dims.size() + axis : axis;
std::vector<const framework::Tensor*> shape_refer;
for (size_t j = 0; j < outs.size(); ++j) {
outs[j]->mutable_data<T>(ctx.GetPlace());
shape_refer.emplace_back(outs[j]);
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::SplitFunctor<DeviceContext, T> functor;
functor(dev_ctx, *in, shape_refer, axis, &outs);
}
};
template <typename T> template <typename T>
class UnbindGradMaker : public framework::SingleGradOpMaker<T> { class UnbindGradMaker : public framework::SingleGradOpMaker<T> {
......
...@@ -485,6 +485,25 @@ void SplitInferMeta(const MetaTensor& x, ...@@ -485,6 +485,25 @@ void SplitInferMeta(const MetaTensor& x,
} }
} }
void UnbindInferMeta(const MetaTensor& x,
int axis,
std::vector<MetaTensor>* outs) {
auto in_dims = x.dims();
std::vector<int> out_dim;
axis = axis < 0 ? in_dims.size() + axis : axis;
for (int i = 0; i < in_dims.size(); ++i) {
if (i != axis) out_dim.push_back(in_dims[i]);
}
auto out_dims = phi::make_ddim(out_dim);
for (size_t i = 0; i < outs->size(); ++i) {
(*outs)[i].set_dtype(x.dtype());
(*outs)[i].set_dims(out_dims);
(*outs)[i].set_layout(x.layout());
(*outs)[i].share_lod(x);
}
}
void TraceInferMeta( void TraceInferMeta(
const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) { const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) {
int dim1 = axis1; int dim1 = axis1;
......
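
Reviewer note (not part of the diff): UnbindInferMeta above simply drops the axis dimension and stamps the same dims, dtype, layout and LoD on every output. A quick worked example with assumed shapes:

// x.dims() = [2, 3, 4], axis = 1 (axis = -2 normalizes to 1)
// out_dim  = [2, 4]  -> each of the 3 outputs in *outs gets dims [2, 4],
//                       plus x's dtype, layout and shared LoD.
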
...@@ -90,6 +90,9 @@ void SplitInferMeta(const MetaTensor& x_meta, ...@@ -90,6 +90,9 @@ void SplitInferMeta(const MetaTensor& x_meta,
std::vector<MetaTensor>* out, std::vector<MetaTensor>* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void UnbindInferMeta(const MetaTensor& x,
int axis,
std::vector<MetaTensor>* outs);
void TraceInferMeta( void TraceInferMeta(
const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out);
......
...@@ -10,7 +10,7 @@ add_subdirectory(funcs) ...@@ -10,7 +10,7 @@ add_subdirectory(funcs)
set_property(GLOBAL PROPERTY PTEN_KERNELS "") set_property(GLOBAL PROPERTY PTEN_KERNELS "")
set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor)
# remove this dep after removing fluid deps on tensor creation # remove this dep after removing fluid deps on tensor creation
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
/*
* \brief Concatenate the input tensors along the dimension axis.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input[0] = [[1,2],[3,4]]
* Input[1] = [[5,6]]
* axis = 0
*
* Output = [[1,2],
* [3,4],
* [5,6]]
*/
template <typename T, typename Context>
void ConcatImpl(const Context& context,
const std::vector<DenseTensor>& input,
int axis,
DenseTensor* output) {
// TODO(zcd): Add input data validity checking
size_t num = input.size();
int64_t rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int64_t out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(input.size());
for (size_t i = 0; i < num; ++i) {
int64_t t_cols = input[i].numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
auto cpu_place = context.GetPlace();
// computation
auto output_data = output->data<T>();
int64_t col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int64_t col_len = input_cols[j];
auto input_data = input[j].data<T>();
for (int64_t k = 0; k < out_rows; ++k) {
paddle::memory::Copy(cpu_place,
output_data + k * out_cols + col_idx,
cpu_place,
input_data + k * col_len,
sizeof(T) * col_len);
}
col_idx += col_len;
}
}
/*
* \brief Split the input tensors along the dimension axis into outputs.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input = [[1,2],
* [3,4],
* [5,6]]
* axis = 0
*
* Output[0] = [[1,2],[3,4]]
* Output[1] = [[5,6]]
*/
template <typename T, typename Context>
void SplitImpl(const Context& context,
const DenseTensor& input,
const std::vector<const DenseTensor*>& ref_inputs,
const int axis,
std::vector<DenseTensor*>* outputs) {
// NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3
// tensors of shape [0,1,4]
if (input.numel() == 0) {
return;
}
// TODO(zcd): Add input data validity checking
size_t num = outputs->size();
int input_rows = 1;
auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size());
for (size_t i = 0; i < num; ++i) {
int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto cpu_place = context.GetPlace();
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
paddle::memory::Copy(cpu_place,
dst_ptr,
cpu_place,
src_ptr + col_idx,
sizeof(T) * col_len);
}
col_idx += col_len;
}
}
}
} // namespace phi
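
Reviewer note (not part of the diff): the deleted ConcatImpl/SplitImpl above (and the CPU functors they move into later in this diff) flatten every tensor into a 2-D view: rows = product of the dimensions before axis, cols = numel / rows, then copy row by row. A worked example with assumed shapes:

// Concat at axis = 1 of Input[0] with dims [2, 3] and Input[1] with dims [2, 5]:
//   rows = 2, input_cols = {3, 5}, out_cols = 8
//   for each of the 2 rows, 3 elements are copied from Input[0] and 5 from
//   Input[1] into the corresponding output row (a plain per-row memcpy).
// Split is the inverse: each source row is scattered into the per-output rows.
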
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/kernels/cpu/concat_and_split.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h" #include "paddle/phi/kernels/funcs/concat_funcs.h"
namespace phi { namespace phi {
...@@ -104,7 +104,8 @@ void ConcatKernel(const Context& dev_ctx, ...@@ -104,7 +104,8 @@ void ConcatKernel(const Context& dev_ctx,
continue; continue;
} }
} }
ConcatImpl<T, Context>(dev_ctx, inputs, axis, out); phi::funcs::ConcatFunctor<Context, T> functor;
functor(dev_ctx, inputs, axis, out);
} }
} }
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/cpu/concat_and_split.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
...@@ -54,7 +54,8 @@ void SplitKernel(const Context& dev_ctx, ...@@ -54,7 +54,8 @@ void SplitKernel(const Context& dev_ctx,
paddle::operators::StridedMemcpyWithAxis0<T>( paddle::operators::StridedMemcpyWithAxis0<T>(
dev_ctx, x, shape_refer, &outs); dev_ctx, x, shape_refer, &outs);
} else { } else {
SplitImpl<T, Context>(dev_ctx, x, shape_refer, axis, &outs); phi::funcs::SplitFunctor<Context, T> functor;
functor(dev_ctx, x, shape_refer, axis, &outs);
} }
} }
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unbind_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unbind_kernel_impl.h"
PD_REGISTER_KERNEL(unbind,
CPU,
ALL_LAYOUT,
phi::UnbindKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16,
int,
int64_t) {}
...@@ -3,3 +3,4 @@ add_subdirectory(blas) ...@@ -3,3 +3,4 @@ add_subdirectory(blas)
add_subdirectory(lapack) add_subdirectory(lapack)
math_library(math_function DEPS blas dense_tensor tensor) math_library(math_function DEPS blas dense_tensor tensor)
math_library(concat_and_split_functor DEPS dense_tensor)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
namespace phi {
namespace funcs {
/*
* All tensors' dimension should be the same and the values of
* each dimension must be the same, except the axis dimension.
*/
template <typename T>
struct ConcatFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
// TODO(zcd): Add input data validity checking
size_t num = input.size();
int64_t rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int64_t out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(input.size());
for (size_t i = 0; i < num; ++i) {
int64_t t_cols = input[i].numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
auto cpu_place = context.GetPlace();
// computation
auto output_data = output->data<T>();
int64_t col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int64_t col_len = input_cols[j];
auto input_data = input[j].data<T>();
for (int64_t k = 0; k < out_rows; ++k) {
paddle::memory::Copy(cpu_place,
output_data + k * out_cols + col_idx,
cpu_place,
input_data + k * col_len,
sizeof(T) * col_len);
}
col_idx += col_len;
}
}
};
/*
* All tensors' dimension should be the same and the values of
* each dimension must be the same, except the axis dimension.
*/
template <typename T>
struct SplitFunctor<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
int axis,
std::vector<phi::DenseTensor*>* outputs) {
// NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3
// tensors of shape [0,1,4]
if (input.numel() == 0) {
return;
}
// TODO(zcd): Add input data validity checking
size_t num = outputs->size();
int input_rows = 1;
auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size());
for (size_t i = 0; i < num; ++i) {
int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto cpu_place = context.GetPlace();
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
paddle::memory::Copy(cpu_place,
dst_ptr,
cpu_place,
src_ptr + col_idx,
sizeof(T) * col_len);
}
col_idx += col_len;
}
}
}
};
#define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<phi::CPUContext, type>; \
template class SplitFunctor<phi::CPUContext, type>;
FOR_ALL_TYPES(DEFINE_FUNCTOR);
} // namespace funcs
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
// You may obtain a copy of the License at You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and See the License for the specific language governing permissions and
// limitations under the License. limitations under the License. */
#pragma once #include <cmath>
#include <algorithm> #include <memory>
#include <vector> #include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
namespace phi { namespace phi {
namespace funcs {
template <typename T> template <typename T>
__global__ void ConcatKernel_(const T** inputs, __global__ void ConcatKernel_(const T** inputs,
...@@ -264,26 +271,28 @@ static inline void GetBlockDims(const phi::GPUContext& context, ...@@ -264,26 +271,28 @@ static inline void GetBlockDims(const phi::GPUContext& context,
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension must be the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T, typename Context>
void ConcatImpl(const Context& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
// TODO(zcd): Add input data validity checking
int in_num = input.size();
int64_t in_row = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
in_row *= dim_0[i];
}
int64_t in_col = input[0].numel() / in_row;
int64_t out_row = in_row, out_col = 0;
int inputs_col_num = in_num + 1; template <typename T>
std::vector<const T*> inputs_data_vec(in_num); struct ConcatFunctor<phi::GPUContext, T> {
std::vector<int64_t> inputs_col_vec(inputs_col_num); void operator()(const phi::GPUContext& context,
const T** inputs_data = inputs_data_vec.data(); const std::vector<phi::DenseTensor>& input,
int64_t* inputs_col = inputs_col_vec.data(); int axis,
phi::DenseTensor* output) {
// TODO(zcd): Add input data validity checking
int in_num = input.size();
int64_t in_row = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
in_row *= dim_0[i];
}
int64_t in_col = input[0].numel() / in_row;
int64_t out_row = in_row, out_col = 0;
int inputs_col_num = in_num + 1;
std::vector<const T*> inputs_data_vec(in_num);
std::vector<int64_t> inputs_col_vec(inputs_col_num);
const T** inputs_data = inputs_data_vec.data();
int64_t* inputs_col = inputs_col_vec.data();
// There are some differences between hip runtime and NV runtime. // There are some differences between hip runtime and NV runtime.
// In NV, when the pageable memory data less than 64K is transferred from // In NV, when the pageable memory data less than 64K is transferred from
...@@ -293,152 +302,151 @@ void ConcatImpl(const Context& context, ...@@ -293,152 +302,151 @@ void ConcatImpl(const Context& context,
// 3.2.6.1. Concurrent Execution between Host and Device // 3.2.6.1. Concurrent Execution between Host and Device
// Memory copies from host to device of a memory block of 64 KB or less // Memory copies from host to device of a memory block of 64 KB or less
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::memory::AllocationPtr data_alloc, col_alloc; paddle::memory::AllocationPtr data_alloc, col_alloc;
// TODO(chentianyu03): try to find a method to remove the Alloc function // TODO(chentianyu03): try to find a method to remove the Alloc function
data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
in_num * sizeof(T*)); in_num * sizeof(T*));
inputs_data = reinterpret_cast<const T**>(data_alloc->ptr()); inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
// TODO(chentianyu03): try to find a method to remove the Alloc function // TODO(chentianyu03): try to find a method to remove the Alloc function
col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
inputs_col_num * sizeof(int)); inputs_col_num * sizeof(int));
inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr()); inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
#endif #endif
inputs_col[0] = 0; inputs_col[0] = 0;
bool has_same_shape = true; bool has_same_shape = true;
for (int i = 0; i < in_num; ++i) { for (int i = 0; i < in_num; ++i) {
int64_t t_cols = input[i].numel() / in_row; int64_t t_cols = input[i].numel() / in_row;
if (has_same_shape) { if (has_same_shape) {
if (t_cols != in_col) has_same_shape = false; if (t_cols != in_col) has_same_shape = false;
}
out_col += t_cols;
inputs_col[i + 1] = out_col;
inputs_data[i] = input[i].data<T>();
} }
out_col += t_cols;
inputs_col[i + 1] = out_col;
inputs_data[i] = input[i].data<T>();
}
dim3 block_dims; dim3 block_dims;
dim3 grid_dims; dim3 grid_dims;
GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims); GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims);
paddle::memory::allocation::AllocationPtr tmp_dev_ins_data; paddle::memory::allocation::AllocationPtr tmp_dev_ins_data;
const T** dev_ins_data = nullptr; const T** dev_ins_data = nullptr;
if (!has_same_shape || in_num < 2 || in_num > 4) { if (!has_same_shape || in_num < 2 || in_num > 4) {
tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*)); tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
inputs_data, in_num); inputs_data, in_num);
paddle::memory::Copy(context.GetPlace(), paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_data->ptr(), tmp_dev_ins_data->ptr(),
phi::CPUPlace(), paddle::platform::CPUPlace(),
restored, restored,
in_num * sizeof(T*), in_num * sizeof(T*),
context.stream()); context.stream());
dev_ins_data = reinterpret_cast<const T**>(tmp_dev_ins_data->ptr()); dev_ins_data = reinterpret_cast<const T**>(tmp_dev_ins_data->ptr());
} }
if (has_same_shape) {
if (in_num == 2) {
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0],
inputs_data[1],
in_col,
out_row,
out_col,
output->data<T>());
} else if (in_num == 3) {
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0],
inputs_data[1],
inputs_data[2],
in_col,
out_row,
out_col,
output->data<T>());
} else if (in_num == 4) {
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0],
inputs_data[1],
inputs_data[2],
inputs_data[3],
in_col,
out_row,
out_col,
output->data<T>());
} else {
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
dev_ins_data, in_num, in_col, out_row, out_col, output->data<T>());
}
} else {
auto tmp_dev_ins_col_data =
paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
inputs_col, inputs_col_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
paddle::platform::CPUPlace(),
restored,
inputs_col_num * sizeof(int64_t),
context.stream());
int64_t* dev_ins_col_data =
static_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
if (has_same_shape) {
if (in_num == 2) {
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0],
inputs_data[1],
in_col,
out_row,
out_col,
output->data<T>());
} else if (in_num == 3) {
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0],
inputs_data[1],
inputs_data[2],
in_col,
out_row,
out_col,
output->data<T>());
} else if (in_num == 4) {
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>( ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0], dev_ins_data,
inputs_data[1], dev_ins_col_data,
inputs_data[2], static_cast<int>(inputs_col_num),
inputs_data[3],
in_col,
out_row, out_row,
out_col, out_col,
output->data<T>()); output->data<T>());
} else {
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
dev_ins_data, in_num, in_col, out_row, out_col, output->data<T>());
} }
} else {
auto tmp_dev_ins_col_data =
paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
inputs_col, inputs_col_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
phi::CPUPlace(),
restored,
inputs_col_num * sizeof(int64_t),
context.stream());
int64_t* dev_ins_col_data =
static_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
dev_ins_data,
dev_ins_col_data,
static_cast<int>(inputs_col_num),
out_row,
out_col,
output->data<T>());
}
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
// Prevent the pinned memory value from being covered and release the memory // Prevent the pinned memory value from being covered and release the memory
// after the launch kernel of the stream is executed (reapply pinned memory // after the launch kernel of the stream is executed (reapply pinned memory
// next time) // next time)
auto* data_alloc_released = data_alloc.release(); auto* data_alloc_released = data_alloc.release();
auto* col_alloc_released = col_alloc.release(); auto* col_alloc_released = col_alloc.release();
context.AddStreamCallback([data_alloc_released, col_alloc_released] { context.AddStreamCallback([data_alloc_released, col_alloc_released] {
paddle::memory::allocation::Allocator::AllocationDeleter( paddle::memory::allocation::Allocator::AllocationDeleter(
data_alloc_released); data_alloc_released);
paddle::memory::allocation::Allocator::AllocationDeleter( paddle::memory::allocation::Allocator::AllocationDeleter(
col_alloc_released); col_alloc_released);
}); });
#endif #endif
}
/*
* All tensors' dimension should be the same and the values of
* each dimension must be the same, except the axis dimension.
*/
template <typename T, typename Context>
void SplitImpl(const Context& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
int axis,
std::vector<phi::DenseTensor*>* outputs) {
// NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3
// tensors of shape [0,1,4]
if (input.numel() == 0) {
return;
} }
};
// TODO(zcd): Add input data validity checking template <typename T>
int o_num = outputs->size(); class SplitFunctor<phi::GPUContext, T> {
int64_t out_row = 1; public:
auto dim_0 = ref_inputs[0]->dims(); void operator()(const phi::GPUContext& context,
for (int i = 0; i < axis; ++i) { const phi::DenseTensor& input,
out_row *= dim_0[i]; const std::vector<const phi::DenseTensor*>& ref_inputs,
} int axis,
std::vector<phi::DenseTensor*>* outputs) {
// NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3
// tensors of shape [0,1,4]
if (input.numel() == 0) {
return;
}
// TODO(zcd): Add input data validity checking
int o_num = outputs->size();
int64_t out_row = 1;
auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
out_row *= dim_0[i];
}
int64_t out0_col = ref_inputs[0]->numel() / out_row; int64_t out0_col = ref_inputs[0]->numel() / out_row;
int64_t in_col = 0, in_row = out_row; int64_t in_col = 0, in_row = out_row;
bool has_same_shape = true; bool has_same_shape = true;
int outputs_cols_num = o_num + 1; int outputs_cols_num = o_num + 1;
std::vector<T*> outputs_data_vec(o_num); std::vector<T*> outputs_data_vec(o_num);
std::vector<int64_t> outputs_cols_vec(outputs_cols_num); std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
T** outputs_data = outputs_data_vec.data(); T** outputs_data = outputs_data_vec.data();
int64_t* outputs_cols = outputs_cols_vec.data(); int64_t* outputs_cols = outputs_cols_vec.data();
// There are some differences between hip runtime and NV runtime. // There are some differences between hip runtime and NV runtime.
// In NV, when the pageable memory data less than 64K is transferred from // In NV, when the pageable memory data less than 64K is transferred from
...@@ -448,120 +456,129 @@ void SplitImpl(const Context& context, ...@@ -448,120 +456,129 @@ void SplitImpl(const Context& context,
// 3.2.6.1. Concurrent Execution between Host and Device // 3.2.6.1. Concurrent Execution between Host and Device
// Memory copies from host to device of a memory block of 64 KB or less // Memory copies from host to device of a memory block of 64 KB or less
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::memory::AllocationPtr data_alloc, cols_alloc; paddle::memory::AllocationPtr data_alloc, cols_alloc;
// TODO(chentianyu03): try to find a method to remove the Alloc function // TODO(chentianyu03): try to find a method to remove the Alloc function
data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
o_num * sizeof(T*)); o_num * sizeof(T*));
outputs_data = reinterpret_cast<T**>(data_alloc->ptr()); outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
// TODO(chentianyu03): try to find a method to remove the Alloc function // TODO(chentianyu03): try to find a method to remove the Alloc function
cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
(outputs_cols_num) * sizeof(int64_t)); (outputs_cols_num) * sizeof(int64_t));
outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr()); outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
#endif #endif
outputs_cols[0] = 0; outputs_cols[0] = 0;
for (int i = 0; i < o_num; ++i) { for (int i = 0; i < o_num; ++i) {
int64_t t_col = ref_inputs.at(i)->numel() / out_row; int64_t t_col = ref_inputs.at(i)->numel() / out_row;
if (has_same_shape) { if (has_same_shape) {
if (t_col != out0_col) has_same_shape = false; if (t_col != out0_col) has_same_shape = false;
} }
in_col += t_col; in_col += t_col;
outputs_cols[i + 1] = in_col; outputs_cols[i + 1] = in_col;
if (outputs->at(i) != nullptr) { if (outputs->at(i) != nullptr) {
outputs_data[i] = outputs->at(i)->data<T>(); outputs_data[i] = outputs->at(i)->data<T>();
} else { } else {
outputs_data[i] = nullptr; outputs_data[i] = nullptr;
}
} }
}
dim3 block_dims; dim3 block_dims;
dim3 grid_dims; dim3 grid_dims;
GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims); GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims);
paddle::memory::allocation::AllocationPtr tmp_dev_outs_data;
T** dev_out_gpu_data = nullptr;
if (!has_same_shape || o_num < 2 || o_num > 4) {
// TODO(chentianyu03): try to find a method to remove the Alloc function
tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
outputs_data, o_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_outs_data->ptr(),
paddle::platform::CPUPlace(),
restored,
o_num * sizeof(T*),
context.stream());
dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr());
}
paddle::memory::allocation::AllocationPtr tmp_dev_outs_data; if (has_same_shape) {
T** dev_out_gpu_data = nullptr; if (o_num == 2) {
if (!has_same_shape || o_num < 2 || o_num > 4) { SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
// TODO(chentianyu03): try to find a method to remove the Alloc function input.data<T>(),
tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*)); in_row,
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( in_col,
outputs_data, o_num); out0_col,
paddle::memory::Copy(context.GetPlace(), outputs_data[0],
tmp_dev_outs_data->ptr(), outputs_data[1]);
phi::CPUPlace(), } else if (o_num == 3) {
restored, SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
o_num * sizeof(T*), input.data<T>(),
context.stream()); in_row,
dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr()); in_col,
} out0_col,
outputs_data[0],
outputs_data[1],
outputs_data[2]);
} else if (o_num == 4) {
SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(),
in_row,
in_col,
out0_col,
outputs_data[0],
outputs_data[1],
outputs_data[2],
outputs_data[3]);
} else {
SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
}
} else {
auto tmp_dev_ins_col_data =
// TODO(chentianyu03): try to find a method to remove the Alloc
// function
paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
outputs_cols, outputs_cols_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
paddle::platform::CPUPlace(),
restored,
outputs_cols_num * sizeof(int64_t),
context.stream());
int64_t* dev_outs_col_data =
reinterpret_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
if (has_same_shape) {
if (o_num == 2) {
SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>( SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(), input.data<T>(),
in_row, in_row,
in_col, in_col,
out0_col, dev_outs_col_data,
outputs_data[0], static_cast<int>(outputs_cols_num),
outputs_data[1]); dev_out_gpu_data);
} else if (o_num == 3) {
SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(),
in_row,
in_col,
out0_col,
outputs_data[0],
outputs_data[1],
outputs_data[2]);
} else if (o_num == 4) {
SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(),
in_row,
in_col,
out0_col,
outputs_data[0],
outputs_data[1],
outputs_data[2],
outputs_data[3]);
} else {
SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
} }
} else {
auto tmp_dev_ins_col_data =
// TODO(chentianyu03): try to find a method to remove the Alloc function
paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
outputs_cols, outputs_cols_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
phi::CPUPlace(),
restored,
outputs_cols_num * sizeof(int64_t),
context.stream());
int64_t* dev_outs_col_data =
reinterpret_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
SplitKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(),
in_row,
in_col,
dev_outs_col_data,
static_cast<int>(outputs_cols_num),
dev_out_gpu_data);
}
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
// Prevent the pinned memory value from being covered and release the memory // Prevent the pinned memory value from being covered and release the memory
// after the launch kernel of the stream is executed (reapply pinned memory // after the launch kernel of the stream is executed (reapply pinned memory
// next time) // next time)
auto* data_alloc_released = data_alloc.release(); auto* data_alloc_released = data_alloc.release();
auto* cols_alloc_released = cols_alloc.release(); auto* cols_alloc_released = cols_alloc.release();
context.AddStreamCallback([data_alloc_released, cols_alloc_released] { context.AddStreamCallback([data_alloc_released, cols_alloc_released] {
paddle::memory::allocation::Allocator::AllocationDeleter( paddle::memory::allocation::Allocator::AllocationDeleter(
data_alloc_released); data_alloc_released);
paddle::memory::allocation::Allocator::AllocationDeleter( paddle::memory::allocation::Allocator::AllocationDeleter(
cols_alloc_released); cols_alloc_released);
}); });
#endif #endif
} }
};
#define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<phi::GPUContext, type>; \
template class SplitFunctor<phi::GPUContext, type>
FOR_ALL_TYPES(DEFINE_FUNCTOR);
} // namespace funcs
} // namespace phi } // namespace phi
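
Reviewer note (not part of the diff): the GPU functors above keep the original dispatch logic:

// has_same_shape && 2 <= n <= 4 : call the specialized ConcatKernel_/SplitKernel_
//                                 overloads that take the 2-4 pointers directly.
// otherwise                     : stage the host array of data pointers (and, for
//                                 ragged shapes, the column offsets) into device
//                                 memory via paddle::memory::Copy, then launch the
//                                 generic kernel that indexes into those arrays.
// On HIP builds the host staging buffers are CUDAPinned allocations and are
// released in an AddStreamCallback after the kernel launch.
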
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/utils/data_type.h"
namespace phi {
namespace funcs {
/*
* \brief Concatenate the input tensors along the dimension axis.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input[0] = [[1,2],[3,4]]
* Input[1] = [[5,6]]
* axis = 0
*
* Output = [[1,2],
* [3,4],
* [5,6]]
*/
template <typename Context, typename T>
struct ConcatFunctor {
void operator()(const Context& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output);
};
/*
* \brief Split the input tensors along the dimension axis into outputs.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input = [[1,2],
* [3,4],
* [5,6]]
* axis = 0
*
* Output[0] = [[1,2],[3,4]]
* Output[1] = [[5,6]]
*/
template <typename Context, typename T>
class SplitFunctor {
public:
void operator()(const Context& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
int axis,
std::vector<phi::DenseTensor*>* outputs);
};
} // namespace funcs
} // namespace phi
#define FOR_ALL_TYPES(macro) \
macro(int); \
macro(float); \
macro(double); \
macro(bool); \
macro(int64_t); \
macro(int16_t); \
macro(uint8_t); \
macro(int8_t); \
macro(phi::dtype::float16); \
macro(phi::dtype::bfloat16); \
macro(phi::dtype::complex<float>); \
macro(phi::dtype::complex<double>);
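
Reviewer note (not part of the diff): the FOR_ALL_TYPES macro above pairs with DEFINE_FUNCTOR in concat_and_split_functor.cc / .cu to explicitly instantiate the functor templates, so translation units that only include the header still link against them. For example, DEFINE_FUNCTOR(float) on the CPU side expands to:

// template class ConcatFunctor<phi::CPUContext, float>;
// template class SplitFunctor<phi::CPUContext, float>;
// FOR_ALL_TYPES(DEFINE_FUNCTOR) repeats this for int, double, bool, int64_t,
// int16_t, uint8_t, int8_t, float16, bfloat16 and complex<float>/<double>;
// the .cu file does the same with phi::GPUContext.
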
...@@ -22,8 +22,8 @@ ...@@ -22,8 +22,8 @@
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h" #include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/gpu/concat_and_split.h"
namespace phi { namespace phi {
...@@ -104,7 +104,8 @@ void ConcatKernel(const Context& dev_ctx, ...@@ -104,7 +104,8 @@ void ConcatKernel(const Context& dev_ctx,
continue; continue;
} }
} }
ConcatImpl<T, Context>(dev_ctx, inputs, axis, out); phi::funcs::ConcatFunctor<Context, T> functor;
functor(dev_ctx, inputs, axis, out);
} }
} }
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/concat_and_split.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
...@@ -53,7 +53,8 @@ void SplitKernel(const Context& dev_ctx, ...@@ -53,7 +53,8 @@ void SplitKernel(const Context& dev_ctx,
paddle::operators::StridedMemcpyWithAxis0<T>( paddle::operators::StridedMemcpyWithAxis0<T>(
dev_ctx, x, shape_refer, &outs); dev_ctx, x, shape_refer, &outs);
} else { } else {
SplitImpl<T, Context>(dev_ctx, x, shape_refer, axis, &outs); phi::funcs::SplitFunctor<Context, T> functor;
functor(dev_ctx, x, shape_refer, axis, &outs);
} }
} }
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unbind_kernel_impl.h"
#include "paddle/phi/kernels/unbind_kernel.h"
PD_REGISTER_KERNEL(unbind,
GPU,
ALL_LAYOUT,
phi::UnbindKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/unbind_kernel.h"
namespace phi {
template <typename T, typename Context>
void UnbindKernel(const Context& ctx,
const DenseTensor& x,
int axis,
std::vector<DenseTensor*> outs) {
auto x_dims = x.dims();
axis = axis < 0 ? x_dims.size() + axis : axis;
std::vector<const DenseTensor*> shape_refer;
for (size_t j = 0; j < outs.size(); ++j) {
ctx.template Alloc<T>(outs[j]);
shape_refer.emplace_back(outs[j]);
}
phi::funcs::SplitFunctor<Context, T> functor;
functor(ctx, x, shape_refer, axis, &outs);
}
} // namespace phi
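
Reviewer note (not part of the diff): UnbindKernel above normalizes a negative axis, allocates each output, and reuses SplitFunctor with the outputs themselves as the shape references. A minimal call sketch, assuming dev_ctx is an initialized phi::CPUContext, x is a [2, 3, 4] DenseTensor, and the three output tensors already carry the dims produced by UnbindInferMeta:

// std::vector<phi::DenseTensor*> outs = {&out0, &out1, &out2};
// phi::UnbindKernel<float, phi::CPUContext>(dev_ctx, x, /*axis=*/1, outs);
// each outs[i] now holds a [2, 4] slice of x taken along axis 1.
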
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
/*
* All tensors' dimension should be the same and the values of
* each dimension must be the same, except the axis dimension.
*/
template <typename T, typename Context>
void UnbindKernel(const Context& ctx,
const DenseTensor& x,
int axis,
std::vector<DenseTensor*> outs);
} // namespace phi