Unverified commit 44d46d03, authored by chenenquan, committed by GitHub

[PHI] Migrate roll op (#40257)

* [PHI] Migrate roll op

* [phi] Migrate eigh op to phi (#40213)

* migrate eigh to phi

* optimize code

* modify code according to comment

* conflict resolution

* [PHI] Migrate roll op

* [PHI] Fix coverage of roll_sig

* [PHI] Fix infermeta of roll_sig

* [Phi] Fix unittest coverage of roll op

* [PHI] Fix infermeta in unary

* [PHI] Fix parameter type of roll op

* [PHI] Fix parameter type of roll op

* [PHI] Fix parameter of roll op
Co-authored-by: Ncrystal <62974595+Zjq9409@users.noreply.github.com>
Parent 99452af7
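For context (not part of this diff): roll circularly shifts tensor elements along the given axes, so an element at position i along a rolled axis of length n moves to position (i + shift) mod n. A minimal, self-contained C++ sketch of the 1-D semantics (illustrative names only, not Paddle API):

// Illustrative only: the 1-D semantics of the roll op.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int> Roll1D(const std::vector<int>& x, int64_t shift) {
  const int64_t n = static_cast<int64_t>(x.size());
  std::vector<int> out(x.size());
  shift = ((shift % n) + n) % n;  // normalize negative shifts into [0, n)
  for (int64_t i = 0; i < n; ++i) {
    out[(i + shift) % n] = x[i];
  }
  return out;
}

int main() {
  std::vector<int> x = {1, 2, 3, 4, 5};
  for (int v : Roll1D(x, 2)) std::printf("%d ", v);   // prints: 4 5 1 2 3
  std::printf("\n");
  for (int v : Roll1D(x, -1)) std::printf("%d ", v);  // prints: 2 3 4 5 1
  std::printf("\n");
  return 0;
}

The multi-dimensional kernels in the diff apply this mapping independently along each listed axis.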
@@ -12,13 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/roll_op.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
@@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of RollOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of RollOp should not be null."));
auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
if (!ctx->HasInput("ShiftsTensor")) {
if (dims.size() != 0) {
PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
platform::errors::InvalidArgument(
"When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
dims.size(), shifts.size()));
} else {
PADDLE_ENFORCE_EQ(shifts.size(), 1,
platform::errors::InvalidArgument(
"When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts.size()));
}
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
@@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor,
PD_INFER_META(phi::RollInferMeta));
REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker,
ops::RollGradMaker<paddle::framework::OpDesc>,
ops::RollGradMaker<paddle::imperative::OpBase>);
ops::RollGradMaker<paddle::imperative::OpBase>,
RollInferShapeFunctor);
REGISTER_OPERATOR(roll_grad, ops::RollGradOp,
ops::RollGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
roll, ops::RollKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_VERSION(roll)
.AddCheckpoint(
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/utils/array.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
phi::Array<int64_t, Rank> shifts,
phi::Array<int64_t, Rank> strides,
phi::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t new_dim_idx = 0;
#pragma unroll
for (size_t i = 0; i < Rank; i++) {
new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
if (new_dim_idx >= sizes[i]) {
output_idx += (shifts[i] - sizes[i]) * strides[i];
} else {
output_idx += shifts[i] * strides[i];
}
}
output[output_idx] = input[idx];
}
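The kernel above remaps each flat input index to its flat output index: for every rolled dimension it recovers the coordinate from the stride, adds the pre-normalized non-negative shift, and subtracts the dimension size once when the coordinate wraps. A host-side sanity sketch of the same arithmetic (illustrative, not part of the commit):

// Host-side check of the flat-index remapping done by RollCudaKernel,
// for a 2x3 row-major array rolled by shifts {1, 2} along dims {0, 1}.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t sizes[2] = {2, 3};
  const int64_t strides[2] = {3, 1};  // row-major strides of a 2x3 array
  const int64_t shifts[2] = {1, 2};   // already normalized to [0, size)
  std::vector<int> in = {0, 1, 2, 3, 4, 5};
  std::vector<int> out(in.size());
  for (int64_t idx = 0; idx < static_cast<int64_t>(in.size()); ++idx) {
    int64_t output_idx = idx;
    for (int i = 0; i < 2; ++i) {
      // Coordinate along dim i plus the shift; wrap if it leaves the dim.
      int64_t new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
      output_idx +=
          (new_dim_idx >= sizes[i] ? shifts[i] - sizes[i] : shifts[i]) *
          strides[i];
    }
    out[output_idx] = in[idx];
  }
  for (int v : out) std::printf("%d ", v);  // prints: 4 5 3 1 2 0
  std::printf("\n");
  return 0;
}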
template <typename T>
class RollKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = (shifts[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = (shifts[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
phi::Array<int64_t, N> _strides; \
phi::Array<int64_t, N> _shifts; \
phi::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \
_shifts, _strides, _sizes); \
break; \
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
template <typename T>
class RollGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* out = context.Output<LoDTensor>(framework::GradVarName("X"));
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = ((-shifts[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = ((-shifts[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
@@ -1016,6 +1016,37 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
ReshapeInferMeta(x, shape, out, config);
}
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out) {
auto shifts_data = shifts.GetData();
if (axis.size() != 0) {
PADDLE_ENFORCE_EQ(
axis.size(),
shifts_data.size(),
phi::errors::InvalidArgument("When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
axis.size(),
shifts_data.size()));
} else {
PADDLE_ENFORCE_EQ(
shifts_data.size(),
1,
phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts_data.size()));
}
out->set_dims(x.dims());
out->share_lod(x);
out->set_dtype(x.dtype());
}
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) {
auto in_dim = input.dims();
out->set_dims(phi::make_ddim({in_dim.size()}));
......
@@ -164,6 +164,11 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out);
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out);
void ShardIndexInferMeta(const MetaTensor& in,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_grad_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/roll_kernel_impl.h"
namespace phi {
template <typename T, typename Context>
void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* x_grad) {
std::vector<T> out_vec;
paddle::framework::TensorToVector(out_grad, dev_ctx, &out_vec);
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
DDim input_dim = out_grad.dims();
auto dims = axis;
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = phi::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
ShiftAlongDim(out_vec.data(), input_dim, dims[i], 0 - shifts_data[i]);
}
dev_ctx.template Alloc<T>(x_grad);
paddle::framework::TensorFromVector(out_vec, dev_ctx, x_grad);
x_grad->Resize(out_grad.dims());
}
} // namespace phi
PD_REGISTER_KERNEL(roll_grad,
CPU,
ALL_LAYOUT,
phi::RollGradKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
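Editorial note, not from the diff: RollGradKernel reuses the forward shifting with negated shifts, since rolling by -s exactly undoes a roll by s, and the ((x % n) + n) % n normalization keeps the negated shift inside [0, n). A small sketch of that identity:

// Illustrative only: forward and backward shifts cancel modulo the dim size.
#include <cstdint>
#include <cstdio>

int64_t Normalize(int64_t s, int64_t n) { return ((s % n) + n) % n; }

int main() {
  const int64_t n = 5;
  for (int64_t s : {3, -2, 7}) {
    int64_t fwd = Normalize(s, n);
    int64_t bwd = Normalize(-s, n);
    // fwd + bwd is 0 mod n, so applying both shifts restores every index.
    std::printf("s=%lld fwd=%lld bwd=%lld (fwd+bwd)%%n=%lld\n",
                static_cast<long long>(s), static_cast<long long>(fwd),
                static_cast<long long>(bwd),
                static_cast<long long>((fwd + bwd) % n));
  }
  return 0;
}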
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/roll_kernel_impl.h"
namespace phi {
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out) {
std::vector<T> out_vec;
paddle::framework::TensorToVector(x, dev_ctx, &out_vec);
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
DDim input_dim = x.dims();
auto dims = axis;
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = phi::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
PADDLE_ENFORCE_EQ(
dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(axis[%d]) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
i,
input_dim.size(),
input_dim.size() - 1,
i,
dims[i]));
ShiftAlongDim(out_vec.data(), input_dim, dims[i], shifts_data[i]);
}
dev_ctx.template Alloc<T>(out);
paddle::framework::TensorFromVector(out_vec, dev_ctx, out);
out->Resize(x.dims());
}
} // namespace phi
PD_REGISTER_KERNEL(roll,
CPU,
ALL_LAYOUT,
phi::RollKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -13,21 +13,16 @@
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DDim = framework::DDim;
namespace phi {
template <typename T>
inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim,
inline void ShiftAlongDim(T* data,
const DDim& input_dim,
int64_t dim,
int64_t shift) {
if (dim < 0) {
dim += input_dim.size();
@@ -78,92 +73,4 @@ inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim,
}
}
template <typename DeviceContext, typename T>
class RollKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input_var = context.InputVar("X");
auto* output_var = context.OutputVar("Out");
auto& input = input_var->Get<LoDTensor>();
auto* output = output_var->GetMutable<LoDTensor>();
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
std::vector<T> out_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&out_vec);
size_t nums = shifts.size();
DDim input_dim = input.dims();
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = framework::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
PADDLE_ENFORCE_EQ(
dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true,
platform::errors::OutOfRange(
"Attr(axis[%d]) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
i, input_dim.size(), input_dim.size() - 1, i, dims[i]));
shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]);
}
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(out_vec, context.device_context(), output);
output->Resize(input.dims());
}
};
template <typename DeviceContext, typename T>
class RollGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input_var = context.InputVar(framework::GradVarName("Out"));
auto* output_var = context.OutputVar(framework::GradVarName("X"));
auto& input = input_var->Get<LoDTensor>();
auto* output = output_var->GetMutable<LoDTensor>();
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
std::vector<T> out_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&out_vec);
size_t nums = shifts.size();
DDim input_dim = input.dims();
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = framework::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]);
}
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(out_vec, context.device_context(), output);
output->Resize(input.dims());
}
};
} // namespace operators
} // namespace paddle
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_grad_kernel.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename Context>
void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* x_grad) {
auto* in_data = out_grad.data<T>();
T* out_data = dev_ctx.template Alloc<T>(x_grad);
int64_t numel = out_grad.numel();
auto stream = dev_ctx.stream();
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
auto input_dim = out_grad.dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (axis.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts_data[i] = ((-shifts_data[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts_data.size()));
}
}
} // namespace phi
PD_REGISTER_KERNEL(roll_grad,
GPU,
ALL_LAYOUT,
phi::RollGradKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roll_kernel.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/array.h"
#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out) {
auto* in_data = x.data<T>();
T* out_data = dev_ctx.template Alloc<T>(out);
int64_t numel = x.numel();
auto stream = dev_ctx.stream();
auto shifts_data = shifts.GetData();
size_t nums = shifts_data.size();
auto input_dim = x.dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (axis.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts_data[0] = (shifts_data[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts_data[i] = (shifts_data[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts_data.size()));
}
}
} // namespace phi
PD_REGISTER_KERNEL(roll,
GPU,
ALL_LAYOUT,
phi::RollKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/utils/array.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace phi {
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input,
T* output,
int64_t N,
phi::Array<int64_t, Rank> shifts,
phi::Array<int64_t, Rank> strides,
phi::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t new_dim_idx = 0;
#pragma unroll
for (size_t i = 0; i < Rank; i++) {
new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
if (new_dim_idx >= sizes[i]) {
output_idx += (shifts[i] - sizes[i]) * strides[i];
} else {
output_idx += shifts[i] * strides[i];
}
}
output[output_idx] = input[idx];
}
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
phi::Array<int64_t, N> _strides; \
phi::Array<int64_t, N> _shifts; \
phi::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts_data[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, \
0, \
stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes); \
break; \
}
} // namespace phi
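The CALL_ROLL_CUDA_KERNEL macro above copies the run-time strides, shifts, and sizes into fixed-size phi::Array objects and dispatches the run-time rank to a compile-time template parameter, so the kernel's per-dimension loop can be unrolled. A host-side sketch of the same dispatch pattern (illustrative names only, not Paddle API):

// Illustrative only: mapping a run-time rank to a compile-time template
// argument via a switch, with a fixed-size copy of the run-time vector.
#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

template <size_t Rank>
void PrintFixed(const std::vector<int64_t>& v) {
  std::array<int64_t, Rank> fixed{};  // fixed-size copy, like phi::Array
  for (size_t i = 0; i < Rank; ++i) fixed[i] = v[i];
  std::printf("rank=%zu first=%lld\n", Rank,
              static_cast<long long>(fixed[0]));
}

int main() {
  std::vector<int64_t> strides = {6, 3, 1};
  switch (strides.size()) {  // run-time rank selects the template instance
    case 1: PrintFixed<1>(strides); break;
    case 2: PrintFixed<2>(strides); break;
    case 3: PrintFixed<3>(strides); break;
    default: std::printf("unsupported rank\n");
  }
  return 0;
}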
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void RollGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void RollKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.HasInput("ShiftsTensor")) {
return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"});
}
return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"});
}
KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("roll_grad",
{"X", GradVarName("Out")},
{"shifts", "axis"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping);