roll optimize (#32880)

3fc56aa0 · sunli · GitHub · 07fadc4e · 3fc56aa0 · 3fc56aa0
5 changed file
--- a/paddle/fluid/operators/roll_op.cc
+++ b/paddle/fluid/operators/roll_op.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/operators/roll_op.h"
 #include <memory>
 #include <vector>
 #include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
@@ -37,12 +39,22 @@ class RollOp : public framework::OperatorWithKernel {
    auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
    auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
+    if (dims.size() != 0) {
      PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
                        platform::errors::InvalidArgument(
-                          "Attr(dims).size() should be equl to "
+                            "When dims.size() != 0, dims.size() "
-                          "Attr(shifts).size(). But received "
+                            "should be equal to "
-                          "Attr(dims).size() = %d, Attr(shifts).size() = %d",
+                            "shifts.size(). But received "
+                            "dims.size() = %d, shifts.size() = %d",
                            dims.size(), shifts.size()));
+    } else {
+      PADDLE_ENFORCE_EQ(shifts.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "When dims.size() == 0, shifts.size() "
+                            "should be equal to 1, But received "
+                            "shifts.size() = %d",
+                            shifts.size()));
+    }
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    auto type = ctx->GetInputsVarType("X")[0];
@@ -95,7 +107,7 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<std::vector<int64_t>>(
        "axis",
        "Axis along which to roll. It must have the same size "
-        "with shifts.")
+        "with shifts or size == 0")
        .SetDefault({});
    AddComment(R"DOC(
    Roll the tensor along the given dimension(s). 
@@ -151,8 +163,9 @@ REGISTER_OP_VERSION(roll)
        paddle::framework::compatible::OpVersionDesc()
            .NewAttr("axis",
                     "(std::vector<int64_t>) Axis along which to roll. "
-                     "It must have the same size with shifts.",
+                     "It must have the same size with shifts, or size = 0.",
                     std::vector<int64_t>())
-            .DeleteAttr("dims",
+            .DeleteAttr(
+                "dims",
                "(std::vector<int64_t>) Dims along which to roll. "
-                        "It must have the same size with shifts."));
+                "It must have the same size with shifts, or size = 0."));
--- a/paddle/fluid/operators/roll_op.cu
+++ b/paddle/fluid/operators/roll_op.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
+#include "paddle/fluid/framework/array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/roll_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -24,26 +25,31 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-template <typename T>
+template <typename T, size_t Rank>
-__global__ void roll_cuda_kernel(const T* input, T* output, int64_t N,
+__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
-                                 int64_t* shifts, int64_t* strides,
+                               paddle::framework::Array<int64_t, Rank> shifts,
-                                 int64_t* sizes, int64_t nums) {
+                               paddle::framework::Array<int64_t, Rank> strides,
+                               paddle::framework::Array<int64_t, Rank> sizes) {
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= N) {
    return;
  }
  int64_t output_idx = idx;
  int64_t dim_idx, dim_idx_shift;
-  for (int64_t i = 0; i < nums; i++) {
-    dim_idx = idx % (strides[i] * sizes[i]) / strides[i];
+#pragma unroll Rank
+  for (size_t i = 0; i < Rank; i++) {
+    dim_idx = (idx / strides[i]) % sizes[i];
    dim_idx_shift = (dim_idx + shifts[i]) % sizes[i];
    output_idx = output_idx + (dim_idx_shift - dim_idx) * strides[i];
  }
  output[output_idx] = input[idx];
 }
-template <typename DeviceContext, typename T>
+template <typename T>
-class RollCUDAKernel : public framework::OpKernel<T> {
+class RollKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>("X");
@@ -61,50 +67,62 @@ class RollCUDAKernel : public framework::OpKernel<T> {
    auto input_dim = in->dims();
    auto stride_dim = framework::stride(input_dim);
-    int64_t dim, size;
+    std::vector<int64_t> strides(nums), sizes(nums);
-    size_t gpu_memory_size_ = sizeof(int64_t) * nums;
+    if (dims.size() == 0) {
-    std::vector<int64_t> strides, sizes;
+      strides[0] = 1;
-    strides.resize(nums);
+      sizes[0] = numel;
-    sizes.resize(nums);
+      shifts[0] = (shifts[0] % numel + numel) % numel;
-    paddle::memory::AllocationPtr shifts_gpu =
+    } else {
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    paddle::memory::AllocationPtr strides_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    paddle::memory::AllocationPtr sizes_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
      for (size_t i = 0; i < nums; i++) {
-      dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
+        int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
-      size = input_dim[dim];
+        int64_t size = input_dim[dim];
        shifts[i] = (shifts[i] % size + size) % size;
        strides[i] = stride_dim[dim];
        sizes[i] = size;
      }
-    paddle::memory::Copy(
+    }
-        BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()),
-        shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(),
+#define CALL_ROLL_CUDA_KERNEL(N)                                               \
-        gpu_memory_size_, stream);
+  case N: {                                                                    \
-    paddle::memory::Copy(
+    paddle::framework::Array<int64_t, N> _strides;                             \
-        BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()),
+    paddle::framework::Array<int64_t, N> _shifts;                              \
-        strides_gpu->ptr(), platform::CPUPlace(), strides.data(),
+    paddle::framework::Array<int64_t, N> _sizes;                               \
-        gpu_memory_size_, stream);
+    for (size_t idx = 0; idx < N; ++idx) {                                     \
-    paddle::memory::Copy(
+      _strides[idx] = strides[idx];                                            \
-        BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()),
+      _shifts[idx] = shifts[idx];                                              \
-        sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(), gpu_memory_size_,
+      _sizes[idx] = sizes[idx];                                                \
-        stream);
+    }                                                                          \
-    int64_t* shifts_ptr = reinterpret_cast<int64_t*>(shifts_gpu->ptr());
+    RollCudaKernel<                                                            \
-    int64_t* strides_ptr = reinterpret_cast<int64_t*>(strides_gpu->ptr());
+        T,                                                                     \
-    int64_t* sizes_ptr = reinterpret_cast<int64_t*>(sizes_gpu->ptr());
+        N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,  \
+             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel,   \
-    roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
+                                                   _shifts, _strides, _sizes); \
-                           PADDLE_CUDA_NUM_THREADS,
+    break;                                                                     \
-                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+  }
-        in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums);
+    switch (nums) {
+      CALL_ROLL_CUDA_KERNEL(1);
+      CALL_ROLL_CUDA_KERNEL(2);
+      CALL_ROLL_CUDA_KERNEL(3);
+      CALL_ROLL_CUDA_KERNEL(4);
+      CALL_ROLL_CUDA_KERNEL(5);
+      CALL_ROLL_CUDA_KERNEL(6);
+      CALL_ROLL_CUDA_KERNEL(7);
+      CALL_ROLL_CUDA_KERNEL(8);
+      CALL_ROLL_CUDA_KERNEL(9);
+      default:
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "shifts.size() should be less than 10, But received shifts.size() "
+            "= %d",
+            shifts.size()));
+    }
  }
 };
-template <typename DeviceContext, typename T>
+template <typename T>
-class RollGradCUDAKernel : public framework::OpKernel<T> {
+class RollGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
@@ -121,46 +139,38 @@ class RollGradCUDAKernel : public framework::OpKernel<T> {
    auto input_dim = in->dims();
    auto stride_dim = framework::stride(input_dim);
-    int64_t dim, size;
+    std::vector<int64_t> strides(nums), sizes(nums);
-    size_t gpu_memory_size_ = sizeof(int64_t) * nums;
+    if (dims.size() == 0) {
-    std::vector<int64_t> strides, sizes;
+      strides[0] = 1;
-    strides.resize(nums);
+      sizes[0] = numel;
-    sizes.resize(nums);
+      shifts[0] = ((-shifts[0]) % numel + numel) % numel;
-    paddle::memory::AllocationPtr shifts_gpu =
+    } else {
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    paddle::memory::AllocationPtr strides_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    paddle::memory::AllocationPtr sizes_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
      for (size_t i = 0; i < nums; i++) {
-      dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
+        int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
-      size = input_dim[dim];
+        int64_t size = input_dim[dim];
-      shifts[i] = ((0 - shifts[i]) % size + size) % size;
+        shifts[i] = ((-shifts[i]) % size + size) % size;
        strides[i] = stride_dim[dim];
        sizes[i] = size;
      }
+    }
-    paddle::memory::Copy(
+    switch (nums) {
-        BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()),
+      CALL_ROLL_CUDA_KERNEL(1);
-        shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(),
+      CALL_ROLL_CUDA_KERNEL(2);
-        gpu_memory_size_, stream);
+      CALL_ROLL_CUDA_KERNEL(3);
-    paddle::memory::Copy(
+      CALL_ROLL_CUDA_KERNEL(4);
-        BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()),
+      CALL_ROLL_CUDA_KERNEL(5);
-        strides_gpu->ptr(), platform::CPUPlace(), strides.data(),
+      CALL_ROLL_CUDA_KERNEL(6);
-        gpu_memory_size_, stream);
+      CALL_ROLL_CUDA_KERNEL(7);
-    paddle::memory::Copy(
+      CALL_ROLL_CUDA_KERNEL(8);
-        BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()),
+      CALL_ROLL_CUDA_KERNEL(9);
-        sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(), gpu_memory_size_,
+      default:
-        stream);
+        PADDLE_THROW(platform::errors::InvalidArgument(
-    int64_t* shifts_ptr = reinterpret_cast<int64_t*>(shifts_gpu->ptr());
+            "shifts.size() should be less than 10, But received shifts.size() "
-    int64_t* strides_ptr = reinterpret_cast<int64_t*>(strides_gpu->ptr());
+            "= %d",
-    int64_t* sizes_ptr = reinterpret_cast<int64_t*>(sizes_gpu->ptr());
+            shifts.size()));
+    }
-    roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums);
  }
 };
@@ -169,13 +179,12 @@ class RollGradCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    roll, ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
-    roll_grad,
+    roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
-    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/roll_op.h
+++ b/paddle/fluid/operators/roll_op.h
@@ -88,7 +88,13 @@ class RollKernel : public framework::OpKernel<T> {
    TensorToVector(input, context.device_context(), &out_vec);
    size_t nums = shifts.size();
-    const DDim input_dim = input.dims();
+    DDim input_dim = input.dims();
+    // axis = none, reshape to 1-D tensor
+    if (dims.size() == 0) {
+      dims.push_back(0l);
+      input_dim = framework::Dim<1>(out_vec.size());
+    }
    for (size_t i = 0; i < nums; i++) {
      PADDLE_ENFORCE_EQ(
@@ -101,7 +107,7 @@ class RollKernel : public framework::OpKernel<T> {
    }
    output->mutable_data<T>(context.GetPlace());
    framework::TensorFromVector(out_vec, context.device_context(), output);
-    output->Resize(input_dim);
+    output->Resize(input.dims());
  }
 };
@@ -120,14 +126,20 @@ class RollGradKernel : public framework::OpKernel<T> {
    TensorToVector(input, context.device_context(), &out_vec);
    size_t nums = shifts.size();
-    const DDim input_dim = input.dims();
+    DDim input_dim = input.dims();
+    // axis = none, reshape to 1-D tensor
+    if (dims.size() == 0) {
+      dims.push_back(0l);
+      input_dim = framework::Dim<1>(out_vec.size());
+    }
    for (size_t i = 0; i < nums; i++) {
      shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]);
    }
    output->mutable_data<T>(context.GetPlace());
    framework::TensorFromVector(out_vec, context.device_context(), output);
-    output->Resize(input_dim);
+    output->Resize(input.dims());
  }
 };

--- a/python/paddle/fluid/tests/unittests/test_roll_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roll_op.py
@@ -63,6 +63,7 @@ class TestRollAPI(unittest.TestCase):
    def test_roll_op_api(self):
        self.input_data()
+        paddle.enable_static()
        # case 1:
        with program_guard(Program(), Program()):
            x = fluid.layers.data(name='x', shape=[-1, 3])

--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -459,28 +459,22 @@ def roll(x, shifts, axis=None, name=None):
    if axis:
        check_type(axis, 'axis', (list, tuple), 'roll')
+    else:
+        axis = []
    check_type(shifts, 'shifts', (list, tuple), 'roll')
    if in_dygraph_mode():
-        if axis is None:
+        return core.ops.roll(x, 'axis', axis, 'shifts', shifts)
-            x = core.ops.reshape(x, 'shape', [-1, 1])
-            axis = [0]
-        out = core.ops.roll(x, 'axis', axis, 'shifts', shifts)
-        return core.ops.reshape(out, 'shape', origin_shape)
    out = helper.create_variable_for_type_inference(x.dtype)
-    if axis is None:
-        x = reshape(x, shape=[-1, 1])
-        axis = [0]
    helper.append_op(
        type='roll',
        inputs={'X': x},
        outputs={'Out': out},
        attrs={'axis': axis,
               'shifts': shifts})
-    out = layers.reshape(out, shape=origin_shape)
    return out