Unverified commit 3fc56aa0, authored by: S sunli, committed by: GitHub

roll optimize (#32880)

Parent: 07fadc4e
@@ -13,8 +13,10 @@
// limitations under the License.
#include "paddle/fluid/operators/roll_op.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
@@ -37,12 +39,22 @@ class RollOp : public framework::OperatorWithKernel {
auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
platform::errors::InvalidArgument(
"Attr(dims).size() should be equl to "
"Attr(shifts).size(). But received "
"Attr(dims).size() = %d, Attr(shifts).size() = %d",
dims.size(), shifts.size()));
if (dims.size() != 0) {
PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
platform::errors::InvalidArgument(
"When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
dims.size(), shifts.size()));
} else {
PADDLE_ENFORCE_EQ(shifts.size(), 1,
platform::errors::InvalidArgument(
"When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts.size()));
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
auto type = ctx->GetInputsVarType("X")[0];
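With the relaxed check, axis may now be empty; in that case exactly one shift is expected and the roll is applied to the flattened tensor. A minimal NumPy sketch of the intended semantics (illustrative only, not the Paddle implementation; roll_like is a hypothetical helper):

import numpy as np

def roll_like(x, shifts, axis=None):
    if axis:
        # axis given: one shift per axis, matching the first branch of the check
        assert len(axis) == len(shifts)
        return np.roll(x, shifts, axis=tuple(axis))
    # axis omitted/empty: a single shift over the flattened tensor,
    # then the original shape is restored
    assert len(shifts) == 1
    return np.roll(x.reshape(-1), shifts[0]).reshape(x.shape)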
@@ -95,7 +107,7 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::vector<int64_t>>(
"axis",
"Axis along which to roll. It must have the same size "
"with shifts.")
"with shifts or size == 0")
.SetDefault({});
AddComment(R"DOC(
Roll the tensor along the given dimension(s).
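Elements shifted past the last position wrap around to the first, and negative shifts roll toward the front. A tiny NumPy illustration of that behavior:

import numpy as np

x = np.array([1, 2, 3, 4, 5])
print(np.roll(x, 2))    # [4 5 1 2 3] - the last two elements wrap to the front
print(np.roll(x, -1))   # [2 3 4 5 1] - a negative shift rolls toward the front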
@@ -151,8 +163,9 @@ REGISTER_OP_VERSION(roll)
paddle::framework::compatible::OpVersionDesc()
.NewAttr("axis",
"(std::vector<int64_t>) Axis along which to roll. "
"It must have the same size with shifts.",
"It must have the same size with shifts, or size = 0.",
std::vector<int64_t>())
.DeleteAttr("dims",
"(std::vector<int64_t>) Dims along which to roll. "
"It must have the same size with shifts."));
.DeleteAttr(
"dims",
"(std::vector<int64_t>) Dims along which to roll. "
"It must have the same size with shifts, or size = 0."));
@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
@@ -24,26 +25,31 @@ using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T>
__global__ void roll_cuda_kernel(const T* input, T* output, int64_t N,
int64_t* shifts, int64_t* strides,
int64_t* sizes, int64_t nums) {
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
paddle::framework::Array<int64_t, Rank> shifts,
paddle::framework::Array<int64_t, Rank> strides,
paddle::framework::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t dim_idx, dim_idx_shift;
for (int64_t i = 0; i < nums; i++) {
dim_idx = idx % (strides[i] * sizes[i]) / strides[i];
#pragma unroll Rank
for (size_t i = 0; i < Rank; i++) {
dim_idx = (idx / strides[i]) % sizes[i];
dim_idx_shift = (dim_idx + shifts[i]) % sizes[i];
output_idx = output_idx + (dim_idx_shift - dim_idx) * strides[i];
}
output[output_idx] = input[idx];
}
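Each thread converts its flat input index into the rolled output index arithmetically: for every rolled dimension it recovers the coordinate as (idx / stride) % size, shifts that coordinate modulo the size, and adds the displacement back in units of the stride. A host-side Python sketch of the same arithmetic (a hypothetical helper, assuming row-major strides):

def rolled_output_index(idx, shifts, strides, sizes):
    # idx is a flat index into the input tensor
    out_idx = idx
    for shift, stride, size in zip(shifts, strides, sizes):
        dim_idx = (idx // stride) % size          # coordinate along this dimension
        dim_idx_shift = (dim_idx + shift) % size  # coordinate after the roll
        out_idx += (dim_idx_shift - dim_idx) * stride
    return out_idx

# Example: a length-3 axis with stride 1, rolled by 1.
# The element at flat index 2 lands at flat index 0.
assert rolled_output_index(2, shifts=[1], strides=[1], sizes=[3]) == 0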
template <typename DeviceContext, typename T>
class RollCUDAKernel : public framework::OpKernel<T> {
template <typename T>
class RollKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
@@ -61,50 +67,62 @@ class RollCUDAKernel : public framework::OpKernel<T> {
auto input_dim = in->dims();
auto stride_dim = framework::stride(input_dim);
int64_t dim, size;
size_t gpu_memory_size_ = sizeof(int64_t) * nums;
std::vector<int64_t> strides, sizes;
strides.resize(nums);
sizes.resize(nums);
paddle::memory::AllocationPtr shifts_gpu =
memory::Alloc(context.GetPlace(), gpu_memory_size_);
paddle::memory::AllocationPtr strides_gpu =
memory::Alloc(context.GetPlace(), gpu_memory_size_);
paddle::memory::AllocationPtr sizes_gpu =
memory::Alloc(context.GetPlace(), gpu_memory_size_);
for (size_t i = 0; i < nums; i++) {
dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
size = input_dim[dim];
shifts[i] = (shifts[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = (shifts[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
shifts[i] = (shifts[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
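The expression (shifts[i] % size + size) % size normalizes any shift, including negative ones, into the range [0, size); the extra "+ size" is needed because the C++ % operator can return a negative value. The same formula in Python (where % is already non-negative, so the result is identical):

size = 5
for shift in (-7, -1, 0, 3, 12):
    print(shift, '->', (shift % size + size) % size)
# -7 -> 3, -1 -> 4, 0 -> 0, 3 -> 3, 12 -> 2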
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
paddle::framework::Array<int64_t, N> _strides; \
paddle::framework::Array<int64_t, N> _shifts; \
paddle::framework::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \
_shifts, _strides, _sizes); \
break; \
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
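Compared with the removed code below, which allocated three device buffers and issued three host-to-device copies for shifts, strides and sizes before every launch, the new path packs those (at most nine) values into framework::Array objects that are passed by value as kernel arguments. The rank therefore becomes a compile-time template parameter dispatched by the switch above, the loop inside the kernel can be unrolled, and the per-call allocations and copies disappear; more than nine rolled dimensions are rejected with an InvalidArgument error.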
paddle::memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()),
shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(),
gpu_memory_size_, stream);
paddle::memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()),
strides_gpu->ptr(), platform::CPUPlace(), strides.data(),
gpu_memory_size_, stream);
paddle::memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()),
sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(), gpu_memory_size_,
stream);
int64_t* shifts_ptr = reinterpret_cast<int64_t*>(shifts_gpu->ptr());
int64_t* strides_ptr = reinterpret_cast<int64_t*>(strides_gpu->ptr());
int64_t* sizes_ptr = reinterpret_cast<int64_t*>(sizes_gpu->ptr());
roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums);
}
};
template <typename DeviceContext, typename T>
class RollGradCUDAKernel : public framework::OpKernel<T> {
template <typename T>
class RollGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
@@ -121,46 +139,38 @@ class RollGradCUDAKernel : public framework::OpKernel<T> {
auto input_dim = in->dims();
auto stride_dim = framework::stride(input_dim);
int64_t dim, size;
size_t gpu_memory_size_ = sizeof(int64_t) * nums;
std::vector<int64_t> strides, sizes;
strides.resize(nums);
sizes.resize(nums);
paddle::memory::AllocationPtr shifts_gpu =
memory::Alloc(context.GetPlace(), gpu_memory_size_);
paddle::memory::AllocationPtr strides_gpu =
memory::Alloc(context.GetPlace(), gpu_memory_size_);
paddle::memory::AllocationPtr sizes_gpu =
memory::Alloc(context.GetPlace(), gpu_memory_size_);
for (size_t i = 0; i < nums; i++) {
dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
size = input_dim[dim];
shifts[i] = ((0 - shifts[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = ((-shifts[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
shifts[i] = ((-shifts[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
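The backward kernel reuses the same launch path but rolls the incoming gradient by the negated shifts, which exactly undoes the forward roll; ((-shifts[i]) % size + size) % size is the same normalization applied to -shift. The identity being relied on, sketched with NumPy:

import numpy as np

x = np.arange(6).reshape(2, 3)
shift, axis = 2, 1
forward = np.roll(x, shift, axis=axis)
backward = np.roll(forward, -shift, axis=axis)   # gradient path: roll by the negated shift
assert (backward == x).all()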
paddle::memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()),
shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(),
gpu_memory_size_, stream);
paddle::memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()),
strides_gpu->ptr(), platform::CPUPlace(), strides.data(),
gpu_memory_size_, stream);
paddle::memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()),
sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(), gpu_memory_size_,
stream);
int64_t* shifts_ptr = reinterpret_cast<int64_t*>(shifts_gpu->ptr());
int64_t* strides_ptr = reinterpret_cast<int64_t*>(strides_gpu->ptr());
int64_t* sizes_ptr = reinterpret_cast<int64_t*>(sizes_gpu->ptr());
roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums);
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
@@ -169,13 +179,12 @@ class RollGradCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
roll, ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
roll_grad,
ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -88,7 +88,13 @@ class RollKernel : public framework::OpKernel<T> {
TensorToVector(input, context.device_context(), &out_vec);
size_t nums = shifts.size();
const DDim input_dim = input.dims();
DDim input_dim = input.dims();
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = framework::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
PADDLE_ENFORCE_EQ(
@@ -101,7 +107,7 @@ class RollKernel : public framework::OpKernel<T> {
}
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(out_vec, context.device_context(), output);
output->Resize(input_dim);
output->Resize(input.dims());
}
};
@@ -120,14 +126,20 @@ class RollGradKernel : public framework::OpKernel<T> {
TensorToVector(input, context.device_context(), &out_vec);
size_t nums = shifts.size();
const DDim input_dim = input.dims();
DDim input_dim = input.dims();
// axis = none, reshape to 1-D tensor
if (dims.size() == 0) {
dims.push_back(0l);
input_dim = framework::Dim<1>(out_vec.size());
}
for (size_t i = 0; i < nums; i++) {
shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]);
}
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(out_vec, context.device_context(), output);
output->Resize(input_dim);
output->Resize(input.dims());
}
};
......
@@ -63,6 +63,7 @@ class TestRollAPI(unittest.TestCase):
def test_roll_op_api(self):
self.input_data()
paddle.enable_static()
# case 1:
with program_guard(Program(), Program()):
x = fluid.layers.data(name='x', shape=[-1, 3])
......
@@ -459,28 +459,22 @@ def roll(x, shifts, axis=None, name=None):
if axis:
check_type(axis, 'axis', (list, tuple), 'roll')
else:
axis = []
check_type(shifts, 'shifts', (list, tuple), 'roll')
if in_dygraph_mode():
if axis is None:
x = core.ops.reshape(x, 'shape', [-1, 1])
axis = [0]
out = core.ops.roll(x, 'axis', axis, 'shifts', shifts)
return core.ops.reshape(out, 'shape', origin_shape)
return core.ops.roll(x, 'axis', axis, 'shifts', shifts)
out = helper.create_variable_for_type_inference(x.dtype)
if axis is None:
x = reshape(x, shape=[-1, 1])
axis = [0]
helper.append_op(
type='roll',
inputs={'X': x},
outputs={'Out': out},
attrs={'axis': axis,
'shifts': shifts})
out = layers.reshape(out, shape=origin_shape)
return out
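With this change, axis may be omitted at the Python level and is passed through as an empty list: the op itself flattens the tensor, applies the single shift, and keeps the original shape, so the wrapper no longer reshapes to [-1, 1] around the call. A usage sketch under that assumption (expected values follow the documented semantics):

import paddle

x = paddle.to_tensor([[1, 2, 3],
                      [4, 5, 6]])
print(paddle.roll(x, shifts=1))           # roll over the flattened tensor -> [[6, 1, 2], [3, 4, 5]]
print(paddle.roll(x, shifts=1, axis=0))   # roll along axis 0 -> [[4, 5, 6], [1, 2, 3]]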
......