Unverified commit 3fc56aa0, authored by sunli, committed by GitHub

roll optimize (#32880)

Parent 07fadc4e
paddle/fluid/operators/roll_op.cc
@@ -13,8 +13,10 @@
// limitations under the License.
#include "paddle/fluid/operators/roll_op.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"

namespace paddle {
@@ -37,12 +39,22 @@ class RollOp : public framework::OperatorWithKernel {
    auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
    auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");

    if (dims.size() != 0) {
      PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
                        platform::errors::InvalidArgument(
                            "When dims.size() != 0, dims.size() "
                            "should be equal to shifts.size(). "
                            "But received dims.size() = %d, "
                            "shifts.size() = %d",
                            dims.size(), shifts.size()));
    } else {
      PADDLE_ENFORCE_EQ(shifts.size(), 1,
                        platform::errors::InvalidArgument(
                            "When dims.size() == 0, shifts.size() "
                            "should be equal to 1. But received "
                            "shifts.size() = %d",
                            shifts.size()));
    }

    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    auto type = ctx->GetInputsVarType("X")[0];
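The shape check above encodes the op's contract: with a non-empty axis, there must be one shift per axis; with an empty axis, exactly one shift, and the tensor is rolled as if flattened. A minimal Python sketch of that rule, for illustration only (the helper name is hypothetical, not part of this patch):

def check_roll_attrs(shifts, axis=()):
    # Non-empty axis: one shift per axis, as in RollOp::InferShape.
    if axis:
        assert len(axis) == len(shifts), (
            "dims.size() should equal shifts.size(), got %d vs %d"
            % (len(axis), len(shifts)))
    # Empty axis: the tensor is treated as flattened, so only one shift.
    else:
        assert len(shifts) == 1, (
            "with no axis, shifts.size() should be 1, got %d" % len(shifts))

check_roll_attrs([1, 2], axis=[0, 1])  # valid
check_roll_attrs([3])                  # valid: single shift, no axis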
@@ -95,7 +107,7 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<std::vector<int64_t>>(
        "axis",
        "Axis along which to roll. It must have the same size "
        "with shifts, or size == 0.")
        .SetDefault({});
    AddComment(R"DOC(
Roll the tensor along the given dimension(s).
@@ -151,8 +163,9 @@ REGISTER_OP_VERSION(roll)
        paddle::framework::compatible::OpVersionDesc()
            .NewAttr("axis",
                     "(std::vector<int64_t>) Axis along which to roll. "
                     "It must have the same size with shifts, or size = 0.",
                     std::vector<int64_t>())
            .DeleteAttr(
                "dims",
                "(std::vector<int64_t>) Dims along which to roll. "
                "It must have the same size with shifts, or size = 0."));
paddle/fluid/operators/roll_op.cu
@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
@@ -24,26 +25,31 @@ using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
                               paddle::framework::Array<int64_t, Rank> shifts,
                               paddle::framework::Array<int64_t, Rank> strides,
                               paddle::framework::Array<int64_t, Rank> sizes) {
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= N) {
    return;
  }

  int64_t output_idx = idx;
  int64_t dim_idx, dim_idx_shift;
#pragma unroll
  for (size_t i = 0; i < Rank; i++) {
    dim_idx = (idx / strides[i]) % sizes[i];
    dim_idx_shift = (dim_idx + shifts[i]) % sizes[i];
    output_idx = output_idx + (dim_idx_shift - dim_idx) * strides[i];
  }
  output[output_idx] = input[idx];
}
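For each flat input index, the kernel recovers the coordinate along every rolled dimension with (idx / stride) % size, shifts it modulo the dimension size, and accumulates the displacement into the output index; the shift/stride/size triples travel to the device by value in framework::Array, so no staging buffers or host-to-device copies are needed. A NumPy sketch of the same index arithmetic (illustrative only, assuming row-major element strides):

import numpy as np

def roll_flat_index(idx, shifts, strides, sizes):
    # Same per-element arithmetic as RollCudaKernel.
    output_idx = idx
    for shift, stride, size in zip(shifts, strides, sizes):
        dim_idx = (idx // stride) % size          # coordinate along this dim
        dim_idx_shift = (dim_idx + shift) % size  # coordinate after rolling
        output_idx += (dim_idx_shift - dim_idx) * stride
    return output_idx

x = np.arange(6).reshape(2, 3)        # element strides: (3, 1)
out = np.empty(x.size, dtype=x.dtype)
for idx in range(x.size):             # roll axis 0 by 1
    out[roll_flat_index(idx, [1], [3], [2])] = x.flat[idx]
assert (out.reshape(2, 3) == np.roll(x, 1, axis=0)).all()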
template <typename T>
class RollKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>("X");
@@ -61,50 +67,62 @@ class RollCUDAKernel : public framework::OpKernel<T> {
    auto input_dim = in->dims();
    auto stride_dim = framework::stride(input_dim);

    std::vector<int64_t> strides(nums), sizes(nums);
    if (dims.size() == 0) {
      strides[0] = 1;
      sizes[0] = numel;
      shifts[0] = (shifts[0] % numel + numel) % numel;
    } else {
      for (size_t i = 0; i < nums; i++) {
        int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
        int64_t size = input_dim[dim];
        shifts[i] = (shifts[i] % size + size) % size;
        strides[i] = stride_dim[dim];
        sizes[i] = size;
      }
    }

#define CALL_ROLL_CUDA_KERNEL(N)                                    \
  case N: {                                                         \
    paddle::framework::Array<int64_t, N> _strides;                  \
    paddle::framework::Array<int64_t, N> _shifts;                   \
    paddle::framework::Array<int64_t, N> _sizes;                    \
    for (size_t idx = 0; idx < N; ++idx) {                          \
      _strides[idx] = strides[idx];                                 \
      _shifts[idx] = shifts[idx];                                   \
      _sizes[idx] = sizes[idx];                                     \
    }                                                               \
    RollCudaKernel<T, N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /  \
                               PADDLE_CUDA_NUM_THREADS,             \
                           PADDLE_CUDA_NUM_THREADS, 0, stream>>>(   \
        in_data, out_data, numel, _shifts, _strides, _sizes);       \
    break;                                                          \
  }

    switch (nums) {
      CALL_ROLL_CUDA_KERNEL(1);
      CALL_ROLL_CUDA_KERNEL(2);
      CALL_ROLL_CUDA_KERNEL(3);
      CALL_ROLL_CUDA_KERNEL(4);
      CALL_ROLL_CUDA_KERNEL(5);
      CALL_ROLL_CUDA_KERNEL(6);
      CALL_ROLL_CUDA_KERNEL(7);
      CALL_ROLL_CUDA_KERNEL(8);
      CALL_ROLL_CUDA_KERNEL(9);
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
            "shifts.size() should be less than 10. But received "
            "shifts.size() = %d",
            shifts.size()));
    }
  }
};
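When axis is empty, the launch degenerates to a single logical dimension (stride 1, size numel), and in every branch the shift is first normalized as (s % size + size) % size, which maps negative and oversized shifts into [0, size). The double modulo matters in C++, where % can return a negative value; a quick illustrative check of the resulting values in Python:

def normalize_shift(shift, size):
    # Mirrors the C++ expression; the result always lands in [0, size).
    return (shift % size + size) % size

assert normalize_shift(-1, 5) == 4   # rolling left by 1 == rolling right by 4
assert normalize_shift(7, 5) == 2    # oversized shifts wrap around
assert normalize_shift(-12, 5) == 3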
template <typename T>
class RollGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
@@ -121,46 +139,38 @@ class RollGradCUDAKernel : public framework::OpKernel<T> {
    auto input_dim = in->dims();
    auto stride_dim = framework::stride(input_dim);

    std::vector<int64_t> strides(nums), sizes(nums);
    if (dims.size() == 0) {
      strides[0] = 1;
      sizes[0] = numel;
      shifts[0] = ((-shifts[0]) % numel + numel) % numel;
    } else {
      for (size_t i = 0; i < nums; i++) {
        int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
        int64_t size = input_dim[dim];
        shifts[i] = ((-shifts[i]) % size + size) % size;
        strides[i] = stride_dim[dim];
        sizes[i] = size;
      }
    }

    switch (nums) {
      CALL_ROLL_CUDA_KERNEL(1);
      CALL_ROLL_CUDA_KERNEL(2);
      CALL_ROLL_CUDA_KERNEL(3);
      CALL_ROLL_CUDA_KERNEL(4);
      CALL_ROLL_CUDA_KERNEL(5);
      CALL_ROLL_CUDA_KERNEL(6);
      CALL_ROLL_CUDA_KERNEL(7);
      CALL_ROLL_CUDA_KERNEL(8);
      CALL_ROLL_CUDA_KERNEL(9);
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
            "shifts.size() should be less than 10. But received "
            "shifts.size() = %d",
            shifts.size()));
    }
  }
};
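The gradient kernel is identical except that the shifts are negated before normalization: rolling by -s undoes rolling by s, so the backward pass simply routes the incoming gradient through the inverse roll. A short NumPy check of that identity (illustrative only):

import numpy as np

x = np.random.rand(4, 5)
shift, axis = 2, 1
# roll(roll(x, s), -s) == x, which is all the grad kernel relies on
assert np.allclose(np.roll(np.roll(x, shift, axis=axis), -shift, axis=axis), x)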
@@ -169,13 +179,12 @@
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
    ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
    ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
    ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
    roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
paddle/fluid/operators/roll_op.h
@@ -88,7 +88,13 @@ class RollKernel : public framework::OpKernel<T> {
    TensorToVector(input, context.device_context(), &out_vec);
    size_t nums = shifts.size();
    DDim input_dim = input.dims();

    // axis = none, reshape to 1-D tensor
    if (dims.size() == 0) {
      dims.push_back(0l);
      input_dim = framework::Dim<1>(out_vec.size());
    }

    for (size_t i = 0; i < nums; i++) {
      PADDLE_ENFORCE_EQ(
@@ -101,7 +107,7 @@ class RollKernel : public framework::OpKernel<T> {
    }
    output->mutable_data<T>(context.GetPlace());
    framework::TensorFromVector(out_vec, context.device_context(), output);
    output->Resize(input.dims());
  }
};
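On the CPU path an empty axis is handled the same way: the data is viewed as a 1-D tensor of numel elements, rolled along dimension 0, and the output is resized back to the original shape. In NumPy terms (a sketch of the equivalence, not Paddle code):

import numpy as np

x = np.arange(6).reshape(2, 3)
# flatten, roll along the single dimension, then restore the shape
rolled = np.roll(x.reshape(-1), 1).reshape(x.shape)
assert (rolled == np.roll(x, 1)).all()  # np.roll with axis=None agrees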
@@ -120,14 +126,20 @@ class RollGradKernel : public framework::OpKernel<T> {
    TensorToVector(input, context.device_context(), &out_vec);
    size_t nums = shifts.size();
    DDim input_dim = input.dims();

    // axis = none, reshape to 1-D tensor
    if (dims.size() == 0) {
      dims.push_back(0l);
      input_dim = framework::Dim<1>(out_vec.size());
    }

    for (size_t i = 0; i < nums; i++) {
      shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]);
    }
    output->mutable_data<T>(context.GetPlace());
    framework::TensorFromVector(out_vec, context.device_context(), output);
    output->Resize(input.dims());
  }
};
...
python/paddle/fluid/tests/unittests/test_roll_op.py
@@ -63,6 +63,7 @@ class TestRollAPI(unittest.TestCase):
    def test_roll_op_api(self):
        self.input_data()

        paddle.enable_static()
        # case 1:
        with program_guard(Program(), Program()):
            x = fluid.layers.data(name='x', shape=[-1, 3])
...
python/paddle/tensor/manipulation.py
@@ -459,28 +459,22 @@ def roll(x, shifts, axis=None, name=None):
    if axis:
        check_type(axis, 'axis', (list, tuple), 'roll')
    else:
        axis = []

    check_type(shifts, 'shifts', (list, tuple), 'roll')

    if in_dygraph_mode():
        return core.ops.roll(x, 'axis', axis, 'shifts', shifts)

    out = helper.create_variable_for_type_inference(x.dtype)
    helper.append_op(
        type='roll',
        inputs={'X': x},
        outputs={'Out': out},
        attrs={'axis': axis,
               'shifts': shifts})
    return out
...
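With this change the dygraph branch forwards an empty axis directly to the op instead of reshaping in Python, and the static branch no longer needs the reshape round trip either. A hedged usage sketch of the resulting behavior (Paddle 2.x API assumed):

import paddle

x = paddle.to_tensor([[1.0, 2.0, 3.0],
                      [4.0, 5.0, 6.0]])
print(paddle.roll(x, shifts=1))          # no axis: [[6, 1, 2], [3, 4, 5]]
print(paddle.roll(x, shifts=1, axis=0))  # [[4, 5, 6], [1, 2, 3]]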