Commit 3fc56aa0 (unverified)
Authored Jul 01, 2021 by sunli; committed by GitHub on Jul 01, 2021

roll optimize (#32880)

Parent: 07fadc4e
Showing 5 changed files with 149 additions and 120 deletions
paddle/fluid/operators/roll_op.cc   +24 −11
paddle/fluid/operators/roll_op.cu   +104 −95
paddle/fluid/operators/roll_op.h    +16 −4
python/paddle/fluid/tests/unittests/test_roll_op.py   +1 −0
python/paddle/tensor/manipulation.py   +4 −10
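Before the per-file diffs, a quick refresher on the operation being optimized: rolling a tensor by `shift` along `axis` moves the element at coordinate i to coordinate (i + shift) % size on that axis, with elements that fall off one end wrapping around to the other. The sketch below is a minimal CPU reference of that semantics (illustration only; `RollReference` is a hypothetical name, not code from this commit):

// roll_reference.cc -- minimal CPU reference for the roll semantics.
#include <cstdint>
#include <iostream>
#include <vector>

// Roll a row-major tensor of shape `sizes` by `shift` along `axis`.
std::vector<int64_t> RollReference(const std::vector<int64_t>& input,
                                   const std::vector<int64_t>& sizes,
                                   int64_t shift, int64_t axis) {
  // Row-major strides: strides[d] is the linear distance between
  // neighbouring elements along dimension d.
  std::vector<int64_t> strides(sizes.size(), 1);
  for (int64_t d = static_cast<int64_t>(sizes.size()) - 2; d >= 0; --d) {
    strides[d] = strides[d + 1] * sizes[d + 1];
  }
  const int64_t size = sizes[axis];
  shift = (shift % size + size) % size;  // normalize into [0, size)
  std::vector<int64_t> output(input.size());
  for (int64_t idx = 0; idx < static_cast<int64_t>(input.size()); ++idx) {
    int64_t dim_idx = (idx / strides[axis]) % size;    // coordinate on axis
    int64_t dim_idx_shift = (dim_idx + shift) % size;  // rolled coordinate
    output[idx + (dim_idx_shift - dim_idx) * strides[axis]] = input[idx];
  }
  return output;
}

int main() {
  // roll([0, 1, 2, 3, 4], shift = 2) -> [3, 4, 0, 1, 2]
  for (int64_t v : RollReference({0, 1, 2, 3, 4}, {5}, 2, 0)) {
    std::cout << v << " ";
  }
  std::cout << "\n";
}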
paddle/fluid/operators/roll_op.cc (view file @ 3fc56aa0)
...
@@ -13,8 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/operators/roll_op.h"
 #include <memory>
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
...
@@ -37,12 +39,22 @@ class RollOp : public framework::OperatorWithKernel {
     auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
     auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
-    PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
-                      platform::errors::InvalidArgument(
-                          "Attr(dims).size() should be equl to "
-                          "Attr(shifts).size(). But received "
-                          "Attr(dims).size() = %d, Attr(shifts).size() = %d",
-                          dims.size(), shifts.size()));
+    if (dims.size() != 0) {
+      PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
+                        platform::errors::InvalidArgument(
+                            "When dims.size() != 0, dims.size() "
+                            "should be equal to "
+                            "shifts.size(). But received "
+                            "dims.size() = %d, shifts.size() = %d",
+                            dims.size(), shifts.size()));
+    } else {
+      PADDLE_ENFORCE_EQ(shifts.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "When dims.size() == 0, shifts.size() "
+                            "should be equal to 1, But received "
+                            "shifts.size() = %d",
+                            shifts.size()));
+    }
 
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     auto type = ctx->GetInputsVarType("X")[0];
...
@@ -95,7 +107,7 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int64_t>>(
         "axis",
         "Axis along which to roll. It must have the same size "
-        "with shifts.")
+        "with shifts or size == 0")
         .SetDefault({});
     AddComment(R"DOC(
     Roll the tensor along the given dimension(s).
...
@@ -151,8 +163,9 @@ REGISTER_OP_VERSION(roll)
         paddle::framework::compatible::OpVersionDesc()
             .NewAttr("axis",
                      "(std::vector<int64_t>) Axis along which to roll. "
-                     "It must have the same size with shifts.",
+                     "It must have the same size with shifts, or size = 0.",
                      std::vector<int64_t>())
-            .DeleteAttr("dims",
-                        "(std::vector<int64_t>) Dims along which to roll. "
-                        "It must have the same size with shifts."));
+            .DeleteAttr("dims",
+                        "(std::vector<int64_t>) Dims along which to roll. "
+                        "It must have the same size with shifts, or size = 0."));
paddle/fluid/operators/roll_op.cu (view file @ 3fc56aa0)
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
+#include "paddle/fluid/framework/array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/roll_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
...
@@ -24,26 +25,31 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-template <typename T>
-__global__ void roll_cuda_kernel(const T* input, T* output, int64_t N,
-                                 int64_t* shifts, int64_t* strides,
-                                 int64_t* sizes, int64_t nums) {
+template <typename T, size_t Rank>
+__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
+                               paddle::framework::Array<int64_t, Rank> shifts,
+                               paddle::framework::Array<int64_t, Rank> strides,
+                               paddle::framework::Array<int64_t, Rank> sizes) {
   int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx >= N) {
     return;
   }
   int64_t output_idx = idx;
   int64_t dim_idx, dim_idx_shift;
-  for (int64_t i = 0; i < nums; i++) {
-    dim_idx = idx % (strides[i] * sizes[i]) / strides[i];
+#pragma unroll Rank
+  for (size_t i = 0; i < Rank; i++) {
+    dim_idx = (idx / strides[i]) % sizes[i];
     dim_idx_shift = (dim_idx + shifts[i]) % sizes[i];
     output_idx = output_idx + (dim_idx_shift - dim_idx) * strides[i];
   }
   output[output_idx] = input[idx];
 }
 
-template <typename DeviceContext, typename T>
-class RollCUDAKernel : public framework::OpKernel<T> {
+template <typename T>
+class RollKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
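The rewritten kernel takes shifts, strides, and sizes by value as `paddle::framework::Array<int64_t, Rank>` kernel parameters instead of raw device pointers, so each launch no longer needs the three device allocations and three host-to-device copies removed below, and the loop bound becomes the compile-time constant `Rank`, which `#pragma unroll` can unroll fully. The coordinate computation also changes from `idx % (strides[i] * sizes[i]) / strides[i]` to `(idx / strides[i]) % sizes[i]`; for contiguous row-major strides the two are equivalent, as this host-side check illustrates (a sketch, not part of the commit):

// index_equivalence_check.cc -- host-side sanity check (illustration only).
#include <cassert>
#include <cstdint>

int main() {
  // Example shape [2, 3, 4] with row-major strides {12, 4, 1}.
  const int64_t sizes[3] = {2, 3, 4};
  const int64_t strides[3] = {12, 4, 1};
  const int64_t numel = 2 * 3 * 4;
  for (int64_t idx = 0; idx < numel; ++idx) {
    for (int i = 0; i < 3; ++i) {
      const int64_t old_way = idx % (strides[i] * sizes[i]) / strides[i];
      const int64_t new_way = (idx / strides[i]) % sizes[i];
      assert(old_way == new_way);  // same per-axis coordinate either way
    }
  }
  return 0;
}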
...
@@ -61,50 +67,62 @@ class RollCUDAKernel : public framework::OpKernel<T> {
     auto input_dim = in->dims();
     auto stride_dim = framework::stride(input_dim);
-    int64_t dim, size;
-    size_t gpu_memory_size_ = sizeof(int64_t) * nums;
-    std::vector<int64_t> strides, sizes;
-    strides.resize(nums);
-    sizes.resize(nums);
-    paddle::memory::AllocationPtr shifts_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    paddle::memory::AllocationPtr strides_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    paddle::memory::AllocationPtr sizes_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    for (size_t i = 0; i < nums; i++) {
-      dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
-      size = input_dim[dim];
-      shifts[i] = (shifts[i] % size + size) % size;
-      strides[i] = stride_dim[dim];
-      sizes[i] = size;
-    }
+    std::vector<int64_t> strides(nums), sizes(nums);
+    if (dims.size() == 0) {
+      strides[0] = 1;
+      sizes[0] = numel;
+      shifts[0] = (shifts[0] % numel + numel) % numel;
+    } else {
+      for (size_t i = 0; i < nums; i++) {
+        int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
+        int64_t size = input_dim[dim];
+        shifts[i] = (shifts[i] % size + size) % size;
+        strides[i] = stride_dim[dim];
+        sizes[i] = size;
+      }
+    }
+
+#define CALL_ROLL_CUDA_KERNEL(N)                                               \
+  case N: {                                                                    \
+    paddle::framework::Array<int64_t, N> _strides;                             \
+    paddle::framework::Array<int64_t, N> _shifts;                              \
+    paddle::framework::Array<int64_t, N> _sizes;                               \
+    for (size_t idx = 0; idx < N; ++idx) {                                     \
+      _strides[idx] = strides[idx];                                            \
+      _shifts[idx] = shifts[idx];                                              \
+      _sizes[idx] = sizes[idx];                                                \
+    }                                                                          \
+    RollCudaKernel<                                                            \
+        T,                                                                     \
+        N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,  \
+             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel,   \
+                                                   _shifts, _strides, _sizes); \
+    break;                                                                     \
+  }
+
+    switch (nums) {
+      CALL_ROLL_CUDA_KERNEL(1);
+      CALL_ROLL_CUDA_KERNEL(2);
+      CALL_ROLL_CUDA_KERNEL(3);
+      CALL_ROLL_CUDA_KERNEL(4);
+      CALL_ROLL_CUDA_KERNEL(5);
+      CALL_ROLL_CUDA_KERNEL(6);
+      CALL_ROLL_CUDA_KERNEL(7);
+      CALL_ROLL_CUDA_KERNEL(8);
+      CALL_ROLL_CUDA_KERNEL(9);
+      default:
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "shifts.size() should be less than 10, But received shifts.size() "
+            "= %d",
+            shifts.size()));
+    }
-    paddle::memory::Copy(
-        BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()),
-        shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(),
-        gpu_memory_size_, stream);
-    paddle::memory::Copy(
-        BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()),
-        strides_gpu->ptr(), platform::CPUPlace(), strides.data(),
-        gpu_memory_size_, stream);
-    paddle::memory::Copy(
-        BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()),
-        sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(),
-        gpu_memory_size_, stream);
-    int64_t* shifts_ptr = reinterpret_cast<int64_t*>(shifts_gpu->ptr());
-    int64_t* strides_ptr = reinterpret_cast<int64_t*>(strides_gpu->ptr());
-    int64_t* sizes_ptr = reinterpret_cast<int64_t*>(sizes_gpu->ptr());
-    roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums);
   }
 };
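`CALL_ROLL_CUDA_KERNEL` plus the `switch (nums)` is a standard pattern for turning a runtime value (the number of rolled axes, capped at 9 here) into a compile-time template parameter: each `case` instantiates the kernel for one fixed `Rank`. A minimal stand-alone sketch of the same pattern, with hypothetical names and a plain function call in place of a CUDA launch:

// rank_dispatch_sketch.cc -- the switch-to-template dispatch pattern.
#include <cstddef>
#include <cstdint>
#include <iostream>

template <size_t Rank>
int64_t SumFirst(const int64_t* values) {
  int64_t total = 0;
  for (size_t i = 0; i < Rank; ++i) {  // trip count known at compile time
    total += values[i];
  }
  return total;
}

#define CALL_SUM_KERNEL(N) \
  case N:                  \
    return SumFirst<N>(values);

int64_t DispatchByRank(const int64_t* values, size_t rank) {
  switch (rank) {
    CALL_SUM_KERNEL(1);
    CALL_SUM_KERNEL(2);
    CALL_SUM_KERNEL(3);
    default:
      return -1;  // the real kernel throws InvalidArgument instead
  }
}

int main() {
  const int64_t v[3] = {10, 20, 30};
  std::cout << DispatchByRank(v, 2) << "\n";  // prints 30
}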
 
-template <typename DeviceContext, typename T>
-class RollGradCUDAKernel : public framework::OpKernel<T> {
+template <typename T>
+class RollGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
...
@@ -121,46 +139,38 @@ class RollGradCUDAKernel : public framework::OpKernel<T> {
     auto input_dim = in->dims();
     auto stride_dim = framework::stride(input_dim);
-    int64_t dim, size;
-    size_t gpu_memory_size_ = sizeof(int64_t) * nums;
-    std::vector<int64_t> strides, sizes;
-    strides.resize(nums);
-    sizes.resize(nums);
-    paddle::memory::AllocationPtr shifts_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    paddle::memory::AllocationPtr strides_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    paddle::memory::AllocationPtr sizes_gpu =
-        memory::Alloc(context.GetPlace(), gpu_memory_size_);
-    for (size_t i = 0; i < nums; i++) {
-      dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
-      size = input_dim[dim];
-      shifts[i] = ((0 - shifts[i]) % size + size) % size;
-      strides[i] = stride_dim[dim];
-      sizes[i] = size;
-    }
+    std::vector<int64_t> strides(nums), sizes(nums);
+    if (dims.size() == 0) {
+      strides[0] = 1;
+      sizes[0] = numel;
+      shifts[0] = ((-shifts[0]) % numel + numel) % numel;
+    } else {
+      for (size_t i = 0; i < nums; i++) {
+        int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
+        int64_t size = input_dim[dim];
+        shifts[i] = ((-shifts[i]) % size + size) % size;
+        strides[i] = stride_dim[dim];
+        sizes[i] = size;
+      }
+    }
-    paddle::memory::Copy(
-        BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()),
-        shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(),
-        gpu_memory_size_, stream);
-    paddle::memory::Copy(
-        BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()),
-        strides_gpu->ptr(), platform::CPUPlace(), strides.data(),
-        gpu_memory_size_, stream);
-    paddle::memory::Copy(
-        BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()),
-        sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(),
-        gpu_memory_size_, stream);
-    int64_t* shifts_ptr = reinterpret_cast<int64_t*>(shifts_gpu->ptr());
-    int64_t* strides_ptr = reinterpret_cast<int64_t*>(strides_gpu->ptr());
-    int64_t* sizes_ptr = reinterpret_cast<int64_t*>(sizes_gpu->ptr());
-    roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums);
+    switch (nums) {
+      CALL_ROLL_CUDA_KERNEL(1);
+      CALL_ROLL_CUDA_KERNEL(2);
+      CALL_ROLL_CUDA_KERNEL(3);
+      CALL_ROLL_CUDA_KERNEL(4);
+      CALL_ROLL_CUDA_KERNEL(5);
+      CALL_ROLL_CUDA_KERNEL(6);
+      CALL_ROLL_CUDA_KERNEL(7);
+      CALL_ROLL_CUDA_KERNEL(8);
+      CALL_ROLL_CUDA_KERNEL(9);
+      default:
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "shifts.size() should be less than 10, But received shifts.size() "
+            "= %d",
+            shifts.size()));
+    }
   }
 };
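The only difference from the forward kernel is that each shift is negated before the same normalization, `((-shifts[i]) % size + size) % size`, because rolling by -s undoes a roll by s. A small check of that inverse property (illustration only, not Paddle code):

// roll_inverse_check.cc -- rolling by s then by -s restores the input.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> Roll1D(const std::vector<int64_t>& in, int64_t shift) {
  const int64_t n = static_cast<int64_t>(in.size());
  shift = (shift % n + n) % n;  // normalization also handles negative shifts
  std::vector<int64_t> out(in.size());
  for (int64_t i = 0; i < n; ++i) {
    out[(i + shift) % n] = in[i];
  }
  return out;
}

int main() {
  const std::vector<int64_t> x = {1, 2, 3, 4, 5};
  // Forward rolls by 2; the gradient pass rolls by -2.
  assert(Roll1D(Roll1D(x, 2), -2) == x);
  return 0;
}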
...
@@ -169,13 +179,12 @@ class RollGradCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    roll, ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
-    roll_grad,
-    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    roll_grad,
+    ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
paddle/fluid/operators/roll_op.h (view file @ 3fc56aa0)
...
@@ -88,7 +88,13 @@ class RollKernel : public framework::OpKernel<T> {
     TensorToVector(input, context.device_context(), &out_vec);
     size_t nums = shifts.size();
-    const DDim input_dim = input.dims();
+    DDim input_dim = input.dims();
+
+    // axis = none, reshape to 1-D tensor
+    if (dims.size() == 0) {
+      dims.push_back(0l);
+      input_dim = framework::Dim<1>(out_vec.size());
+    }
 
     for (size_t i = 0; i < nums; i++) {
       PADDLE_ENFORCE_EQ(
...
@@ -101,7 +107,7 @@ class RollKernel : public framework::OpKernel<T> {
     }
     output->mutable_data<T>(context.GetPlace());
     framework::TensorFromVector(out_vec, context.device_context(), output);
-    output->Resize(input_dim);
+    output->Resize(input.dims());
   }
 };
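When `axis` is omitted (`dims` is empty), the CPU kernel now rolls over a flattened view: it treats the input as a 1-D tensor of length `out_vec.size()`, rolls along dimension 0, and `Resize(input.dims())` restores the original shape at the end. The equivalence it relies on (flatten, roll, reshape back), sketched on a 2x3 example (not Paddle code):

// flatten_roll_sketch.cc -- rolling with no axis == roll of the flat view.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // A 2x3 tensor stored row-major as a flat vector.
  const std::vector<int64_t> flat = {1, 2, 3, 4, 5, 6};
  const int64_t n = static_cast<int64_t>(flat.size());
  int64_t shift = 2;
  shift = (shift % n + n) % n;
  std::vector<int64_t> rolled(flat.size());
  for (int64_t i = 0; i < n; ++i) {
    rolled[(i + shift) % n] = flat[i];
  }
  // Reinterpreted as 2x3 again, the result is {{5, 6, 1}, {2, 3, 4}}.
  assert((rolled == std::vector<int64_t>{5, 6, 1, 2, 3, 4}));
  return 0;
}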
...
@@ -120,14 +126,20 @@ class RollGradKernel : public framework::OpKernel<T> {
     TensorToVector(input, context.device_context(), &out_vec);
     size_t nums = shifts.size();
-    const DDim input_dim = input.dims();
+    DDim input_dim = input.dims();
+
+    // axis = none, reshape to 1-D tensor
+    if (dims.size() == 0) {
+      dims.push_back(0l);
+      input_dim = framework::Dim<1>(out_vec.size());
+    }
 
     for (size_t i = 0; i < nums; i++) {
       shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]);
     }
     output->mutable_data<T>(context.GetPlace());
     framework::TensorFromVector(out_vec, context.device_context(), output);
-    output->Resize(input_dim);
+    output->Resize(input.dims());
   }
 };
...
python/paddle/fluid/tests/unittests/test_roll_op.py (view file @ 3fc56aa0)
...
@@ -63,6 +63,7 @@ class TestRollAPI(unittest.TestCase):
     def test_roll_op_api(self):
         self.input_data()
 
+        paddle.enable_static()
         # case 1:
         with program_guard(Program(), Program()):
             x = fluid.layers.data(name='x', shape=[-1, 3])
...
python/paddle/tensor/manipulation.py (view file @ 3fc56aa0)
...
@@ -459,28 +459,22 @@ def roll(x, shifts, axis=None, name=None):
     if axis:
         check_type(axis, 'axis', (list, tuple), 'roll')
+    else:
+        axis = []
+
     check_type(shifts, 'shifts', (list, tuple), 'roll')
 
     if in_dygraph_mode():
-        if axis is None:
-            x = core.ops.reshape(x, 'shape', [-1, 1])
-            axis = [0]
-        out = core.ops.roll(x, 'axis', axis, 'shifts', shifts)
-        return core.ops.reshape(out, 'shape', origin_shape)
+        return core.ops.roll(x, 'axis', axis, 'shifts', shifts)
 
     out = helper.create_variable_for_type_inference(x.dtype)
-    if axis is None:
-        x = reshape(x, shape=[-1, 1])
-        axis = [0]
     helper.append_op(
         type='roll',
         inputs={'X': x},
         outputs={'Out': out},
         attrs={'axis': axis,
                'shifts': shifts})
-    out = layers.reshape(out, shape=origin_shape)
     return out
...