Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
a400b76d
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a400b76d
编写于
12月 22, 2020
作者:
1
123malin
提交者:
GitHub
12月 22, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Roll cuda kernel (#29655)
* test=develop, optimize roll_op_cuda_kernel
上级
e7ac74c8
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
163 addition
and
8 deletion
+163
-8
paddle/fluid/operators/roll_op.cc
paddle/fluid/operators/roll_op.cc
+1
-0
paddle/fluid/operators/roll_op.cu
paddle/fluid/operators/roll_op.cu
+162
-8
未找到文件。
paddle/fluid/operators/roll_op.cc
浏览文件 @
a400b76d
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/operators/roll_op.h"
#include <memory>
#include <memory>
#include <vector>
#include <vector>
...
...
paddle/fluid/operators/roll_op.cu
浏览文件 @
a400b76d
...
@@ -12,16 +12,170 @@
...
@@ -12,16 +12,170 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace
paddle
{
namespace
operators
{
// Threads-per-block constant; the kernel launches in this file use it both to
// size the grid and as the block dimension.
using
platform
::
PADDLE_CUDA_NUM_THREADS
;
// Short alias for framework::Tensor.
using
Tensor
=
framework
::
Tensor
;
// Short alias for framework::LoDTensor, the type used to fetch the operator's
// inputs and outputs from the execution context.
using
LoDTensor
=
framework
::
LoDTensor
;
/**
 * Rolls a tensor along one or more axes; each thread relocates one element.
 *
 * The thread with flat index `idx` reads `input[idx]` and writes it to the
 * flat position obtained by shifting its coordinate along every rolled axis.
 *
 * @param input   Source buffer, indexed by flat element index.
 * @param output  Destination buffer, indexed by flat element index.
 * @param N       Number of elements to process; threads whose index is
 *                >= N exit without touching memory.
 * @param shifts  Per-axis shift amounts (`nums` entries). The wrap-around uses
 *                `%`, so entries are expected to be non-negative; the host
 *                code in this file normalises them into [0, sizes[i]).
 * @param strides Per-axis element stride of each rolled axis (`nums` entries).
 * @param sizes   Per-axis extent of each rolled axis (`nums` entries).
 * @param nums    Number of rolled axes, i.e. the length of the three arrays.
 */
template <typename T>
__global__ void roll_cuda_kernel(const T* input, T* output, int64_t N,
                                 int64_t* shifts, int64_t* strides,
                                 int64_t* sizes, int64_t nums) {
  // Widen before multiplying: blockIdx.x and blockDim.x are 32-bit unsigned,
  // so their product would wrap around for very large tensors if it were
  // computed before the conversion to int64_t.
  const int64_t idx =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx >= N) {
    return;
  }

  int64_t output_idx = idx;
  for (int64_t i = 0; i < nums; i++) {
    const int64_t stride = strides[i];
    const int64_t size = sizes[i];
    // Coordinate of this element along the i-th rolled axis.
    const int64_t dim_idx = idx % (stride * size) / stride;
    // Coordinate after rolling, wrapped back into [0, size).
    const int64_t dim_idx_shift = (dim_idx + shifts[i]) % size;
    // Move the flat index by the coordinate delta along this axis.
    output_idx += (dim_idx_shift - dim_idx) * stride;
  }
  output[output_idx] = input[idx];
}
/**
 * CUDA forward kernel of the `roll` operator.
 *
 * Reads the input tensor "X" and the attributes "shifts" and "axis", and
 * writes the rolled tensor to "Out" by launching roll_cuda_kernel with one
 * thread per element.
 */
template <typename DeviceContext, typename T>
class RollCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>("X");
    auto* out = context.Output<LoDTensor>("Out");
    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");

    auto* in_data = in->data<T>();
    auto* out_data = out->mutable_data<T>(context.GetPlace());
    int64_t numel = in->numel();
    if (numel == 0) {
      // Empty input: there is nothing to move. Returning here also avoids
      // the `% size` below with size == 0 and a launch with a zero-block grid.
      return;
    }
    auto stream =
        context.template device_context<platform::CUDADeviceContext>().stream();

    size_t nums = shifts.size();
    auto input_dim = in->dims();
    auto stride_dim = framework::stride(input_dim);

    // One int64_t per rolled axis in each of the three device-side arrays.
    size_t gpu_memory_size_ = sizeof(int64_t) * nums;
    std::vector<int64_t> strides(nums);
    std::vector<int64_t> sizes(nums);

    paddle::memory::AllocationPtr shifts_gpu =
        memory::Alloc(context.GetPlace(), gpu_memory_size_);
    paddle::memory::AllocationPtr strides_gpu =
        memory::Alloc(context.GetPlace(), gpu_memory_size_);
    paddle::memory::AllocationPtr sizes_gpu =
        memory::Alloc(context.GetPlace(), gpu_memory_size_);

    for (size_t i = 0; i < nums; i++) {
      // A negative axis counts from the last dimension.
      int64_t dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
      int64_t size = input_dim[dim];
      // Map any (possibly negative) shift into [0, size).
      shifts[i] = (shifts[i] % size + size) % size;
      strides[i] = stride_dim[dim];
      sizes[i] = size;
    }

    // Upload the per-axis metadata on the same stream as the kernel launch.
    paddle::memory::Copy(
        BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()),
        shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(),
        gpu_memory_size_, stream);
    paddle::memory::Copy(
        BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()),
        strides_gpu->ptr(), platform::CPUPlace(), strides.data(),
        gpu_memory_size_, stream);
    paddle::memory::Copy(
        BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()),
        sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(), gpu_memory_size_,
        stream);

    int64_t* shifts_ptr = reinterpret_cast<int64_t*>(shifts_gpu->ptr());
    int64_t* strides_ptr = reinterpret_cast<int64_t*>(strides_gpu->ptr());
    int64_t* sizes_ptr = reinterpret_cast<int64_t*>(sizes_gpu->ptr());

    // One thread per element; the block count is rounded up to cover numel.
    roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
                           PADDLE_CUDA_NUM_THREADS,
                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
        in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums);
  }
};
/**
 * CUDA backward kernel of the `roll` operator.
 *
 * Reads the gradient of "Out" and the attributes "shifts" and "axis", and
 * writes the gradient of "X". It launches roll_cuda_kernel with every shift
 * negated, so the incoming gradient is rolled in the opposite direction.
 */
template <typename DeviceContext, typename T>
class RollGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
    auto* out = context.Output<LoDTensor>(framework::GradVarName("X"));
    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");

    auto* in_data = in->data<T>();
    auto* out_data = out->mutable_data<T>(context.GetPlace());
    int64_t numel = in->numel();
    if (numel == 0) {
      // Empty gradient: there is nothing to move. Returning here also avoids
      // the `% size` below with size == 0 and a launch with a zero-block grid.
      return;
    }
    auto stream =
        context.template device_context<platform::CUDADeviceContext>().stream();

    size_t nums = shifts.size();
    auto input_dim = in->dims();
    auto stride_dim = framework::stride(input_dim);

    // One int64_t per rolled axis in each of the three device-side arrays.
    size_t gpu_memory_size_ = sizeof(int64_t) * nums;
    std::vector<int64_t> strides(nums);
    std::vector<int64_t> sizes(nums);

    paddle::memory::AllocationPtr shifts_gpu =
        memory::Alloc(context.GetPlace(), gpu_memory_size_);
    paddle::memory::AllocationPtr strides_gpu =
        memory::Alloc(context.GetPlace(), gpu_memory_size_);
    paddle::memory::AllocationPtr sizes_gpu =
        memory::Alloc(context.GetPlace(), gpu_memory_size_);

    for (size_t i = 0; i < nums; i++) {
      // A negative axis counts from the last dimension.
      int64_t dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
      int64_t size = input_dim[dim];
      // Negate the forward shift, then map it into [0, size).
      shifts[i] = ((0 - shifts[i]) % size + size) % size;
      strides[i] = stride_dim[dim];
      sizes[i] = size;
    }

    // Upload the per-axis metadata on the same stream as the kernel launch.
    paddle::memory::Copy(
        BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()),
        shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(),
        gpu_memory_size_, stream);
    paddle::memory::Copy(
        BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()),
        strides_gpu->ptr(), platform::CPUPlace(), strides.data(),
        gpu_memory_size_, stream);
    paddle::memory::Copy(
        BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()),
        sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(), gpu_memory_size_,
        stream);

    int64_t* shifts_ptr = reinterpret_cast<int64_t*>(shifts_gpu->ptr());
    int64_t* strides_ptr = reinterpret_cast<int64_t*>(strides_gpu->ptr());
    int64_t* sizes_ptr = reinterpret_cast<int64_t*>(sizes_gpu->ptr());

    // One thread per element; the block count is rounded up to cover numel.
    roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
                           PADDLE_CUDA_NUM_THREADS,
                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
        in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums);
  }
};
}
// namespace operators
}
// namespace paddle
namespace ops = paddle::operators;

// Register the CUDA implementations of `roll` for every supported element
// type.
REGISTER_OP_CUDA_KERNEL(
    roll, ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, double>,
    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, int>,
    ops::RollCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);

// Register the CUDA implementations of `roll_grad` for the same element
// types.
REGISTER_OP_CUDA_KERNEL(
    roll_grad,
    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
    ops::RollGradCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录