BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit 50cafa0b (unverified)
Authored on Mar 19, 2021 by zlsh80826; committed by GitHub on Mar 19, 2021

remove redundant sync, set collect/dist kernel to context stream, sub_lod memcpy opt (#31641)

Parent: 1d197f6c
Showing 3 changed files with 10 additions and 11 deletions (+10 −11)
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu        +2  −2
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu     +8  −8
paddle/fluid/operators/detection/generate_proposals_op.cu           +0  −1
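
The commit title covers three related CUDA changes in the detection operators: the collect/distribute FPN kernels are launched on the device context's stream rather than the default stream, the per-level sub_lod device-to-host copies are batched into a single copy, and a now-redundant device synchronization is removed. As background for the stream change, below is a minimal standalone CUDA sketch (not Paddle code; the kernel name, buffer sizes, and stream handling are hypothetical) of launching a kernel on an explicit stream via the fourth launch-configuration argument, which is what passing dev_ctx.stream() does in the diffs that follow.

    // Minimal standalone sketch (not Paddle code): launch a kernel on an
    // explicitly chosen stream, analogous to passing dev_ctx.stream() as the
    // fourth launch parameter in the diffs below. Names here are hypothetical.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void scale_kernel(float* data, int n, float factor) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= factor;
    }

    int main() {
      const int n = 1 << 20;
      float* d_data = nullptr;
      cudaMalloc(&d_data, n * sizeof(float));

      cudaStream_t stream;  // stands in for dev_ctx.stream()
      cudaStreamCreate(&stream);

      int threads = 256;
      int blocks = (n + threads - 1) / threads;

      // <<<blocks, threads>>> would use the default stream; passing the stream
      // as the fourth argument keeps the kernel ordered with the other work
      // queued on that same stream, so no extra device-wide sync is needed.
      scale_kernel<<<blocks, threads, 0, stream>>>(d_data, n, 2.0f);

      cudaStreamSynchronize(stream);  // sync only this stream, only when needed
      cudaStreamDestroy(stream);
      cudaFree(d_data);
      printf("done\n");
      return 0;
    }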
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -198,8 +198,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     int threads = kNumCUDAThreads;
     // get length-based lod by batch ids
-    GetLengthLoD<<<blocks, threads>>>(real_post_num, out_id_data,
-                                      length_lod_data);
+    GetLengthLoD<<<blocks, threads, 0, dev_ctx.stream()>>>(
+        real_post_num, out_id_data, length_lod_data);
     std::vector<int> length_lod_cpu(lod_size);
     memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place,
                  length_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     int dist_blocks = NumBlocks(roi_num);
     int threads = kNumCUDAThreads;
     // get target levels and sub_lod list
-    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
+    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads, 0, dev_ctx.stream()>>>(
         roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
         max_level, min_level, roi_batch_id_list_gpu.data<int>(),
         sub_lod_list_data, target_lvls_data, pixel_offset);
-    dev_ctx.Wait();
     auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
     Tensor index_in_t;
@@ -172,17 +171,18 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
+    std::vector<int> sub_lod_list_cpu(lod_size * num_level);
+    memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place,
+                 sub_lod_list_data, sizeof(int) * lod_size * num_level,
+                 dev_ctx.stream());
+    dev_ctx.Wait();
     for (int i = 0; i < num_level; ++i) {
       Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
-      int* sub_lod_data = sub_lod.data<int>();
       // transfer length-based lod to offset-based lod
       std::vector<size_t> offset(1, 0);
-      std::vector<int> sub_lod_cpu(lod_size);
-      memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place, sub_lod_data,
-                   sizeof(int) * lod_size, dev_ctx.stream());
-      dev_ctx.Wait();
       for (int j = 0; j < lod_size; ++j) {
-        offset.emplace_back(offset.back() + sub_lod_cpu[j]);
+        offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]);
       }
       int sub_rois_num = offset.back();
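
This hunk is the sub_lod memcpy optimization from the commit title: rather than issuing one device-to-host memory::Copy plus a blocking dev_ctx.Wait() for every pyramid level inside the loop, the whole num_level x lod_size table is copied once before the loop and indexed on the host. A minimal standalone sketch of that pattern (not Paddle code; buffer names and sizes such as sub_lod_device are hypothetical) could look like this:

    // Minimal sketch (not Paddle code) of the batched-copy pattern above:
    // copy the whole num_level * lod_size table once, wait once, then index
    // the host buffer inside the loop instead of copying per level.
    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    int main() {
      const int num_level = 4, lod_size = 8;
      const int total = num_level * lod_size;

      int* sub_lod_device = nullptr;  // would be filled by earlier kernels
      cudaMalloc(&sub_lod_device, total * sizeof(int));
      cudaMemset(sub_lod_device, 0, total * sizeof(int));

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // One batched copy + one sync instead of num_level copies and syncs.
      std::vector<int> sub_lod_host(total);
      cudaMemcpyAsync(sub_lod_host.data(), sub_lod_device, total * sizeof(int),
                      cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);

      for (int i = 0; i < num_level; ++i) {
        // Build an offset-based lod from the length-based lod of level i,
        // reading from the already-copied host buffer.
        std::vector<size_t> offset(1, 0);
        for (int j = 0; j < lod_size; ++j) {
          offset.emplace_back(offset.back() + sub_lod_host[i * lod_size + j]);
        }
        printf("level %d: %zu rois\n", i, offset.back());
      }

      cudaStreamDestroy(stream);
      cudaFree(sub_lod_device);
      return 0;
    }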
paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
     memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
                  scores.data<T>(), sizeof(T) * scores.numel(),
                  dev_ctx.stream());
-    dev_ctx.Wait();
     num_proposals += proposals.dims()[0];
     offset.emplace_back(num_proposals);
     tmp_num.push_back(proposals.dims()[0]);
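
The only change here is dropping the dev_ctx.Wait() after the asynchronous copy, the "remove redundant sync" part of the commit title: the copy and the later work that consumes its result are enqueued on the same stream, so the stream already serializes them and only the eventual host-side read needs a synchronization. A minimal standalone sketch of that reasoning (not Paddle code; the kernel and buffer names are hypothetical):

    // Minimal sketch (not Paddle code): a device-to-device cudaMemcpyAsync and
    // the kernel that reads its destination are issued on the same stream, so
    // no intermediate device-wide wait is needed; synchronize only before the
    // host actually uses the result.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void add_one(float* data, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] += 1.0f;
    }

    int main() {
      const int n = 1024;
      float *src = nullptr, *dst = nullptr;
      cudaMalloc(&src, n * sizeof(float));
      cudaMalloc(&dst, n * sizeof(float));
      cudaMemset(src, 0, n * sizeof(float));

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Device-to-device copy on the stream ...
      cudaMemcpyAsync(dst, src, n * sizeof(float), cudaMemcpyDeviceToDevice, stream);
      // ... no intermediate wait: the same stream guarantees the copy finishes
      // before the kernel that consumes dst starts.
      add_one<<<(n + 255) / 256, 256, 0, stream>>>(dst, n);

      float first = 0.0f;
      cudaMemcpyAsync(&first, dst, sizeof(float), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);  // sync once, before the host reads the value
      printf("dst[0] = %f\n", first);

      cudaStreamDestroy(stream);
      cudaFree(src);
      cudaFree(dst);
      return 0;
    }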