未验证 提交 50cafa0b 编写于 作者: Z zlsh80826 提交者: GitHub

remove redundant sync, set collect/dist kernel to context stream, sub_lod memcpy opt (#31641)

上级 1d197f6c
......@@ -198,8 +198,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
int threads = kNumCUDAThreads;
// get length-based lod by batch ids
- GetLengthLoD<<<blocks, threads>>>(real_post_num, out_id_data,
- length_lod_data);
+ GetLengthLoD<<<blocks, threads, 0, dev_ctx.stream()>>>(
+ real_post_num, out_id_data, length_lod_data);
std::vector<int> length_lod_cpu(lod_size);
memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place,
length_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
......
......@@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
int dist_blocks = NumBlocks(roi_num);
int threads = kNumCUDAThreads;
// get target levels and sub_lod list
- GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
+ GPUDistFpnProposalsHelper<T><<<dist_blocks, threads, 0, dev_ctx.stream()>>>(
  roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
  max_level, min_level, roi_batch_id_list_gpu.data<int>(),
  sub_lod_list_data, target_lvls_data, pixel_offset);
- dev_ctx.Wait();
auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
Tensor index_in_t;
......@@ -172,17 +171,18 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
int start = 0;
auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
+ std::vector<int> sub_lod_list_cpu(lod_size * num_level);
+ memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place,
+ sub_lod_list_data, sizeof(int) * lod_size * num_level,
+ dev_ctx.stream());
+ dev_ctx.Wait();
  for (int i = 0; i < num_level; ++i) {
  Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
  int* sub_lod_data = sub_lod.data<int>();
  // transfer length-based lod to offset-based lod
  std::vector<size_t> offset(1, 0);
- std::vector<int> sub_lod_cpu(lod_size);
- memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place,
- sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
- dev_ctx.Wait();
  for (int j = 0; j < lod_size; ++j) {
- offset.emplace_back(offset.back() + sub_lod_cpu[j]);
+ offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]);
}
int sub_rois_num = offset.back();
......
......@@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
scores.data<T>(), sizeof(T) * scores.numel(),
dev_ctx.stream());
- dev_ctx.Wait();
num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals);
tmp_num.push_back(proposals.dims()[0]);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册