Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 8c3777df (unverified)
Authored June 07, 2022 by Wilber; committed via GitHub on June 07, 2022.
[multi-stream] Fix split and concat problem. (#43039)
Parent: 9bb39d48
Showing 7 changed files with 29 additions and 29 deletions (+29 -29).
paddle/fluid/inference/api/analysis_predictor.cc (+0 -6)
paddle/fluid/inference/tests/infer_ut/CMakeLists.txt (+2 -0)
paddle/fluid/inference/tests/infer_ut/test_LeViT.cc (+1 -1)
paddle/fluid/memory/memcpy.cc (+1 -1)
paddle/fluid/platform/device_context.cc (+3 -1)
paddle/phi/backends/gpu/gpu_context.cc (+1 -2)
paddle/phi/kernels/funcs/concat_and_split_functor.cu (+21 -18)
paddle/fluid/inference/api/analysis_predictor.cc

@@ -1090,12 +1090,6 @@ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
     process_level_allocator_enabled = true;
   }
 
-  // TODO(Jingzhuangzhuang): Fix trt error when allocator_strategy is
-  // auto_growth
-  if (config.tensorrt_engine_enabled()) {
-    gflags.push_back("--allocator_strategy=naive_best_fit");
-  }
-
   if (framework::InitGflags(gflags)) {
     VLOG(3) << "The following gpu analysis configurations only take effect "
                "for the first predictor: ";
paddle/fluid/inference/tests/infer_ut/CMakeLists.txt

@@ -87,9 +87,11 @@ endif()
 if(WITH_GPU)
   if(NOT WIN32)
     add_definitions("-DPADDLE_WITH_GPU")
+    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+    include_directories("${CUDA_LIB}/../include")
   else()
     set(CUDA_LIB ""
paddle/fluid/inference/tests/infer_ut/test_LeViT.cc

@@ -157,7 +157,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) {
   for (int i = 0; i < thread_num; ++i) {
     threads.emplace_back(paddle::test::SingleThreadPrediction,
                          pred_pool.Retrive(i), &my_input_data_map,
-                         &infer_output_data, 2);
+                         &infer_output_data, 10);
   }
   // thread join & check outputs
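The only functional change above bumps the final argument of SingleThreadPrediction from 2 to 10, increasing the per-thread workload of this multi-thread test. For orientation, here is a minimal standalone sketch of the same harness shape, N threads each driving its own predictor from a pool; Predictor and RunRepeated are illustrative stand-ins, and only the emplace_back-over-a-pool pattern is taken from the test itself.

    #include <thread>
    #include <vector>

    // Illustrative stand-in for the test's predictor type; the real test
    // passes paddle::test::SingleThreadPrediction plus pred_pool.Retrive(i).
    struct Predictor {
      void RunRepeated(int repeats) { /* run inference `repeats` times */ }
    };

    int main() {
      const int thread_num = 4;
      std::vector<Predictor> pred_pool(thread_num);  // one predictor per thread
      std::vector<std::thread> threads;
      for (int i = 0; i < thread_num; ++i) {
        threads.emplace_back([&pred_pool, i] { pred_pool[i].RunRepeated(10); });
      }
      for (auto& t : threads) t.join();  // thread join & check outputs
      return 0;
    }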
paddle/fluid/memory/memcpy.cc

@@ -648,7 +648,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
   platform::SetDeviceId(dst_place.device);
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
+          << dst_place << " by stream(" << stream << ")";
   if (stream) {
     platform::RecordEvent record_event(
         "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
paddle/fluid/platform/device_context.cc

@@ -54,7 +54,9 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   auto& desired_dev_ctx =
       static_cast<const platform::CUDADeviceContext&>(dev_ctx);
   if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-    return Alloc(place, size);
+    return paddle::memory::Alloc(
+        desired_dev_ctx.GetPlace(), size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(desired_dev_ctx.stream())));
   } else {
     return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
         desired_dev_ctx, size);
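This hunk is the allocator half of the multi-stream fix: even when the requested context shares the default context's stream, the allocation now goes through the stream-tagged overload of paddle::memory::Alloc, so the returned block is ordered with respect to that stream. The CUDA runtime offers the same contract natively via stream-ordered memory allocation; a minimal sketch of that contract follows (plain CUDA runtime, not Paddle's allocator).

    #include <cuda_runtime.h>

    // Sketch: stream-ordered allocation (CUDA 11.2+). A block obtained with
    // cudaMallocAsync is ordered against `stream`, the same guarantee the
    // hunk above encodes by passing phi::Stream into Alloc: the memory must
    // not be recycled until the stream's queued work has finished with it.
    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      void* buf = nullptr;
      cudaMallocAsync(&buf, 1 << 20, stream);  // allocation ordered on `stream`
      // ... launch kernels that use `buf` on `stream` ...
      cudaFreeAsync(buf, stream);              // the free is stream-ordered too

      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);
      return 0;
    }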
paddle/phi/backends/gpu/gpu_context.cc

@@ -504,8 +504,7 @@ struct GPUContext::Impl {
   void AddStreamCallback(const std::function<void()>& callback) const {
     // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may
-    // launch too
-    // many threads and result in thread oversubscription.
+    // launch too many threads and result in thread oversubscription.
     auto* callback_func = new std::function<void()>(std::move(callback));
     auto* func = new std::function<void()>([this, callback_func] {
       std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
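Only a comment is rewrapped here, but the surrounding body is worth a note: the callback is copied onto the heap because the driver invokes it asynchronously, after AddStreamCallback itself has returned, once earlier work on the stream completes. Below is a standalone sketch of that pattern against the plain CUDA runtime; Paddle's GPUContext layers a mutex and a teardown path on top of this.

    #include <cstdio>
    #include <functional>
    #include <cuda_runtime.h>

    // The heap copy is essential: the driver calls the trampoline after the
    // enqueuing scope is gone, so the callable cannot live on the stack.
    static void Trampoline(void* user_data) {
      auto* fn = static_cast<std::function<void()>*>(user_data);
      (*fn)();
      delete fn;
    }

    void AddStreamCallback(cudaStream_t stream, std::function<void()> cb) {
      auto* heap_cb = new std::function<void()>(std::move(cb));
      cudaLaunchHostFunc(stream, Trampoline, heap_cb);
    }

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      AddStreamCallback(stream, [] { std::puts("stream work finished"); });
      cudaStreamSynchronize(stream);  // returns after the callback has run
      cudaStreamDestroy(stream);
      return 0;
    }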
paddle/phi/kernels/funcs/concat_and_split_functor.cu

@@ -276,10 +276,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
     int64_t out_row = in_row, out_col = 0;
 
     int inputs_col_num = in_num + 1;
-    std::vector<const T*> inputs_data_vec(in_num);
-    std::vector<int64_t> inputs_col_vec(inputs_col_num);
-    const T** inputs_data = inputs_data_vec.data();
-    int64_t* inputs_col = inputs_col_vec.data();
+    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -289,16 +286,22 @@ struct ConcatFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
-    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        in_num * sizeof(T*));
-    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                       inputs_col_num * sizeof(int));
-    inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
+#else
+    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
+    // allocator.
+    data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       in_num * sizeof(T*));
+    col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                      (inputs_col_num) * sizeof(int64_t));
 #endif
+    const T** inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
+    int64_t* inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
     inputs_col[0] = 0;
     bool has_same_shape = true;
@@ -387,7 +390,6 @@ struct ConcatFunctor<phi::GPUContext, T> {
           output->data<T>());
     }
 
-#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -401,7 +403,6 @@ struct ConcatFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
           col_alloc_released);
     });
-#endif
   }
 };
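Taken together, the ConcatFunctor hunks replace stack-backed std::vectors, whose .data() pointers fed an asynchronous host-to-device copy, with explicitly managed allocations: pinned memory on HIP, CPU-allocator memory otherwise. The retained comment explains why the vector version only appeared safe: the NV runtime stages small pageable copies through an internal buffer before returning, while the HIP runtime copies truly asynchronously, so a short-lived host buffer is a genuine lifetime hazard there. A self-contained sketch of the two shapes (toy kernel, plain CUDA runtime, not Paddle's code):

    #include <cuda_runtime.h>
    #include <vector>

    __global__ void UseTable(const int* table, int n) {
      // A real kernel would read table[0..n); the body is elided here.
      (void)table;
      (void)n;
    }

    void UnsafeShape(cudaStream_t stream) {
      std::vector<int> table = {1, 2, 3, 4};  // pageable, dies at scope exit
      void* d_mem = nullptr;
      cudaMallocAsync(&d_mem, table.size() * sizeof(int), stream);  // CUDA 11.2+
      // Hazard on runtimes where this copy is truly asynchronous (the HIP
      // case in the diff's comment): `table` may die before the copy reads it.
      cudaMemcpyAsync(d_mem, table.data(), table.size() * sizeof(int),
                      cudaMemcpyHostToDevice, stream);
      UseTable<<<1, 32, 0, stream>>>(static_cast<const int*>(d_mem),
                                     static_cast<int>(table.size()));
      cudaFreeAsync(d_mem, stream);
    }

    void SafeShape(cudaStream_t stream) {
      void* h_mem = nullptr;
      cudaMallocHost(&h_mem, 4 * sizeof(int));  // pinned, explicit lifetime
      int* h_table = static_cast<int*>(h_mem);
      for (int i = 0; i < 4; ++i) h_table[i] = i + 1;
      void* d_mem = nullptr;
      cudaMallocAsync(&d_mem, 4 * sizeof(int), stream);
      cudaMemcpyAsync(d_mem, h_table, 4 * sizeof(int),
                      cudaMemcpyHostToDevice, stream);
      UseTable<<<1, 32, 0, stream>>>(static_cast<const int*>(d_mem), 4);
      cudaFreeAsync(d_mem, stream);
      cudaStreamSynchronize(stream);  // or defer the release via a callback
      cudaFreeHost(h_mem);
    }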
@@ -432,10 +433,7 @@ class SplitFunctor<phi::GPUContext, T> {
     bool has_same_shape = true;
 
     int outputs_cols_num = o_num + 1;
-    std::vector<T*> outputs_data_vec(o_num);
-    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
-    T** outputs_data = outputs_data_vec.data();
-    int64_t* outputs_cols = outputs_cols_vec.data();
+    paddle::memory::AllocationPtr data_alloc, cols_alloc;
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -445,16 +443,22 @@ class SplitFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
-    paddle::memory::AllocationPtr data_alloc, cols_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        o_num * sizeof(T*));
-    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        (outputs_cols_num) * sizeof(int64_t));
-    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
+#else
+    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
+    // allocator.
+    data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       o_num * sizeof(T*));
+    cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       (outputs_cols_num) * sizeof(int64_t));
 #endif
+    T** outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
+    int64_t* outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
@@ -547,7 +551,7 @@ class SplitFunctor<phi::GPUContext, T> {
                                static_cast<int>(outputs_cols_num),
                                dev_out_gpu_data);
     }
-#ifdef PADDLE_WITH_HIP
+
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -559,7 +563,6 @@ class SplitFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
           cols_alloc_released);
     });
-#endif
   }
 };
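The #ifdef PADDLE_WITH_HIP / #endif removals in the last four hunks make the deferred-release path unconditional: now that data_alloc and cols_alloc exist on every build, the staging buffers are released from a stream callback on every build too, so they cannot be recycled before the kernel that reads them has run. Reusing the AddStreamCallback sketch from the gpu_context.cc note above, the pattern reduces to the following sketch; std::free stands in for Paddle's AllocationDeleter, and the staging is plain heap memory as on the commit's non-HIP path.

    #include <cstdlib>
    #include <functional>
    #include <cuda_runtime.h>

    // From the gpu_context.cc sketch above: runs `cb` on the host once prior
    // work queued on `stream` completes.
    void AddStreamCallback(cudaStream_t stream, std::function<void()> cb);

    void LaunchAndDeferRelease(cudaStream_t stream, int* h_staging) {
      // ... cudaMemcpyAsync from h_staging and kernel launch on `stream` ...
      AddStreamCallback(stream, [h_staging] {
        // Prior stream work is done, so the staging can be recycled. A host
        // callback must not call CUDA APIs, so heap staging is freed here;
        // pinned staging would have to be handed off to a worker thread.
        std::free(h_staging);
      });
    }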