Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 20da7703 (unverified)
Authored on Jul 07, 2021 by xiayanming; committed via GitHub on Jul 07, 2021
[HIP] Fix hipMemcpy failing to overlap; AMD GPU performance improves by more than 10% after the change (#33982)
Parent: 758dd7bb
Showing 1 changed file with 83 additions and 22 deletions (+83, −22).

File: paddle/fluid/operators/math/concat_and_split.cu @ 20da7703
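Background for the change (not part of the patch): as the comments inside the diff note, the CUDA runtime stages small (64 KB or less) pageable host-to-device copies asynchronously, while the HIP runtime copies asynchronously only from pinned (page-locked) host memory. A minimal, self-contained HIP sketch of that difference follows; every name in it (busy_kernel, d_meta, the buffer sizes) is an illustrative assumption, not code from this commit.

```cpp
// Illustrative sketch only, not part of the patch. Assumes a ROCm/HIP
// toolchain (compile with hipcc). It shows why the patch stages small
// metadata in pinned memory: hipMemcpyAsync from a pageable buffer may fall
// back to a blocking copy on HIP, while the same call from a pinned buffer
// returns immediately and lets the host keep enqueuing work.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

__global__ void busy_kernel(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  float v = data[i];
  for (int k = 0; k < 2000; ++k) v = v * 1.0001f + 0.5f;  // keep the GPU busy
  data[i] = v;
}

int main() {
  const int n = 1 << 20;
  const size_t meta_bytes = 1024 * sizeof(int);

  float* d_data = nullptr;
  int* d_meta = nullptr;
  hipMalloc(reinterpret_cast<void**>(&d_data), n * sizeof(float));
  hipMalloc(reinterpret_cast<void**>(&d_meta), meta_bytes);

  hipStream_t stream;
  hipStreamCreate(&stream);

  // Pageable host buffer (ordinary std::vector storage).
  std::vector<int> pageable_meta(1024, 42);

  // Pinned host buffer, eligible for a truly asynchronous copy on HIP.
  int* pinned_meta = nullptr;
  hipHostMalloc(reinterpret_cast<void**>(&pinned_meta), meta_bytes,
                hipHostMallocDefault);
  for (int i = 0; i < 1024; ++i) pinned_meta[i] = 42;

  // Keep the device busy on the stream.
  hipLaunchKernelGGL(busy_kernel, dim3((n + 255) / 256), dim3(256), 0, stream,
                     d_data, n);

  // (1) From pageable memory: on HIP this call may block the host until the
  //     copy completes, so nothing else can be prepared behind it.
  hipMemcpyAsync(d_meta, pageable_meta.data(), meta_bytes,
                 hipMemcpyHostToDevice, stream);

  // (2) From pinned memory: the call returns immediately; the copy is queued
  //     on the stream and the host can go on enqueuing further work.
  hipMemcpyAsync(d_meta, pinned_meta, meta_bytes, hipMemcpyHostToDevice,
                 stream);

  hipStreamSynchronize(stream);
  hipHostFree(pinned_meta);
  hipFree(d_meta);
  hipFree(d_data);
  hipStreamDestroy(stream);
  printf("done\n");
  return 0;
}
```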
```diff
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
```
```diff
@@ -242,8 +243,28 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     int in_col = input[0].numel() / in_row;
     int out_row = in_row, out_col = 0;
-    std::vector<const T*> inputs_data(in_num);
-    std::vector<int> inputs_col(in_num + 1);
+    int inputs_col_num = in_num + 1;
+    std::vector<const T*> inputs_data_vec(in_num);
+    std::vector<int> inputs_col_vec(inputs_col_num);
+    const T** inputs_data = inputs_data_vec.data();
+    int* inputs_col = inputs_col_vec.data();
+    // There are some differences between hip runtime and NV runtime.
+    // In NV, when the pageable memory data less than 64K is transferred from
+    // hosttodevice, it will be automatically asynchronous.
+    // However, only pinned memory in hip can copy asynchronously
+    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
+    // 3.2.6.1. Concurrent Execution between Host and Device
+    // Memory copies from host to device of a memory block of 64 KB or less
+#ifdef PADDLE_WITH_HIP
+    memory::AllocationPtr data_alloc, col_alloc;
+    data_alloc = memory::Alloc(platform::CUDAPinnedPlace(),
+                               in_num * sizeof(T*));
+    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
+    col_alloc = memory::Alloc(platform::CUDAPinnedPlace(),
+                              inputs_col_num * sizeof(int));
+    inputs_col = reinterpret_cast<int*>(col_alloc->ptr());
+#endif
     inputs_col[0] = 0;
     bool has_same_shape = true;
```
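The hunk above carries the core of the fix for ConcatFunctor: the per-input pointer table and the column offsets now live in raw buffers that, under PADDLE_WITH_HIP, are allocated from pinned memory (platform::CUDAPinnedPlace()), so the memory::Copy(..., context.stream()) calls later in the function can run asynchronously. Below is a standalone HIP sketch of that staging pattern; the names gather_first_elements and gather_via_pinned_table are hypothetical and not from the patch.

```cpp
// Sketch of the staging pattern used above (illustrative names, not from the
// patch): a table of per-input device pointers is built in pinned host memory
// so that its host-to-device copy can be issued asynchronously on the compute
// stream right before the kernel that consumes it.
#include <hip/hip_runtime.h>
#include <vector>

__global__ void gather_first_elements(const float* const* inputs,
                                      int num_inputs, float* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_inputs) out[i] = inputs[i][0];
}

void gather_via_pinned_table(const std::vector<float*>& device_inputs,
                             float* d_out, hipStream_t stream) {
  const int num_inputs = static_cast<int>(device_inputs.size());
  const size_t table_bytes = num_inputs * sizeof(float*);

  // Build the pointer table in pinned host memory; the patch does the same
  // thing through memory::Alloc(platform::CUDAPinnedPlace(), ...).
  float** pinned_table = nullptr;
  hipHostMalloc(reinterpret_cast<void**>(&pinned_table), table_bytes,
                hipHostMallocDefault);
  for (int i = 0; i < num_inputs; ++i) pinned_table[i] = device_inputs[i];

  // Device-side copy of the table, filled asynchronously on the stream.
  float** d_table = nullptr;
  hipMalloc(reinterpret_cast<void**>(&d_table), table_bytes);
  hipMemcpyAsync(d_table, pinned_table, table_bytes, hipMemcpyHostToDevice,
                 stream);

  // The kernel is queued behind the copy on the same stream, so it sees the
  // completed pointer table.
  hipLaunchKernelGGL(gather_first_elements, dim3((num_inputs + 255) / 256),
                     dim3(256), 0, stream, d_table, num_inputs, d_out);

  // The pinned table must stay alive until the queued copy has executed. This
  // sketch simply synchronizes before freeing; the patch instead defers the
  // release to a stream callback (see the PADDLE_WITH_HIP block further down).
  hipStreamSynchronize(stream);
  hipHostFree(pinned_table);
  hipFree(d_table);
}
```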
```diff
@@ -264,12 +285,11 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     memory::allocation::AllocationPtr tmp_dev_ins_data;
     const T** dev_ins_data = nullptr;
     if (!has_same_shape || in_num < 2 || in_num > 4) {
-      tmp_dev_ins_data = memory::Alloc(context, inputs_data.size() * sizeof(T*));
+      tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*));
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
                    tmp_dev_ins_data->ptr(), platform::CPUPlace(),
-                   static_cast<void*>(inputs_data.data()),
-                   inputs_data.size() * sizeof(T*), context.stream());
+                   static_cast<void*>(inputs_data), in_num * sizeof(T*),
+                   context.stream());
       dev_ins_data = reinterpret_cast<const T**>(tmp_dev_ins_data->ptr());
     }
```
```diff
@@ -292,17 +312,29 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
       }
     } else {
       auto tmp_dev_ins_col_data =
-          memory::Alloc(context, inputs_col.size() * sizeof(int));
+          memory::Alloc(context, inputs_col_num * sizeof(int));
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
                    tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
-                   static_cast<void*>(inputs_col.data()),
-                   inputs_col.size() * sizeof(int), context.stream());
+                   static_cast<void*>(inputs_col), inputs_col_num * sizeof(int),
+                   context.stream());
       int* dev_ins_col_data = static_cast<int*>(tmp_dev_ins_col_data->ptr());
       ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
+          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col_num),
           out_row, out_col, output->data<T>());
     }
+#ifdef PADDLE_WITH_HIP
+    // Prevent the pinned memory value from being covered and release the memory
+    // after the launch kernel of the stream is executed (reapply pinned memory
+    // next time)
+    auto* data_alloc_released = data_alloc.release();
+    auto* col_alloc_released = col_alloc.release();
+    context.AddStreamCallback([data_alloc_released, col_alloc_released] {
+      memory::allocation::AllocationDeleter deleter;
+      deleter(data_alloc_released);
+      deleter(col_alloc_released);
+    });
+#endif
   }
 };
```
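The PADDLE_WITH_HIP block above handles the lifetime problem that the pinned buffers introduce: they must not be freed or overwritten before the asynchronous copies that read from them have executed, so ownership is released and the allocations are deleted from a callback that context.AddStreamCallback fires after the work already enqueued on the stream has run. The following generic HIP sketch shows the same idea with hipStreamAddCallback and a flag; it is a simplified assumption about the mechanics, not the patch's own code, and since stream callbacks are not supposed to make runtime API calls themselves, the actual hipHostFree happens back on a host thread.

```cpp
// Sketch of the deferred-release idea (illustrative, not the patch's own
// mechanism). The pinned staging buffer may not be freed or overwritten until
// the stream has executed the asynchronous copy that reads from it. A stream
// callback signals completion; the hipHostFree itself runs on the host
// thread, because runtime API calls are not allowed inside the callback.
#include <hip/hip_runtime.h>
#include <atomic>
#include <cstddef>
#include <thread>

struct PinnedRelease {
  void* pinned_ptr = nullptr;
  std::atomic<bool> safe_to_free{false};
};

// Matches hipStreamCallback_t; runs after all work previously enqueued on the
// stream (the async copy included) has finished.
static void mark_safe_to_free(hipStream_t /*stream*/, hipError_t /*status*/,
                              void* user_data) {
  static_cast<PinnedRelease*>(user_data)->safe_to_free.store(true);
}

void copy_then_release(void* d_dst, void* pinned_src, size_t bytes,
                       hipStream_t stream, PinnedRelease* release) {
  release->pinned_ptr = pinned_src;
  hipMemcpyAsync(d_dst, pinned_src, bytes, hipMemcpyHostToDevice, stream);
  hipStreamAddCallback(stream, mark_safe_to_free, release, 0);

  // In this compressed sketch we wait right here, which a real implementation
  // (and the patch) avoids: the point is only that the free is gated on the
  // callback having fired, not on the enqueue path returning.
  while (!release->safe_to_free.load()) std::this_thread::yield();
  hipHostFree(release->pinned_ptr);
}
```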
```diff
@@ -313,6 +345,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
 template <typename T>
 class SplitFunctor<platform::CUDADeviceContext, T> {
  public:
+  SplitFunctor();
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const std::vector<const framework::Tensor*>& ref_inputs,
```
```diff
@@ -329,8 +362,27 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
     int64_t in_col = 0, in_row = out_row;
     bool has_same_shape = true;
-    std::vector<T*> outputs_data(o_num);
-    std::vector<int64_t> outputs_cols(o_num + 1);
+    int outputs_cols_num = o_num + 1;
+    std::vector<T*> outputs_data_vec(o_num);
+    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
+    T** outputs_data = outputs_data_vec.data();
+    int64_t* outputs_cols = outputs_cols_vec.data();
+    // There are some differences between hip runtime and NV runtime.
+    // In NV, when the pageable memory data less than 64K is transferred from
+    // hosttodevice, it will be automatically asynchronous.
+    // However, only pinned memory in hip can copy asynchronously
+    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
+    // 3.2.6.1. Concurrent Execution between Host and Device
+    // Memory copies from host to device of a memory block of 64 KB or less
+#ifdef PADDLE_WITH_HIP
+    memory::AllocationPtr data_alloc, cols_alloc;
+    data_alloc = memory::Alloc(platform::CUDAPinnedPlace(), o_num * sizeof(T*));
+    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
+    cols_alloc = memory::Alloc(platform::CUDAPinnedPlace(),
+                               (outputs_cols_num) * sizeof(int64_t));
+    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
+#endif
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
```
```diff
@@ -354,12 +406,11 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
     memory::allocation::AllocationPtr tmp_dev_outs_data;
     T** dev_out_gpu_data = nullptr;
     if (!has_same_shape || o_num < 2 || o_num > 4) {
-      tmp_dev_outs_data = memory::Alloc(context, outputs_data.size() * sizeof(T*));
+      tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*));
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
                    tmp_dev_outs_data->ptr(), platform::CPUPlace(),
-                   reinterpret_cast<void*>(outputs_data.data()),
-                   outputs_data.size() * sizeof(T*), context.stream());
+                   reinterpret_cast<void*>(outputs_data), o_num * sizeof(T*),
+                   context.stream());
       dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr());
     }
```
```diff
@@ -382,20 +433,30 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
       }
     } else {
       auto tmp_dev_ins_col_data =
-          memory::Alloc(context, outputs_cols.size() * sizeof(int64_t));
+          memory::Alloc(context, outputs_cols_num * sizeof(int64_t));
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
                    tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
-                   reinterpret_cast<void*>(outputs_cols.data()),
-                   outputs_cols.size() * sizeof(int64_t), context.stream());
+                   reinterpret_cast<void*>(outputs_cols),
+                   outputs_cols_num * sizeof(int64_t), context.stream());
       int64_t* dev_outs_col_data =
           reinterpret_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
       SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
           input.data<T>(), in_row, in_col, dev_outs_col_data,
-          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
+          static_cast<int>(outputs_cols_num), dev_out_gpu_data);
     }
+#ifdef PADDLE_WITH_HIP
+    // Prevent the pinned memory value from being covered and release the memory
+    // after the launch kernel of the stream is executed (reapply pinned memory
+    // next time)
+    auto* data_alloc_released = data_alloc.release();
+    auto* cols_alloc_released = cols_alloc.release();
+    context.AddStreamCallback([data_alloc_released, cols_alloc_released] {
+      memory::allocation::AllocationDeleter deleter;
+      deleter(data_alloc_released);
+      deleter(cols_alloc_released);
+    });
+#endif
   }
 };
```