BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit a79d4a75 (unverified)
Authored by Wilber on Aug 17, 2022, committed via GitHub on Aug 17, 2022.
fix multi stream error. (#45196)
* fix multi stream error.
Parent: f59c666c
6 changed files with 41 additions and 20 deletions (+41 -20):
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc (+4 -2)
paddle/fluid/operators/fc_op.h (+4 -2)
paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu (+11 -5)
paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu (+8 -4)
paddle/fluid/operators/fused/multihead_matmul_op.cu (+11 -6)
paddle/fluid/operators/matmul_op.cc (+3 -1)
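
All six files apply the same migration: device memory that operators used to request directly from the tensor, via mutable_data<T>(place), is now requested from the operator's device context, via dev_ctx.template Alloc<T>(tensor, bytes). Routing allocation through the context lets the stream-safe CUDA allocator record which stream owns each buffer, which is presumably what fixes the multi-stream error. A framework-free mock of the before/after call site (Tensor, DeviceContext, and Alloc below are stand-ins, not the real Paddle API):

// Framework-free mock, not the real Paddle API: Tensor, DeviceContext and
// Alloc are minimal stand-ins that only demonstrate the call-site change.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Tensor {
  std::vector<char> buf;
  std::size_t n = 0;
  std::size_t numel() const { return n; }
};

struct DeviceContext {
  int stream = 7;  // stand-in for the CUDA stream this op computes on
  // New style: the context performs the allocation, so a stream-safe
  // allocator behind it can tag the memory with `stream`.
  template <typename T>
  T* Alloc(Tensor* t, std::size_t bytes) {
    t->buf.resize(bytes);
    std::printf("alloc %zu bytes for stream %d\n", bytes, stream);
    return reinterpret_cast<T*>(t->buf.data());
  }
};

int main() {
  DeviceContext dev_ctx;
  Tensor out;
  out.n = 6;
  // Before: T* out_data = out->mutable_data<T>(ctx.GetPlace());
  // After (the pattern used throughout this commit):
  float* out_data = dev_ctx.Alloc<float>(&out, out.numel() * sizeof(float));
  (void)out_data;
  return 0;
}

In the real diffs the byte count is always computed as tensor->numel() * sizeof(T), and the context is fetched once per kernel with ctx.template device_context<...>().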
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+#include <thread>
 #include "paddle/fluid/platform/profiler/event_tracing.h"

@@ -194,8 +195,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
       default_stream_,
       this);
-  VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
-          << allocation->ptr() << " , stream: " << default_stream_;
+  VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate "
+          << allocation->size() << " bytes at address "
+          << allocation->ptr() << " , stream: " << default_stream_;
   return allocation;
 }
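
This file only gains diagnostics: <thread> is included so the VLOG(8) message can prefix each allocation with the calling thread's id, letting interleaved logs from concurrent threads (and hence streams) be told apart. A standalone illustration of the idiom (not Paddle code):

// Standalone illustration (not Paddle code) of the new log line's idiom:
// std::this_thread::get_id() streams directly into the log output.
#include <cstddef>
#include <iostream>
#include <thread>

void LogAllocate(const void* ptr, std::size_t size, int stream) {
  std::cout << "Thread " << std::this_thread::get_id() << " Allocate " << size
            << " bytes at address " << ptr << " , stream: " << stream << '\n';
}

int main() {
  char buf[64];
  std::thread t1(LogAllocate, buf, sizeof(buf), 0);
  std::thread t2(LogAllocate, buf, sizeof(buf), 1);
  t1.join();
  t2.join();
  return 0;
}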
paddle/fluid/operators/fc_op.h
@@ -69,6 +69,8 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto w_dims = w->dims();
     bool padding_weights = ctx.Attr<bool>("padding_weights");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
     std::vector<int64_t> output_dims;
     FCOutputSize(
         input->dims(), w_dims, output_dims, in_num_col_dims, padding_weights);

@@ -82,9 +84,9 @@ class FCOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto* output_data =
+        dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
     phi::funcs::FCFunctor<DeviceContext, T> fc;
     fc(dev_ctx,
        M,
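
A side note on syntax this diff relies on: inside the kernel template, dev_ctx has a dependent type, so its member templates must be called as dev_ctx.template Alloc<T>(...) (and likewise ctx.template device_context<DeviceContext>()); without the template keyword the < would parse as less-than. A self-contained sketch:

// Self-contained sketch of the `template` disambiguator used above: inside a
// function template, Ctx is a dependent type, so calling its member template
// requires `ctx.template Alloc<T>(...)`.
#include <cstddef>
#include <cstdio>

struct Context {
  template <typename T>
  T* Alloc(std::size_t n) { return new T[n]; }
};

template <typename Ctx, typename T>
T* MakeBuffer(Ctx& ctx, std::size_t n) {
  return ctx.template Alloc<T>(n);  // `template` keyword is mandatory here
}

int main() {
  Context ctx;
  float* p = MakeBuffer<Context, float>(ctx, 4);
  std::printf("%p\n", static_cast<void*>(p));
  delete[] p;
  return 0;
}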
paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
@@ -15,6 +15,7 @@
 #include <paddle/fluid/platform/device_context.h>
 #include <algorithm>
+#include <cstdint>
 #include <type_traits>
 #include "paddle/fluid/framework/convert_utils.h"

@@ -49,12 +50,16 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
 #else
     cudaGetDevice(&device_id);
 #endif
+    auto& dev_ctx = context.template device_context<phi::GPUContext>();
+
     in_ids_.Resize(in_dim);
     in_embs_.Resize(in_dim);
-    int64_t* in_ids_d =
-        in_ids_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-    int64_t* in_embs_d =
-        in_embs_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
+    int64_t* in_ids_d = dev_ctx.template Alloc<int64_t>(
+        &in_ids_, in_ids_.numel() * sizeof(int64_t));
+    int64_t* in_embs_d = dev_ctx.template Alloc<int64_t>(
+        &in_embs_, in_embs_.numel() * sizeof(int64_t));
     std::vector<int64_t> in1s, in2s;
     for (int i = 0; i < input_num; ++i) {

@@ -99,7 +104,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
     auto* bias_d = bias->data<T>();
     auto* scale_d = scale->data<T>();
-    auto* output_d = out->mutable_data<T>(context.GetPlace());
+    auto* output_d =
+        dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     float eps = context.Attr<float>("epsilon");
     if (std::is_same<T, paddle::platform::float16>::value) {
paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
@@ -395,9 +395,10 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     const T* x_data = x->data<T>();
     const T* w_data = w->data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
     auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
+    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
     blas.GEMM(false,
               false,

@@ -425,9 +426,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     auto* mean = ctx.Output<framework::Tensor>("Mean");
     auto* variance = ctx.Output<framework::Tensor>("Variance");
-    T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* variance_data =
-        variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* mean_data =
+        mean ? dev_ctx.template Alloc<T>(mean, mean->numel() * sizeof(T))
+             : nullptr;
+    T* variance_data =
+        variance ? dev_ctx.template Alloc<T>(variance,
+                                             variance->numel() * sizeof(T))
+                 : nullptr;
     bool with_relu =
         (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
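
"Mean" and "Variance" are optional outputs, so the allocation stays behind a ternary and the data pointer remains nullptr when the output is absent. A hypothetical helper distilling that guarded pattern (not part of the commit; MockTensor and MockContext are stand-ins so the sketch is self-contained):

// Hypothetical helper, not part of the commit: allocate an optional output
// through the device context, or return nullptr when it was not requested.
#include <cstddef>
#include <vector>

template <typename T, typename Context, typename Tensor>
T* AllocIfPresent(Context& dev_ctx, Tensor* t) {
  return t ? dev_ctx.template Alloc<T>(t, t->numel() * sizeof(T)) : nullptr;
}

// Minimal stand-ins so the sketch compiles on its own.
struct MockTensor {
  std::vector<char> buf;
  std::size_t n = 0;
  std::size_t numel() const { return n; }
};

struct MockContext {
  template <typename T>
  T* Alloc(MockTensor* t, std::size_t bytes) {
    t->buf.resize(bytes);
    return reinterpret_cast<T*>(t->buf.data());
  }
};

int main() {
  MockContext dev_ctx;
  MockTensor mean;                  // "Mean" requested
  mean.n = 8;
  MockTensor* variance = nullptr;   // "Variance" absent
  float* mean_data = AllocIfPresent<float>(dev_ctx, &mean);
  float* variance_data = AllocIfPresent<float>(dev_ctx, variance);
  return (mean_data != nullptr && variance_data == nullptr) ? 0 : 1;
}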
paddle/fluid/operators/fused/multihead_matmul_op.cu
@@ -287,7 +287,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted
     if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
       temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
-      auto* temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto* temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
       int grid = batch * head_number * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(

@@ -297,7 +298,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     if (!bias_qk) {
       int size = batch * head_number * seq_len * seq_len;
       temp_bias_tensor.Resize({size});
-      auto* temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto* temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
 #ifdef PADDLE_WITH_HIP
       hipMemset(temp_qk_bias, 0, sizeof(float) * size);
 #else

@@ -310,7 +312,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     auto* out = context.Output<framework::Tensor>("Out");
     out->Resize({batch, seq_len, all_head_size});
-    auto* output_d = out->mutable_data<T>(context.GetPlace());
+    auto* output_d =
+        device_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     // (B*S, hidden)
     const Tensor input_matrix =

@@ -324,7 +327,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
         phi::make_ddim({batch, seq_len, 3, head_number, head_size});
     temp_out_tensor.Resize(
         {batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)});
-    auto* temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
+    auto* temp_out_data = device_ctx.template Alloc<T>(
+        &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
     // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(device_ctx);

@@ -336,8 +340,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // B * head_number * S * S * 1 + B * S * 3 * N * H
     int scratch_size = batch * head_number * seq_len * seq_len * 1;
     multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
     auto* multihead_temp_data =
-        multihead_temp_tensor.mutable_data<T>(context.GetPlace());
+        device_ctx.template Alloc<T>(
+            &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
     auto* qkptr = multihead_temp_data;
     auto* tptr = multihead_temp_data + scratch_size;
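
Note that the broadcast launch keeps its explicit stream argument, broadcast<<<grid, block, 0, stream>>>(...); with allocation now routed through device_ctx, both the memory and the kernels of this op stay on the same stream. A minimal standalone CUDA sketch of launching on an explicit stream (illustrative kernel, standard CUDA runtime calls, not Paddle's broadcast):

// Standalone CUDA sketch: the fourth <<<>>> argument selects the stream,
// which is the launch form the diff keeps for `broadcast`.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fill(float* p, float v, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i] = v;
}

int main() {
  const int n = 256;
  float* d = nullptr;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMalloc(&d, n * sizeof(float));
  // grid/block sizing mirrors `broadcast<<<grid, block, 0, stream>>>`
  fill<<<(n + 127) / 128, 128, 0, stream>>>(d, 1.0f, n);
  cudaStreamSynchronize(stream);
  cudaFree(d);
  cudaStreamDestroy(stream);
  std::printf("done\n");
  return 0;
}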
paddle/fluid/operators/matmul_op.cc
@@ -65,7 +65,9 @@ class MatMulKernel : public framework::OpKernel<T> {
     auto& y = GET_DATA_SAFELY(
         context.Input<framework::Tensor>("Y"), "Input", "Y", "MatMul");
     auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
     auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
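
As everywhere else in this commit, Alloc takes a size in bytes, so the call site computes out->numel() * sizeof(T). A trivial worked example of that arithmetic:

// Worked example: a float tensor of shape {2, 3} has numel() == 6, so the
// byte count handed to Alloc is 6 * sizeof(float), i.e. 24 bytes on typical
// platforms where sizeof(float) == 4.
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t numel = 2 * 3;
  const std::size_t bytes = numel * sizeof(float);
  std::printf("%zu bytes\n", bytes);
  return 0;
}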