Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
078e8c78
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
078e8c78
编写于
10月 09, 2022
作者:
H
Haohongxiang
提交者:
GitHub
10月 09, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Dygraph] Fix Perf of FusedFeedForward and FusedAttention with AllReduce (#46780)
上级
2e217dbb
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
8 addition
and
4 deletion
+8
-4
paddle/fluid/operators/fused/fused_attention_op.cu
paddle/fluid/operators/fused/fused_attention_op.cu
+4
-2
paddle/fluid/operators/fused/fused_feedforward_op.cu
paddle/fluid/operators/fused/fused_feedforward_op.cu
+4
-2
未找到文件。
paddle/fluid/operators/fused/fused_attention_op.cu
浏览文件 @
078e8c78
...
@@ -30,7 +30,7 @@ limitations under the License. */
...
@@ -30,7 +30,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroup
NCCL
.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#endif
...
@@ -50,13 +50,15 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT
...
@@ -50,13 +50,15 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT
if
(
map
->
has
(
ring_id
))
{
if
(
map
->
has
(
ring_id
))
{
paddle
::
distributed
::
ProcessGroup
*
pg
=
map
->
get
(
ring_id
);
paddle
::
distributed
::
ProcessGroup
*
pg
=
map
->
get
(
ring_id
);
auto
pg_nccl
=
static_cast
<
distributed
::
ProcessGroupNCCL
*>
(
pg
);
std
::
vector
<
phi
::
DenseTensor
>
in_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
in_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
out_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
out_tensor
;
in_tensor
.
push_back
(
tensor
);
in_tensor
.
push_back
(
tensor
);
out_tensor
.
push_back
(
tensor
);
out_tensor
.
push_back
(
tensor
);
paddle
::
distributed
::
AllreduceOptions
opts
;
paddle
::
distributed
::
AllreduceOptions
opts
;
opts
.
reduce_op
=
distributed
::
ReduceOp
::
SUM
;
opts
.
reduce_op
=
distributed
::
ReduceOp
::
SUM
;
auto
task
=
pg
->
AllReduce
(
in_tensor
,
out_tensor
,
opts
);
auto
task
=
pg
_nccl
->
AllReduce
(
in_tensor
,
out_tensor
,
opts
,
true
,
true
);
task
->
Wait
();
task
->
Wait
();
}
else
{
}
else
{
auto
dtype
=
platform
::
ToNCCLDataType
(
auto
dtype
=
platform
::
ToNCCLDataType
(
...
...
paddle/fluid/operators/fused/fused_feedforward_op.cu
浏览文件 @
078e8c78
...
@@ -23,7 +23,7 @@ limitations under the License. */
...
@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroup
NCCL
.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#endif
...
@@ -43,13 +43,15 @@ static void AllReduce(phi::DenseTensor& tensor, // NOLINT
...
@@ -43,13 +43,15 @@ static void AllReduce(phi::DenseTensor& tensor, // NOLINT
if
(
map
->
has
(
ring_id
))
{
if
(
map
->
has
(
ring_id
))
{
paddle
::
distributed
::
ProcessGroup
*
pg
=
map
->
get
(
ring_id
);
paddle
::
distributed
::
ProcessGroup
*
pg
=
map
->
get
(
ring_id
);
auto
pg_nccl
=
static_cast
<
distributed
::
ProcessGroupNCCL
*>
(
pg
);
std
::
vector
<
phi
::
DenseTensor
>
in_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
in_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
out_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
out_tensor
;
in_tensor
.
push_back
(
tensor
);
in_tensor
.
push_back
(
tensor
);
out_tensor
.
push_back
(
tensor
);
out_tensor
.
push_back
(
tensor
);
paddle
::
distributed
::
AllreduceOptions
opts
;
paddle
::
distributed
::
AllreduceOptions
opts
;
opts
.
reduce_op
=
distributed
::
ReduceOp
::
SUM
;
opts
.
reduce_op
=
distributed
::
ReduceOp
::
SUM
;
auto
task
=
pg
->
AllReduce
(
in_tensor
,
out_tensor
,
opts
);
auto
task
=
pg
_nccl
->
AllReduce
(
in_tensor
,
out_tensor
,
opts
,
true
,
true
);
task
->
Wait
();
task
->
Wait
();
}
else
{
}
else
{
auto
dtype
=
platform
::
ToNCCLDataType
(
auto
dtype
=
platform
::
ToNCCLDataType
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录