Commit 7b450e78 (unverified)
Authored Mar 18, 2021 by Void Main; committed by GitHub on Mar 18, 2021

Add auto-increasing tag id for Hcom OPs (#31702)

Parent: 50bc1162
Showing 14 changed files with 53 additions and 27 deletions (+53 −27)
paddle/fluid/operators/collective/c_allgather_op_npu.cc            +1  -1
paddle/fluid/operators/collective/c_allgather_op_npu_test.cc       +3  -1
paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc   +3  -1
paddle/fluid/operators/collective/c_allreduce_op.h                 +11 -7
paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc   +3  -1
paddle/fluid/operators/collective/c_broadcast_op_npu.cc            +1  -1
paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc       +3  -1
paddle/fluid/operators/collective/c_reducescatter_op_npu.cc        +4  -4
paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc   +3  -1
paddle/fluid/operators/collective/recv_v2_op_npu.cc                +2  -2
paddle/fluid/operators/collective/recv_v2_op_npu_test.cc           +3  -1
paddle/fluid/operators/collective/send_v2_op_npu.cc                +3  -3
paddle/fluid/operators/collective/send_v2_op_npu_test.cc           +3  -1
paddle/fluid/platform/collective_helper.h                          +10 -2
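Every kernel change below follows the same pattern: the fixed `tag` attribute read via `ctx.Attr<std::string>("tag")` is replaced by a tag built from the ring id plus a per-communicator counter, `comm->NextTagId()`, so each launch of an Hcom op gets a distinct tag. A minimal self-contained sketch of that scheme (the class name `HcclTagSource`, the helper `MakeHcomTag`, and the zero-initialization of the counter are assumptions for the example; the diff itself only declares the member on `HCCLComm`):

#include <atomic>
#include <string>

// Sketch of the auto-increasing tag added to HCCLComm: every call returns a
// fresh id, so repeated runs of the same collective no longer share one tag.
class HcclTagSource {
 public:
  unsigned long NextTagId() { return tag_counter_++; }  // atomic post-increment

 private:
  std::atomic<unsigned long> tag_counter_{0};  // initializer assumed for the sketch
};

// How the kernels below build the tag string: "<ring_id>_<counter>".
std::string MakeHcomTag(int ring_id, HcclTagSource* comm) {
  return std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
}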
paddle/fluid/operators/collective/c_allgather_op_npu.cc
@@ -35,10 +35,10 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
     int ring_id = ctx.Attr<int>("ring_id");
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    std::string tag = ctx.Attr<std::string>("tag");
     auto place = ctx.GetPlace();
     auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
     int nranks = comm->nranks();
+    std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

     framework::DDim out_dims = in->dims();
     out_dims[0] *= nranks;
paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -119,7 +119,9 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
   auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}},
                                     {{"Out", {"Out"}}}, attrs);
-  op->Run(*scope, place);
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
   ctx.Wait();
   std::vector<float> out_vec;
paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -118,7 +118,9 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
   auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}},
                                     {{"Out", {"Out"}}}, attrs);
-  op->Run(*scope, place);
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
   ctx.Wait();
   std::vector<float> out_vec;
paddle/fluid/operators/collective/c_allreduce_op.h
@@ -135,16 +135,16 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
     paddle::framework::LoDTensor tmp_in, tmp_out;
     tmp_in.Resize({tmp_numel});
     tmp_out.Resize({tmp_numel});
-    tmp_in.mutable_data<T>(place);   // allocate
-    tmp_out.mutable_data<T>(place);  // allocate
+    auto p_tmp_in = tmp_in.mutable_data<T>(place);    // allocate
+    auto p_tmp_out = tmp_out.mutable_data<T>(place);  // allocate

     void* sendbuff = reinterpret_cast<void*>(tmp_in.data<T>() + pre_tmp_size);
     void* recvbuff = reinterpret_cast<void*>(tmp_out.data<T>() + pre_tmp_size);

-    std::string tag = ctx.Attr<std::string>("tag");
     int ring_id = ctx.Attr<int>("ring_id");
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
     auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
+    std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

     aclrtStream stream = nullptr;
     auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
@@ -154,6 +154,10 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
       stream = comm->stream();
     }

+    // we need to memset this memory firstly to avoid core by hccl
+    platform::NPUMemsetAsync(static_cast<void*>(p_tmp_in), 0, tmp_numel * sizeof(T), stream);
+    platform::NPUMemsetAsync(static_cast<void*>(p_tmp_out), 0, tmp_numel * sizeof(T), stream);
+
     auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);

     memory::Copy(npu_place, sendbuff,
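The two `NPUMemsetAsync` calls zero the freshly allocated temporaries before HCCL touches them: `tmp_numel` is a padded element count, so without the memset the padding region would hold uninitialized device memory, which the diff's own comment says makes HCCL "core". A host-side analogue of the buffer layout, with hypothetical names (this is not Paddle API, just a sketch to make `pre_tmp_size` concrete):

#include <cstring>
#include <memory>

// Hypothetical host-side analogue of the padded-buffer pattern above:
// allocate tmp_numel elements, zero all of them (the NPUMemsetAsync step),
// then point the payload at the region pre_tmp_size elements in, mirroring
// how sendbuff/recvbuff are derived from tmp_in/tmp_out in the diff.
template <typename T>
struct PaddedBuffer {
  std::unique_ptr<T[]> storage;
  T* payload;

  PaddedBuffer(std::size_t tmp_numel, std::size_t pre_tmp_size)
      : storage(new T[tmp_numel]) {
    std::memset(storage.get(), 0, tmp_numel * sizeof(T));  // zero padding too
    payload = storage.get() + pre_tmp_size;                // skip front padding
  }
};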
paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -117,7 +117,9 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
                                     {{"Out", {"Out"}}}, attrs);
-  op->Run(*scope, place);
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
   ctx.Wait();
   std::vector<float> out_vec;
paddle/fluid/operators/collective/c_broadcast_op_npu.cc
@@ -48,7 +48,7 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
     int root = ctx.Attr<int>("root");
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    std::string tag = ctx.Attr<std::string>("tag");
+    std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
     VLOG(3) << "begin hccl broadcast, parameter is: "
             << "root " << root << ", group is " << group
paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -113,7 +113,9 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
   auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
                                     {{"Out", {"Out"}}}, attrs);
-  op->Run(*scope, place);
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
   ctx.Wait();
   std::vector<float> out_vec;
paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
@@ -32,10 +32,10 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
     int ring_id = ctx.Attr<int>("ring_id");
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    std::string tag = ctx.Attr<std::string>("tag");
     auto place = ctx.GetPlace();
     auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
     int nranks = comm->nranks();
+    std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

     auto out_dims = in->dims();
     PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0,
paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -119,7 +119,9 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
   auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}},
                                     {{"Out", {"Out"}}}, attrs);
-  op->Run(*scope, place);
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
   ctx.Wait();
   std::vector<float> out_vec;
paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -42,7 +42,7 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
     } else {
       stream = comm->stream();
     }
-    std::string tag = ctx.Attr<std::string>("tag");
+    std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
     int srcRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -99,7 +99,9 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){
   auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs);
   VLOG(3) << "CreateOp recv_v2";
-  op->Run(*scope, place);
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
   VLOG(3) << "Run op recv_v2";
   std::vector<float> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -42,7 +42,7 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
     } else {
       stream = comm->stream();
     }
-    std::string tag = ctx.Attr<std::string>("tag");
+    std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
     int destRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -90,7 +90,9 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){
   auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs);
-  op->Run(*scope, place);
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
   VLOG(3) << "send run over";
   ctx.Wait();
 }
paddle/fluid/platform/collective_helper.h
@@ -18,6 +18,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include <atomic>

 #include "boost/variant.hpp"
 #include "paddle/fluid/platform/enforce.h"
@@ -148,8 +149,7 @@ class NCCLCommContext {
 class NPUDeviceContext;

 #define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE"
-#define ENV_RANK_ID "RANK_ID"
-#define ENV_DEV_ID "DEV_ID"
+#define ENV_RANK_ID "PADDLE_TRAINER_ID"

 class HCCLComm {
  public:
@@ -160,6 +160,12 @@ class HCCLComm {
   virtual aclrtStream stream() const = 0;
   virtual NPUDeviceContext* dev_context() const = 0;
   virtual ~HCCLComm() = default;
+
+  unsigned long NextTagId() { return tag_counter_++; }
+
+ private:
+  std::atomic<unsigned long> tag_counter_;
 };

 // A singleton HCCL communicator context reserves communication ring ids
@@ -208,10 +214,12 @@ class HCCLCommContext {
     return Get(ring_id, BOOST_GET_CONST(NPUPlace, place).device);
   }

+ private:
   // Init global hcom
   HCCLCommContext() { InitHcomWorldGroup(); }

+ public:
   ~HCCLCommContext() {
     PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_destroy());
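Since `tag_counter_` is a `std::atomic<unsigned long>`, the post-increment in `NextTagId()` is safe under concurrent op launches on the same communicator and every call yields a distinct id. A small hypothetical standalone check (not from the repo) of the property the 10-iteration test loops above exercise:

#include <atomic>
#include <cassert>
#include <string>

// Ten "runs" against one communicator must produce ten distinct tags,
// e.g. "0_0", "0_1", ..., "0_9" for ring_id 0.
int main() {
  std::atomic<unsigned long> tag_counter{0};
  int ring_id = 0;
  std::string prev;
  for (int i = 0; i < 10; i++) {
    std::string tag = std::to_string(ring_id) + "_" + std::to_string(tag_counter++);
    assert(tag != prev);  // every launch gets a fresh tag
    prev = tag;
  }
  return 0;
}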