Commit 34f30f79 (unverified)
Authored by lilong12 on Apr 18, 2022; committed via GitHub on Apr 18, 2022
Parent: f3531c7b

fix bug for eager mode distributed training (#41841)

Showing 13 changed files with 74 additions and 56 deletions (+74 -56)
paddle/fluid/distributed/collective/ProcessGroup.cc        +3  -2
paddle/fluid/distributed/collective/ProcessGroup.h         +3  -1
paddle/fluid/distributed/collective/ProcessGroupGloo.cc    +3  -2
paddle/fluid/distributed/collective/ProcessGroupGloo.h     +2  -1
paddle/fluid/distributed/collective/ProcessGroupHCCL.cc    +6  -2
paddle/fluid/distributed/collective/ProcessGroupHCCL.h     +1  -1
paddle/fluid/distributed/collective/ProcessGroupHeter.cc   +9  -11
paddle/fluid/distributed/collective/ProcessGroupHeter.h    +3  -3
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc    +6  -2
paddle/fluid/distributed/collective/ProcessGroupNCCL.h     +1  -1
paddle/fluid/pybind/distributed_py.cc                      +22 -26
python/paddle/distributed/collective.py                    +13 -3
python/paddle/fluid/tests/unittests/process_group_gloo.py  +2  -1
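Taken together, the diffs below thread an explicit device place through every
ProcessGroup constructor, so each backend binds its device at group construction
(platform::SetDeviceId for NCCL, platform::SetNPUDeviceId for HCCL) instead of
relying on ambient device state, which is what broke eager-mode distributed
training. A minimal before/after sketch of a call site, assuming a CUDA build
with this patch applied (illustrative values only):

    import datetime
    from paddle.fluid import core

    # TCPStore arguments follow the unittest below: host, port, is_master,
    # world size, timeout.
    store = core.TCPStore("127.0.0.1", 6272, True, 1, datetime.timedelta(0))

    # Before this commit the device was implicit:
    #   pg = core.ProcessGroupNCCL(store, 0, 1, 0)
    # After it, the place is an explicit constructor argument:
    place = core.CUDAPlace(0)
    pg = core.ProcessGroupNCCL(store, 0, 1, place, 0)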
paddle/fluid/distributed/collective/ProcessGroup.cc
@@ -35,8 +35,9 @@ bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) {
 void ProcessGroup::Task::Synchronize() {}
 
-ProcessGroup::ProcessGroup(int rank, int size, int gid)
-    : rank_(rank), size_(size), gid_(gid) {
+ProcessGroup::ProcessGroup(int rank, int size, const platform::Place& place,
+                           int gid)
+    : rank_(rank), size_(size), place_(place), gid_(gid) {
   if (gid != IGNORE_ID) {
     auto map = ProcessGroupMapFromGid::getInstance();
     map->insert(gid_, this);
paddle/fluid/distributed/collective/ProcessGroup.h
@@ -69,7 +69,8 @@ class ProcessGroup {
     bool is_completed_ = false;
   };
 
-  explicit ProcessGroup(int rank, int size, int gid);
+  explicit ProcessGroup(int rank, int size, const platform::Place& place,
+                        int gid);
   virtual ~ProcessGroup() {}
 
   int GetRank() const { return rank_; }
@@ -145,6 +146,7 @@ class ProcessGroup {
  protected:
   const int rank_;
   const int size_;
+  const platform::Place place_;
   const int gid_;
 };
paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -165,8 +165,9 @@ ProcessGroupGloo::GlooTask::GlooTask(
 ProcessGroupGloo::ProcessGroupGloo(
     const std::shared_ptr<distributed::Store>& store, int rank, int world_size,
-    int gid, const std::shared_ptr<GlooOptions> options)
-    : ProcessGroup(rank, world_size, gid),
+    const platform::Place& place, int gid,
+    const std::shared_ptr<GlooOptions> options)
+    : ProcessGroup(rank, world_size, place, gid),
       _tag(0),
       _store(new GlooStore(store)) {
   _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
paddle/fluid/distributed/collective/ProcessGroupGloo.h
@@ -102,7 +102,8 @@ class ProcessGroupGloo : public ProcessGroup {
   explicit ProcessGroupGloo(
       const std::shared_ptr<paddle::distributed::Store>& store, int rank,
-      int world_size, int gid, std::shared_ptr<GlooOptions> options);
+      int world_size, const platform::Place& place, int gid,
+      std::shared_ptr<GlooOptions> options);
 
   ~ProcessGroupGloo() = default;
paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/distributed/collective/HCCLTools.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/api/include/api.h"
@@ -97,8 +98,11 @@ bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
 void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
 
 ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
-                                   int rank, int size, int gid)
-    : ProcessGroup(rank, size, gid), store_(store) {}
+                                   int rank, int size,
+                                   const platform::Place& place, int gid)
+    : ProcessGroup(rank, size, place, gid), store_(store) {
+  platform::SetNPUDeviceId(place_.device);
+}
 
 void ProcessGroupHCCL::BroadcastUniqueHCCLID(
     std::vector<HcclRootInfo>& hccl_ids) {  // NOLINT
paddle/fluid/distributed/collective/ProcessGroupHCCL.h
@@ -71,7 +71,7 @@ class ProcessGroupHCCL : public ProcessGroup {
   };
 
   ProcessGroupHCCL(const std::shared_ptr<Store>& store, int rank, int size,
-                   int gid);
+                   const platform::Place& place, int gid);
 
   const std::string GetBackendName() const override {
     return std::string(HCCL_BACKEND_NAME);
paddle/fluid/distributed/collective/ProcessGroupHeter.cc
@@ -44,13 +44,11 @@ bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) {
   return true;
 }
 
-ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr<Store>& store,
-                                     int rank, int size, int gid,
-                                     int local_rank, int local_size,
-                                     int gloo_rank, int gloo_size,
-                                     bool with_switch,
-                                     std::string switch_endpoint)
-    : ProcessGroup(rank, size, gid),
+ProcessGroupHeter::ProcessGroupHeter(
+    const std::shared_ptr<Store>& store, int rank, int size,
+    const platform::Place& place, int gid, int local_rank, int local_size,
+    int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint)
+    : ProcessGroup(rank, size, place, gid),
       store_(store),
       local_rank_(local_rank),
       local_size_(local_size),
@@ -60,10 +58,10 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr<Store>& store,
       switch_endpoint_(switch_endpoint) {
 #if defined(PADDLE_WITH_NCCL)
   inner_pg_ = std::make_shared<ProcessGroupNCCL>(store, local_rank, local_size,
-                                                 IGNORE_ID);
+                                                 place_, IGNORE_ID);
 #elif defined(PADDLE_WITH_ASCEND_CL)
   inner_pg_ = std::make_shared<ProcessGroupHCCL>(store, local_rank, local_size,
-                                                 IGNORE_ID);
+                                                 place_, IGNORE_ID);
 #else
   PADDLE_THROW(platform::errors::Fatal(
       "ProcessGroupHeter only supports NCCL and HCCL now."));
@@ -71,8 +69,8 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr<Store>& store,
   if (local_rank_ == 0 && !with_switch_) {
     auto opts = ProcessGroupGloo::GlooOptions::create();
     opts->device = ProcessGroupGloo::createDefaultDevice();
-    inter_pg_ = std::make_shared<ProcessGroupGloo>(store, gloo_rank_,
-                                                   gloo_size_, IGNORE_ID, opts);
+    inter_pg_ = std::make_shared<ProcessGroupGloo>(
+        store, gloo_rank_, gloo_size_, place_, IGNORE_ID, opts);
   }
 }
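Worth noting in the hunks above: the heterogeneous group simply forwards the
place it receives to both groups it composes, so all three share one device
binding. A rough picture of the resulting hierarchy (pseudocode, not a real
API):

    # ProcessGroupHeter(store, rank, size, place, gid, ...)
    #   |- inner_pg_: ProcessGroupNCCL(store, local_rank, local_size, place_, IGNORE_ID)
    #   |             (ProcessGroupHCCL instead on Ascend builds)
    #   `- inter_pg_: ProcessGroupGloo(store, gloo_rank_, gloo_size_, place_, IGNORE_ID, opts)
    #                 (created only on local_rank 0 when no switch is used)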
paddle/fluid/distributed/collective/ProcessGroupHeter.h
@@ -81,9 +81,9 @@ class ProcessGroupHeter : public ProcessGroup {
   };
 
   ProcessGroupHeter(const std::shared_ptr<Store>& store, int rank, int size,
-                    int gid, int local_rank, int local_size, int gloo_rank,
-                    int gloo_size, bool with_switch,
-                    std::string switch_endpoints);
+                    const platform::Place& place, int gid, int local_rank,
+                    int local_size, int gloo_rank, int gloo_size,
+                    bool with_switch, std::string switch_endpoints);
 
   const std::string GetBackendName() const override {
     return std::string(HETER_BACKEND_NAME);
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #include "paddle/fluid/distributed/collective/Common.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/api/include/api.h"
@@ -103,8 +104,11 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
 void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); }
 
 ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr<Store>& store,
-                                   int rank, int size, int gid)
-    : ProcessGroup(rank, size, gid), store_(store) {}
+                                   int rank, int size,
+                                   const platform::Place& place, int gid)
+    : ProcessGroup(rank, size, place, gid), store_(store) {
+  platform::SetDeviceId(place_.device);
+}
 
 void ProcessGroupNCCL::BroadcastUniqueNCCLID(
     std::vector<ncclUniqueId>& nccl_ids) {  // NOLINT
paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -77,7 +77,7 @@ class ProcessGroupNCCL : public ProcessGroup {
   };
 
   ProcessGroupNCCL(const std::shared_ptr<Store>& store, int rank, int size,
-                   int gid);
+                   const platform::Place& place, int gid);
 
   const std::string GetBackendName() const override {
     return std::string(NCCL_BACKEND_NAME);
paddle/fluid/pybind/distributed_py.cc
@@ -241,49 +241,42 @@ void BindDistributed(py::module *m) {
              std::shared_ptr<distributed::ProcessGroupNCCL>>(
       *m, "ProcessGroupNCCL", ProcessGroup)
       .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
-                    int>(),
+                    const platform::CUDAPlace &, int>(),
            py::arg("store"), py::arg("rank"), py::arg("world_size"),
-           py::arg("group_id") = 0,
+           py::arg("place"), py::arg("group_id") = 0,
            py::call_guard<py::gil_scoped_release>());
+#endif
 
 #if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \
     (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL))
   py::class_<distributed::ProcessGroupHeter,
              std::shared_ptr<distributed::ProcessGroupHeter>>(
       *m, "ProcessGroupHeter", ProcessGroup)
-      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int, int,
-                    int, int, int, int, bool, std::string>(),
+      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
+#if defined(PADDLE_WITH_ASCEND_CL)
+                    const platform::NPUPlace &,
+#else
+                    const platform::CUDAPlace &,
+#endif
+                    int, int, int, int, int, bool, std::string>(),
            py::arg("store"), py::arg("rank"), py::arg("world_size"),
-           py::arg("gid") = 0, py::arg("local_rank") = 0,
+           py::arg("place"), py::arg("gid") = 0, py::arg("local_rank") = 0,
            py::arg("local_size") = 1, py::arg("gloo_rank") = 0,
            py::arg("gloo_size") = 1, py::arg("with_switch") = false,
            py::arg("switch_endpoint") = "",
            py::call_guard<py::gil_scoped_release>());
 #endif
-#endif
 
 #if defined(PADDLE_WITH_ASCEND_CL)
   py::class_<distributed::ProcessGroupHCCL,
              std::shared_ptr<distributed::ProcessGroupHCCL>>(
       *m, "ProcessGroupHCCL", ProcessGroup)
       .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
-                    int>(),
+                    const platform::NPUPlace &, int>(),
            py::arg("store"), py::arg("rank"), py::arg("world_size"),
-           py::arg("group_id") = 0,
-           py::call_guard<py::gil_scoped_release>());
-
-#if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \
-    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL))
-  py::class_<distributed::ProcessGroupHeter,
-             std::shared_ptr<distributed::ProcessGroupHeter>>(
-      *m, "ProcessGroupHeter", ProcessGroup)
-      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int, int,
-                    int, int, int, int, bool, std::string>(),
-           py::arg("store"), py::arg("rank"), py::arg("world_size"),
-           py::arg("gid") = 0, py::arg("local_rank") = 0,
-           py::arg("local_size") = 1, py::arg("gloo_rank") = 0,
-           py::arg("gloo_rank") = 1, py::arg("with_switch") = false,
-           py::arg("switch_endpoint") = "",
-           py::call_guard<py::gil_scoped_release>());
-#endif
+           py::arg("place"), py::arg("group_id") = 0,
+           py::call_guard<py::gil_scoped_release>());
 #endif
 
   py::class_<distributed::ProcessGroup::Task,
@@ -299,10 +292,12 @@ void BindDistributed(py::module *m) {
   py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
       *m, "ProcessGroupGloo", ProcessGroup)
       .def(py::init<const std::shared_ptr<paddle::distributed::Store> &, int,
-                    int, int, std::shared_ptr<GlooOptions> &>(),
+                    int, const platform::CPUPlace &, int,
+                    std::shared_ptr<GlooOptions> &>(),
            py::call_guard<py::gil_scoped_release>())
       .def(py::init([](const std::shared_ptr<paddle::distributed::Store> &store,
-                       int rank, int world_size, int gid) {
+                       int rank, int world_size,
+                       const platform::CPUPlace &place, int gid) {
              auto opts = GlooOptions::create();
              char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
              if (ifname && strlen(ifname) > 1) {
@@ -312,10 +307,11 @@ void BindDistributed(py::module *m) {
               opts->device = ProcessGroupGloo::createDefaultDevice();
             }
-            return std::make_shared<ProcessGroupGloo>(store, rank, world_size,
-                                                      gid, opts);
+            return std::make_shared<ProcessGroupGloo>(store, rank, world_size,
+                                                      place, gid, opts);
           }),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
-          py::arg("group_id") = 0,
-          py::call_guard<py::gil_scoped_release>())
+          py::arg("place"), py::arg("group_id") = 0,
+          py::call_guard<py::gil_scoped_release>())
       .def_static("create_default_device",
                   &ProcessGroupGloo::createDefaultDevice);
 #endif
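Because each binding declares py::arg("place") ahead of py::arg("group_id"),
the new parameter is also usable by keyword from Python. A hedged sketch using
the Gloo binding, which is available on any build with PADDLE_WITH_GLOO (the
other classes exist only under their respective compile guards):

    import datetime
    from paddle.fluid import core

    store = core.TCPStore("127.0.0.1", 6272, True, 1, datetime.timedelta(0))
    pg = core.ProcessGroupGloo(store, rank=0, world_size=1,
                               place=core.CPUPlace(), group_id=0)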
python/paddle/distributed/collective.py
@@ -228,14 +228,23 @@ def _new_process_group_impl(backend,
                             pg_options,
                             group_id=0):
     pg = None
+    genv = _get_global_env()
     assert backend in _valid_backend_list, "Unsupported backend: %s." % backend
     if backend == "gloo":
-        pg = core.ProcessGroupGloo(store, rank, world_size, group_id)
+        place = core.CPUPlace()
+        pg = core.ProcessGroupGloo(store, rank, world_size, place, group_id)
     elif backend == "nccl":
-        pg = core.ProcessGroupNCCL(store, rank, world_size, group_id)
+        place = core.CUDAPlace(genv.device_id)
+        pg = core.ProcessGroupNCCL(store, rank, world_size, place, group_id)
     elif backend == "hccl":
-        pg = core.ProcessGroupHCCL(store, rank, world_size, group_id)
+        place = core.NPUPlace(genv.device_id)
+        pg = core.ProcessGroupHCCL(store, rank, world_size, place, group_id)
     elif backend == "heter":
+        place = None
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(genv.device_id)
+        elif core.is_compiled_with_npu():
+            place = core.NPUPlace(genv.device_id)
         cluster_id = int(os.getenv("CLUSTER_ID", "-1"))
         assert cluster_id >= 0, "please set the CLUSTER_ID variable."
         cluster_size = os.getenv("CLUSTER_SIZE", None)
@@ -253,6 +262,7 @@ def _new_process_group_impl(backend,
             store,
             rank=global_rank,
             world_size=global_world_size,
+            place=place,
             gid=0,
             local_rank=rank,
             local_size=world_size,
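At the public-API level nothing changes for users: _new_process_group_impl now
derives the place from the global env (genv.device_id) on the caller's behalf.
A sketch of the eager-mode path this commit fixes, assuming a CUDA build and
two workers started with paddle.distributed.launch (illustrative only):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()   # internally reaches _new_process_group_impl,
                               # which now passes core.CUDAPlace(device_id)
    x = paddle.ones([2, 2])
    dist.all_reduce(x)         # collective runs on the explicitly bound device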
python/paddle/fluid/tests/unittests/process_group_gloo.py
@@ -47,7 +47,8 @@ class TestProcessGroupFp32(unittest.TestCase):
         is_master = True if rank == 0 else False
         store = paddle.fluid.core.TCPStore("127.0.0.1", 6272, is_master,
                                            nranks, datetime.timedelta(0))
-        pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks)
+        place = paddle.fluid.core.CPUPlace()
+        pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks, place)
 
         # test allreduce sum
         # rank 0