Commit 7dc7fc4b (unverified)
BaiXuePrincess / Paddle, forked from PaddlePaddle / Paddle

[Auto Parallel] Add comm init control by socket (#44148)

* add comm init control by socket
* avoid single card instance failure

Author:    caozhou
Committer: GitHub
Date:      Jul 13, 2022
Parent:    42468de1
Showing 2 changed files with 78 additions and 11 deletions (+78 -11):

    python/paddle/distributed/auto_parallel/engine.py          +61  -3
    python/paddle/distributed/auto_parallel/process_group.py   +17  -8
python/paddle/distributed/auto_parallel/engine.py

@@ -15,6 +15,7 @@
 import copy
 import logging
 from collections import defaultdict
+import socket
 import paddle
 import paddle.utils as utils
@@ -36,7 +37,8 @@ from paddle.distributed import fleet
 from paddle.distributed.utils import get_logger
 from paddle.distributed.passes import new_pass, PassContext
-# from .cluster import Cluster, get_default_cluster
+from ..collective import _get_global_env
+from .cluster import Cluster, get_default_cluster
 from .planner_v2 import Planner
 from .parallelizer_v2 import Parallelizer
 from .dist_op import DistributedOperator
@@ -60,8 +62,8 @@ class Engine:
         self.inputs_spec = self._validate_spec(inputs_spec)
         self.labels_spec = self._validate_spec(labels_spec)
         self.cluster = cluster
-        # if self.cluster is None:
-        #     self.cluster = get_default_cluster()
+        if self.cluster is None:
+            self.cluster = get_default_cluster()
         self.strategy = strategy
         if self.strategy is None:
             self.strategy = fleet.DistributedStrategy()
@@ -314,10 +316,66 @@ class Engine:
         # Traverse different rank programs and traverse each op of them,
         # instantiate communication by process_mapping.
         all_process_groups = get_all_process_groups()
+        has_recv_by_socket = []
+        # This is a magic number and the rank number for training is usually less than 5000
+        magic_num = 5000
+        genv = _get_global_env()
+        cur_rank_ip, cur_rank_port = genv.current_endpoint.split(":")
+        cur_rank_recv_port = int(cur_rank_port) + magic_num
+        server_socket = None
+        # Large enough for recv rank
+        buff_size = 1024
+        server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        server_socket.bind((cur_rank_ip, cur_rank_recv_port))
+        # The 10 is an empirical value
+        server_socket.listen(10)
+        client_sockets = {}
         for process_group in all_process_groups:
             if self._cur_rank not in process_group.ranks:
                 continue
+            if len(process_group.ranks) == 2:
+                index = process_group.ranks.index(self._cur_rank)
+                is_send = True if index == 0 else False
+                if is_send:
+                    recv_rank = process_group.ranks[1]
+                    recv_rank_ip, recv_rank_port = genv.trainer_endpoints[
+                        recv_rank].split(":")
+                    connect_port = int(recv_rank_port) + magic_num
+                    client_socket = socket.socket(socket.AF_INET,
+                                                  socket.SOCK_STREAM)
+                    client_socket.connect((recv_rank_ip, connect_port))
+                    client_socket.send(str(self._cur_rank).encode('utf-8'))
+                    rank = client_socket.recv(buff_size).decode('utf-8')
+                    rank = int(rank)
+                    if rank != recv_rank:
+                        raise ValueError(
+                            "Please check comm pair, the recv rank should be {} but got {}."
+                            .format(recv_rank, rank))
+                    else:
+                        print("It is able to instantiate {} as sender now."
+                              .format(process_group.ranks))
+                    client_socket.close()
+                else:
+                    send_rank = process_group.ranks[0]
+                    while True:
+                        if send_rank not in has_recv_by_socket:
+                            client_socket, recv_addr = server_socket.accept()
+                            rank = int(client_socket.recv(buff_size).decode())
+                            client_sockets[rank] = client_socket
+                            has_recv_by_socket.append(rank)
+                        else:
+                            client_sockets[send_rank].send(
+                                str(self._cur_rank).encode("utf-8"))
+                            client_sockets[send_rank].close()
+                            print("It is able to instantiate {} as recver now."
                                  .format(process_group.ranks))
+                            break
             process_group.instantiate()
+        server_socket.close()

         self._place = _get_device()
         if isinstance(self._place, fluid.CUDAPlace):
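The new block above boils down to a pairwise TCP rendezvous: every rank listens on its trainer-endpoint port plus a fixed offset (5000 in the patch), the rank at index 0 of each two-rank group connects to its peer and sends its rank id, and the peer answers with its own rank id before either side calls process_group.instantiate(). The standalone sketch below shows the same handshake in isolation; the helper names (serve_handshake, send_handshake) and the single-machine threading demo are illustrative assumptions, not the engine code itself.

import socket
import threading

MAGIC_OFFSET = 5000   # same fixed port offset the patch adds to the endpoint port
BUFF_SIZE = 1024      # "large enough for recv rank", as in the patch


def serve_handshake(server, expected_sender, my_rank):
    """Receiver side: accept one connection, read the peer rank, echo ours back."""
    conn, _ = server.accept()
    peer_rank = int(conn.recv(BUFF_SIZE).decode("utf-8"))
    assert peer_rank == expected_sender, (peer_rank, expected_sender)
    conn.send(str(my_rank).encode("utf-8"))
    conn.close()


def send_handshake(peer_ip, peer_port, expected_receiver, my_rank):
    """Sender side: connect, announce our rank, verify the peer's reply."""
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    client.connect((peer_ip, peer_port + MAGIC_OFFSET))
    client.send(str(my_rank).encode("utf-8"))
    peer_rank = int(client.recv(BUFF_SIZE).decode("utf-8"))
    if peer_rank != expected_receiver:
        raise ValueError("recv rank should be {} but got {}".format(
            expected_receiver, peer_rank))
    client.close()


if __name__ == "__main__":
    # Single-machine demo: rank 1 plays the receiver, rank 0 the sender.
    endpoint_port = 6170                       # stand-in for a trainer endpoint port
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.bind(("127.0.0.1", endpoint_port + MAGIC_OFFSET))
    server.listen(10)                          # 10 is the backlog value the patch uses
    t = threading.Thread(target=serve_handshake, args=(server, 0, 1))
    t.start()
    send_handshake("127.0.0.1", endpoint_port, 1, 0)
    t.join()
    server.close()
    print("handshake done; process_group.instantiate() would run next")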
python/paddle/distributed/auto_parallel/process_group.py

@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License

+from collections import OrderedDict
+
 import paddle
 import paddle.fluid.core as core
 from ..collective import _get_global_env
@@ -130,16 +132,23 @@ class ProcessGroup:
         else:
             assert False, ("No CUDA device found")

-        # TODO(shenliang03): This is a temporary solution to solve the problem of
-        # hang caused by cross-creation of new_group
-        tmp = paddle.to_tensor(
-            [1], dtype="int32") if _non_static_mode() else fill_constant(
-                [0], dtype="int32", value="1")
-        paddle.distributed.all_reduce(tmp, use_calc_stream=True)
-        paddle.distributed.wait(tmp)
+        # TODO(shenliang03): This is a temporary solution to solve the problem of
+        # hang caused by cross-creation of new_group
+        paddle.framework._in_legacy_dygraph()
+        paddle.set_device('gpu:%d' %
+                          paddle.distributed.ParallelEnv().dev_id)
+        tmp = paddle.to_tensor(
+            [1], dtype="int32") if _non_static_mode() else fill_constant(
+                [0], dtype="int32", value="1")
+        paddle.distributed.all_reduce(tmp, use_calc_stream=True, group=self)
+        paddle.distributed.wait(tmp, group=self)
+        paddle.enable_static()

         self._is_instantiate = True

     def is_member(self):
         return True

     # def __eq__(self, other):
     #     if not isinstance(other, ProcessGroup):
     #         return False
@@ -158,5 +167,5 @@
 # Note that Process group 0 is reserved for representing all ranks.
 # At the beginning, group 0 is empty and new ranks will be added automatically.
-_g_process_group_map = {}
+_g_process_group_map = OrderedDict()
 _g_process_group_map[0] = ProcessGroup(0, [])
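The last hunk swaps the module-level registry from a plain dict to an OrderedDict, keeping group 0 reserved for "all ranks" as the surrounding comments say. The toy sketch below is only an illustration of that pattern; ProcessGroupSketch and register_group are hypothetical stand-ins, not the real Paddle classes or API.

from collections import OrderedDict


class ProcessGroupSketch:
    """Toy stand-in for ProcessGroup, carrying just an id and a rank list."""

    def __init__(self, group_id, ranks):
        self.id = group_id
        self.ranks = list(ranks)


# Insertion-ordered registry, mirroring `_g_process_group_map = OrderedDict()`
# in the patch; group 0 is reserved for "all ranks" and starts out empty.
_group_map = OrderedDict()
_group_map[0] = ProcessGroupSketch(0, [])


def register_group(ranks):
    """Hypothetical helper: create a group with the next free id and record it."""
    group_id = max(_group_map) + 1
    _group_map[group_id] = ProcessGroupSketch(group_id, ranks)
    return _group_map[group_id]


if __name__ == "__main__":
    register_group([0, 1])
    register_group([2, 3])
    # As long as every rank registers groups in the same sequence, iterating the
    # map visits them in the same order on every rank, which the switch from a
    # plain dict to OrderedDict makes explicit.
    for gid, group in _group_map.items():
        print(gid, group.ranks)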