Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
b0bd93de
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b0bd93de
编写于
12月 31, 2020
作者:
L
lilong12
提交者:
GitHub
12月 31, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Disable gloo by default (#29805)
* update, test=develop
上级
b6fd2629
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
40 additions
and
43 deletions
+40
-43
python/paddle/distributed/fleet/base/role_maker.py
python/paddle/distributed/fleet/base/role_maker.py
+1
-8
python/paddle/distributed/fleet/launch.py
python/paddle/distributed/fleet/launch.py
+1
-1
python/paddle/distributed/fleet/launch_utils.py
python/paddle/distributed/fleet/launch_utils.py
+3
-3
python/paddle/distributed/parallel.py
python/paddle/distributed/parallel.py
+34
-31
python/paddle/fluid/tests/unittests/test_collective_api_base.py
.../paddle/fluid/tests/unittests/test_collective_api_base.py
+1
-0
未找到文件。
python/paddle/distributed/fleet/base/role_maker.py
浏览文件 @
b0bd93de
...
...
@@ -220,15 +220,8 @@ class Gloo(object):
rank
,
nodes
=
self
.
_get_rank_nodes
(
Role
.
WORKER
)
gloo
=
init
(
rank
,
nodes
,
"WORKER"
)
self
.
_worker_comm
=
gloo
else
:
rank
,
nodes
=
self
.
_get_rank_nodes
(
Role
.
SERVER
)
gloo
=
init
(
rank
,
nodes
,
"SERVER"
)
self
.
_server_comm
=
gloo
# TODO (sandyhouse): initialize gloo for server and all
if
self
.
_need_init_all
:
rank
,
nodes
=
self
.
_get_rank_nodes
(
Role
.
ALL
)
gloo
=
init
(
rank
,
nodes
,
"ALL"
)
self
.
_nodes_comm
=
gloo
if
start_http_server
:
http_server_d
[
"running"
]
=
False
http_server
.
join
()
...
...
python/paddle/distributed/fleet/launch.py
浏览文件 @
b0bd93de
...
...
@@ -219,7 +219,7 @@ def launch_collective(args):
global_envs
=
copy
.
copy
(
os
.
environ
.
copy
())
gloo_rendezvous_dir
=
tempfile
.
mkdtemp
()
# add gloo env
global_envs
[
"PADDLE_WITH_GLOO"
]
=
str
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"
1
"
))
global_envs
[
"PADDLE_WITH_GLOO"
]
=
str
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"
0
"
))
global_envs
[
"PADDLE_GLOO_RENDEZVOUS"
]
=
"3"
global_envs
[
"PADDLE_GLOO_FS_PATH"
]
=
gloo_rendezvous_dir
...
...
python/paddle/distributed/fleet/launch_utils.py
浏览文件 @
b0bd93de
...
...
@@ -954,7 +954,7 @@ class ParameterServerLauncher(object):
"TRAINING_ROLE"
:
"PSERVER"
,
"PADDLE_TRAINERS_NUM"
:
str
(
self
.
worker_num
),
"POD_IP"
:
cur_server
.
endpoint
.
split
(
":"
)[
0
],
"PADDLE_WITH_GLOO"
:
str
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"
1
"
)),
"PADDLE_WITH_GLOO"
:
str
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"
0
"
)),
"PADDLE_GLOO_RENDEZVOUS"
:
"3"
,
"PADDLE_GLOO_FS_PATH"
:
self
.
gloo_rendezvous_dir
,
"PADDLE_GLOO_HTTP_ENDPOINT"
:
self
.
http_port
...
...
@@ -1018,7 +1018,7 @@ class ParameterServerLauncher(object):
self
.
heter_worker_endpoints
,
"TRAINING_ROLE"
:
"TRAINER"
,
"PADDLE_TRAINER_ID"
:
str
(
cur_worker
.
rank
),
"PADDLE_WITH_GLOO"
:
str
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"
1
"
)),
"PADDLE_WITH_GLOO"
:
str
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"
0
"
)),
"PADDLE_GLOO_RENDEZVOUS"
:
"3"
,
"PADDLE_GLOO_FS_PATH"
:
self
.
gloo_rendezvous_dir
,
"FLAGS_selected_gpus"
:
"0"
,
...
...
@@ -1088,7 +1088,7 @@ class ParameterServerLauncher(object):
"TRAINING_ROLE"
:
"HETER_TRAINER"
,
"PADDLE_TRAINERS_NUM"
:
str
(
self
.
worker_num
),
"POD_IP"
:
cur_heter_worker
.
endpoint
.
split
(
":"
)[
0
],
"PADDLE_WITH_GLOO"
:
str
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"
1
"
)),
"PADDLE_WITH_GLOO"
:
str
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"
0
"
)),
"PADDLE_GLOO_RENDEZVOUS"
:
"3"
,
"PADDLE_GLOO_FS_PATH"
:
self
.
gloo_rendezvous_dir
,
"FLAGS_selected_gpus"
:
"0"
,
...
...
python/paddle/distributed/parallel.py
浏览文件 @
b0bd93de
...
...
@@ -142,21 +142,23 @@ def init_parallel_env():
_check_var_exists
(
"PADDLE_TRAINER_ENDPOINTS"
)
# 3: init gloo context (step 1: http server start)
ep_rank_0
=
parallel_env
.
trainer_endpoints
[
0
].
split
(
":"
)
ep_rank
=
parallel_env
.
trainer_endpoints
[
parallel_env
.
rank
].
split
(
":"
)
manager
=
Manager
()
# global dict to store status
http_server_d
=
manager
.
dict
()
http_server_d
[
"running"
]
=
False
if
parallel_env
.
rank
==
0
:
# The scope for worker used by http server is '_worker'
size
=
{
'_worker'
:
parallel_env
.
world_size
}
http_server
=
Process
(
target
=
_start_kv_server
,
args
=
(
int
(
ep_rank_0
[
1
]),
http_server_d
,
size
))
http_server
.
daemon
=
True
http_server_d
[
"running"
]
=
True
http_server
.
start
()
init_gloo
=
int
(
os
.
getenv
(
"PADDLE_WITH_GLOO"
,
"0"
))
if
init_gloo
:
ep_rank_0
=
parallel_env
.
trainer_endpoints
[
0
].
split
(
":"
)
ep_rank
=
parallel_env
.
trainer_endpoints
[
parallel_env
.
rank
].
split
(
":"
)
manager
=
Manager
()
# global dict to store status
http_server_d
=
manager
.
dict
()
http_server_d
[
"running"
]
=
False
if
parallel_env
.
rank
==
0
:
# The scope for worker used by http server is '_worker'
size
=
{
'_worker'
:
parallel_env
.
world_size
}
http_server
=
Process
(
target
=
_start_kv_server
,
args
=
(
int
(
ep_rank_0
[
1
]),
http_server_d
,
size
))
http_server
.
daemon
=
True
http_server_d
[
"running"
]
=
True
http_server
.
start
()
# 4. init NCCL ParallelStrategy
strategy
=
ParallelStrategy
()
...
...
@@ -185,22 +187,23 @@ def init_parallel_env():
# dividing init_gloo into two parts because nccl and gloo
# are separately looking for free ports which sometimes
# leads to port-conflict.
wait_server_ready
([
parallel_env
.
trainer_endpoints
[
0
]])
gloo_strategy
=
core
.
GlooParallelStrategy
()
gloo_strategy
.
rank
=
parallel_env
.
rank
gloo_strategy
.
rank_num
=
parallel_env
.
world_size
gloo_strategy
.
ip_address
=
ep_rank_0
[
0
]
gloo_strategy
.
ip_port
=
int
(
ep_rank_0
[
1
])
default_init_timeout_seconds
=
3600
default_run_timeout_seconds
=
9999999
gloo_strategy
.
init_seconds
=
default_init_timeout_seconds
gloo_strategy
.
run_seconds
=
default_run_timeout_seconds
gloo
=
core
.
GlooParallelContext
(
gloo_strategy
)
gloo
.
init
()
if
parallel_env
.
rank
==
0
:
http_server_d
[
"running"
]
=
False
http_server
.
join
()
if
init_gloo
:
wait_server_ready
([
parallel_env
.
trainer_endpoints
[
0
]])
gloo_strategy
=
core
.
GlooParallelStrategy
()
gloo_strategy
.
rank
=
parallel_env
.
rank
gloo_strategy
.
rank_num
=
parallel_env
.
world_size
gloo_strategy
.
ip_address
=
ep_rank_0
[
0
]
gloo_strategy
.
ip_port
=
int
(
ep_rank_0
[
1
])
default_init_timeout_seconds
=
3600
default_run_timeout_seconds
=
9999999
gloo_strategy
.
init_seconds
=
default_init_timeout_seconds
gloo_strategy
.
run_seconds
=
default_run_timeout_seconds
gloo
=
core
.
GlooParallelContext
(
gloo_strategy
)
gloo
.
init
()
if
parallel_env
.
rank
==
0
:
http_server_d
[
"running"
]
=
False
http_server
.
join
()
def
get_rank
():
...
...
python/paddle/fluid/tests/unittests/test_collective_api_base.py
浏览文件 @
b0bd93de
...
...
@@ -178,6 +178,7 @@ class TestDistBase(unittest.TestCase):
"LD_PRELOAD"
:
os
.
getenv
(
"LD_PRELOAD"
,
""
),
"GLOG_v"
:
"0"
,
"NCCL_P2P_DISABLE"
:
"1"
,
"PADDLE_WITH_GLOO"
:
"1"
,
"BACKEND"
:
backend
,
"PATH_ID"
:
path_id
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录