Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
ebf486ac
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
ebf486ac
编写于
5月 24, 2022
作者:
K
kuizhiqing
提交者:
GitHub
5月 24, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[launch] fix timeout reset (#42941)
上级
a5ad2659
变更
8
显示空白变更内容
内联
并排
Showing
8 changed files
with
32 additions
and
7 deletions
+32
-7
python/paddle/distributed/launch/context/__init__.py
python/paddle/distributed/launch/context/__init__.py
+7
-0
python/paddle/distributed/launch/context/args_envs.py
python/paddle/distributed/launch/context/args_envs.py
+2
-2
python/paddle/distributed/launch/controllers/__init__.py
python/paddle/distributed/launch/controllers/__init__.py
+1
-0
python/paddle/distributed/launch/controllers/collective.py
python/paddle/distributed/launch/controllers/collective.py
+5
-1
python/paddle/distributed/launch/controllers/master.py
python/paddle/distributed/launch/controllers/master.py
+11
-1
python/paddle/distributed/launch/controllers/ps.py
python/paddle/distributed/launch/controllers/ps.py
+2
-0
python/paddle/distributed/launch/plugins/__init__.py
python/paddle/distributed/launch/plugins/__init__.py
+2
-1
python/paddle/fluid/tests/unittests/test_run.py
python/paddle/fluid/tests/unittests/test_run.py
+2
-2
未找到文件。
python/paddle/distributed/launch/context/__init__.py
浏览文件 @
ebf486ac
...
...
@@ -17,6 +17,7 @@ from paddle.distributed.launch import plugins
from
.node
import
Node
from
.status
import
Status
from
.args_envs
import
parse_args
,
fetch_envs
,
env_args_mapping
import
six
import
logging
...
...
@@ -39,6 +40,12 @@ class Context(object):
if
enable_plugin
:
self
.
_enable_plugin
()
def
print
(
self
):
self
.
logger
.
info
(
"----------- Configuration ----------------------"
)
for
arg
,
value
in
sorted
(
six
.
iteritems
(
vars
(
self
.
args
))):
self
.
logger
.
info
(
"%s: %s"
%
(
arg
,
value
))
self
.
logger
.
info
(
"--------------------------------------------------"
)
def
is_legacy_mode
(
self
):
if
self
.
args
.
legacy
:
return
True
...
...
python/paddle/distributed/launch/context/args_envs.py
浏览文件 @
ebf486ac
...
...
@@ -85,7 +85,7 @@ def parse_args():
base_group
.
add_argument
(
"--run_mode"
,
type
=
str
,
default
=
"collective"
,
default
=
None
,
help
=
"run mode of the job, collective/ps/ps-heter"
)
base_group
.
add_argument
(
...
...
@@ -125,7 +125,7 @@ def parse_args():
ps_group
.
add_argument
(
"--gloo_port"
,
type
=
int
,
default
=
6767
,
help
=
"gloo http port"
)
ps_group
.
add_argument
(
"--with_gloo"
,
type
=
str
,
default
=
"
0
"
,
help
=
"use gloo or not"
)
"--with_gloo"
,
type
=
str
,
default
=
"
1
"
,
help
=
"use gloo or not"
)
# parameter elastic mode
elastic_group
=
parser
.
add_argument_group
(
"Elastic Parameters"
)
...
...
python/paddle/distributed/launch/controllers/__init__.py
浏览文件 @
ebf486ac
...
...
@@ -29,4 +29,5 @@ _controllers = [
def
init
(
ctx
):
for
c
in
_controllers
:
if
c
.
enable
(
ctx
):
ctx
.
print
()
return
c
(
ctx
)
python/paddle/distributed/launch/controllers/collective.py
浏览文件 @
ebf486ac
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from
.controller
import
Controller
from
.controller
import
Controller
,
ControleMode
import
json
import
os
...
...
@@ -23,8 +23,10 @@ import time
class
CollectiveController
(
Controller
):
@
classmethod
def
enable
(
cls
,
ctx
):
# collective is the default mode
if
ctx
:
ctx
.
logger
.
debug
(
"{} enabled"
.
format
(
cls
.
__name__
))
ctx
.
args
.
run_mode
=
ControleMode
.
COLLECTIVE
return
True
else
:
return
False
...
...
@@ -85,6 +87,7 @@ class CollectiveController(Controller):
"PADDLE_LOCAL_SIZE"
:
"{}"
.
format
(
self
.
pod
.
replicas
),
"PADDLE_GLOBAL_RANK"
:
"{}"
.
format
(
i
+
rank_offset
),
"PADDLE_LOCAL_RANK"
:
"{}"
.
format
(
i
),
"PADDLE_NNODES"
:
"{}"
.
format
(
self
.
job
.
replicas
),
## compatible env
"PADDLE_TRAINER_ENDPOINTS"
:
","
.
join
(
job_endpoints
),
"PADDLE_CURRENT_ENDPOINT"
:
endpoints
[
i
],
...
...
@@ -106,6 +109,7 @@ class CollectiveElasticController(CollectiveController):
def
enable
(
cls
,
ctx
):
if
ctx
.
args
.
master
and
ctx
.
args
.
master
.
startswith
(
"etcd://"
):
ctx
.
logger
.
debug
(
"{} enabled"
.
format
(
cls
.
__name__
))
ctx
.
args
.
run_mode
=
ControleMode
.
COLLECTIVE
return
True
else
:
return
False
...
...
python/paddle/distributed/launch/controllers/master.py
浏览文件 @
ebf486ac
...
...
@@ -276,10 +276,20 @@ class ETCDMaster(Master):
return
peer_alive
def
wait_peer_ready
(
self
,
replicas_min
,
replicas_max
,
timeout
):
timeout
=
timeout
if
timeout
>
1
else
3
end
=
time
.
time
()
+
timeout
np_pre
=
len
(
self
.
fetch_peer_alive
())
while
not
self
.
ctx
.
status
.
is_done
()
and
time
.
time
()
<
end
:
if
len
(
self
.
fetch_peer_alive
())
==
replicas_max
:
np
=
len
(
self
.
fetch_peer_alive
())
if
np
==
replicas_max
:
# maximum replicas reached, return immediately
return
(
True
,
replicas_max
)
elif
np
!=
np_pre
:
# replicas are changing, reset timeout
end
=
time
.
time
()
+
timeout
np_pre
=
np
time
.
sleep
(
0.2
)
else
:
time
.
sleep
(
0.5
)
...
...
python/paddle/distributed/launch/controllers/ps.py
浏览文件 @
ebf486ac
...
...
@@ -171,6 +171,7 @@ class PSController(Controller):
for
i
in
range
(
server_num
):
e
=
{
"PADDLE_NNODES"
:
"{}"
.
format
(
self
.
job
.
replicas
),
"PADDLE_PSERVERS_IP_PORT_LIST"
:
","
.
join
(
server_endpoints
),
"PADDLE_TRAINER_ENDPOINTS"
:
","
.
join
(
trainer_endpoints
),
"PADDLE_PORT"
:
...
...
@@ -186,6 +187,7 @@ class PSController(Controller):
for
i
in
range
(
trainer_num
):
e
=
{
"PADDLE_NNODES"
:
"{}"
.
format
(
self
.
job
.
replicas
),
"PADDLE_PSERVERS_IP_PORT_LIST"
:
","
.
join
(
server_endpoints
),
"PADDLE_TRAINER_ENDPOINTS"
:
","
.
join
(
trainer_endpoints
),
"PADDLE_PORT"
:
...
...
python/paddle/distributed/launch/plugins/__init__.py
浏览文件 @
ebf486ac
...
...
@@ -17,6 +17,7 @@ import six
__all__
=
[]
# print configuration after args are well filled in controller init
def
log
(
ctx
):
ctx
.
logger
.
info
(
"----------- Configuration ----------------------"
)
for
arg
,
value
in
sorted
(
six
.
iteritems
(
vars
(
ctx
.
args
))):
...
...
@@ -59,4 +60,4 @@ def rewrite_host_ip(ctx):
ctx
.
node
.
ip
=
ctx
.
args
.
host
enabled_plugins
=
[
collective_compatible
,
rewrite_host_ip
,
process_args
,
log
]
enabled_plugins
=
[
collective_compatible
,
rewrite_host_ip
,
process_args
]
python/paddle/fluid/tests/unittests/test_run.py
浏览文件 @
ebf486ac
...
...
@@ -95,7 +95,7 @@ class Collective_Test(unittest.TestCase):
shutil
.
rmtree
(
'./log'
)
port
=
random
.
randrange
(
6000
,
8000
)
args
=
"--job_id test3 --devices 0,1 --master 127.0.0.1:{} --n
p
2"
.
format
(
args
=
"--job_id test3 --devices 0,1 --master 127.0.0.1:{} --n
nodes
2"
.
format
(
port
)
p1
=
self
.
pdrun
(
args
)
p2
=
self
.
pdrun
(
args
)
...
...
@@ -143,7 +143,7 @@ class PS_Test(unittest.TestCase):
shutil
.
rmtree
(
'./log'
)
port
=
random
.
randrange
(
6000
,
8000
)
args
=
"--job_id ps3 --master 127.0.0.1:{} --n
p
2 --server_num=1 --trainer_num=1"
.
format
(
args
=
"--job_id ps3 --master 127.0.0.1:{} --n
nodes
2 --server_num=1 --trainer_num=1"
.
format
(
port
)
p1
=
self
.
pdrun
(
args
)
p2
=
self
.
pdrun
(
args
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录