Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
0b429a22
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
0b429a22
编写于
11月 01, 2019
作者:
C
Chengmo
提交者:
GitHub
11月 01, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Cherry-pick]Cherry pick paddle cloud role maker (#20947)
* Fix Paddle Cloud role maker (#20860)
上级
ad867398
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
47 addition
and
21 deletion
+47
-21
paddle/fluid/operators/distributed/rpc_client.cc
paddle/fluid/operators/distributed/rpc_client.cc
+1
-1
python/paddle/fluid/incubate/fleet/base/role_maker.py
python/paddle/fluid/incubate/fleet/base/role_maker.py
+7
-12
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
.../fleet/parameter_server/distribute_transpiler/__init__.py
+4
-0
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
+32
-6
python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
+3
-2
未找到文件。
paddle/fluid/operators/distributed/rpc_client.cc
浏览文件 @
0b429a22
...
...
@@ -17,7 +17,7 @@
// default to 3min to avoid temprary network failures.
DEFINE_int32
(
rpc_deadline
,
180000
,
"deadline timeouts for rpc"
);
DEFINE_int32
(
rpc_retry_times
,
0
,
"retry times for rpc"
);
DEFINE_int32
(
rpc_retry_times
,
3
,
"retry times for rpc"
);
namespace
paddle
{
namespace
operators
{
...
...
python/paddle/fluid/incubate/fleet/base/role_maker.py
浏览文件 @
0b429a22
...
...
@@ -335,18 +335,13 @@ class PaddleCloudRoleMaker(RoleMakerBase):
if
not
self
.
_role_is_generated
:
if
not
self
.
_is_collective
:
try
:
port
=
os
.
environ
[
"PADDLE_PORT"
]
pserver_ips
=
os
.
environ
[
"PADDLE_PSERVERS"
].
split
(
","
)
if
","
in
port
:
ports
=
port
.
split
(
","
)
else
:
ports
=
[
port
]
*
len
(
pserver_ips
)
eplist
=
[]
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port), eg. 127.0.0.1:6001
eplist
=
os
.
environ
[
"PADDLE_PSERVERS_IP_PORT_LIST"
].
split
(
","
)
# note that, we usually assign the same port to different ips
# if we run parameter server training in local mode
# port should be different in environment variables
for
i
,
ip
in
enumerate
(
pserver_ips
):
eplist
.
append
(
':'
.
join
([
ip
,
ports
[
i
]]))
trainers_num
=
int
(
os
.
environ
[
"PADDLE_TRAINERS_NUM"
])
training_role
=
os
.
environ
[
"TRAINING_ROLE"
]
...
...
@@ -361,9 +356,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
elif
training_role
==
"PSERVER"
:
role
=
Role
.
SERVER
cur_ip
=
os
.
environ
[
"POD_IP"
]
cur
_idx
=
pserver_ips
.
index
(
cur_ip
)
curr
ent_id
=
eplist
.
index
(
":"
.
join
(
[
cur_ip
,
ports
[
cur_idx
]])
)
cur
r_port
=
os
.
environ
[
"PADDLE_PORT"
]
curr
_endpoint
=
":"
.
join
([
cur_ip
,
curr_port
])
current_id
=
eplist
.
index
(
curr_endpoint
)
else
:
raise
ValueError
(
"TRAINING_ROLE must be PSERVER or TRAINER"
)
...
...
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
浏览文件 @
0b429a22
...
...
@@ -152,6 +152,10 @@ class DistributedTranspiler(Fleet):
if
not
isinstance
(
optimizer
,
Optimizer
):
raise
ValueError
(
"optimizer must be an instance of Optimizer"
)
if
not
fleet
.
_is_initialized
:
raise
ValueError
(
"use fleet.init(role) to initialize the role before use fleet.distributed_optimizer()"
)
self
.
_optimizer
=
TranspilerOptimizer
(
optimizer
,
strategy
)
return
self
.
_optimizer
...
...
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
浏览文件 @
0b429a22
...
...
@@ -20,9 +20,11 @@ from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerCo
from
paddle.fluid.incubate.fleet.base.role_maker
import
UserDefinedRoleMaker
from
paddle.fluid.incubate.fleet.base.role_maker
import
UserDefinedCollectiveRoleMaker
from
paddle.fluid.incubate.fleet.base.role_maker
import
Role
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
fleet
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
TranspilerOptimizer
from
paddle.fluid.incubate.fleet.collective
import
CollectiveOptimizer
from
dist_simnet_bow
import
train_network
class
DistributeTranspilerConfigTest
(
unittest
.
TestCase
):
...
...
@@ -97,6 +99,30 @@ class FleetTest(unittest.TestCase):
main_program
=
compiled_prog
)
self
.
assertRaises
(
Exception
,
fleet
.
_transpile
,
"config"
)
def
set_program
(
self
,
avg_cost
,
strategy
):
optimizer
=
fluid
.
optimizer
.
SGD
(
0.1
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
)
optimizer
.
minimize
(
avg_cost
)
def
test_init_role
(
self
):
role
=
role_maker
.
UserDefinedRoleMaker
(
current_id
=
0
,
role
=
role_maker
.
Role
.
SERVER
,
worker_num
=
2
,
server_endpoints
=
[
"127.0.0.1:36011"
,
"127.0.0.1:36012"
])
# for test optimizer without init(role)
# fleet.init(role)
batch_size
=
128
is_sparse
=
True
is_distribute
=
False
strategy
=
DistributeTranspilerConfig
()
strategy
.
sync_mode
=
False
strategy
.
geo_sgd_mode
=
True
strategy
.
geo_sgd_need_push_nums
=
5
avg_cost
,
_
,
_
=
train_network
(
batch_size
,
is_distribute
,
is_sparse
)
self
.
assertRaises
(
Exception
,
self
.
set_program
,
avg_cost
,
strategy
)
class
TranspilerOptimizerTest
(
unittest
.
TestCase
):
def
testInvalidInputs
(
self
):
...
...
@@ -124,7 +150,7 @@ class UserDefinedRoleMakerTest(unittest.TestCase):
def
testRoleMaker
(
self
):
self
.
createRoleMaker
()
#
#
test all invalid server_endpoints
# test all invalid server_endpoints
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
server_endpoints
=
None
)
# server_endpoints must be as list
...
...
@@ -140,7 +166,7 @@ class UserDefinedRoleMakerTest(unittest.TestCase):
self
.
createRoleMaker
,
server_endpoints
=
[
"127.0.0.1:8080"
,
"127.0.0.1:8080"
]
)
# element in server_endpoints can't be duplicate
#
# test all invalid current_id
#
test all invalid current_id
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=
"0"
)
# current_id must be as int
...
...
@@ -154,14 +180,14 @@ class UserDefinedRoleMakerTest(unittest.TestCase):
role
=
Role
.
SERVER
,
server_endpoints
=
[
"127.0.0.1:8080"
]
)
# if role is server, current_id must be less than len(server_endpoints)
#
#
test all invalid worker_num
# test all invalid worker_num
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_num
=
"1"
)
# worker_num must be as int
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_num
=
0
)
# worker_num must be greater than 0
#
#
test all invalid role
# test all invalid role
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
role
=
3
)
# role must be as Role(Role.WORKER=1, Role.SERVER=2)
...
...
@@ -174,7 +200,7 @@ class UserDefinedCollectiveRoleMakerTest(unittest.TestCase):
def
testRoleMaker
(
self
):
self
.
createRoleMaker
()
#
#
test all invalid worker_endpoints
# test all invalid worker_endpoints
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_endpoints
=
None
)
# worker_endpoints must be as list
...
...
@@ -190,7 +216,7 @@ class UserDefinedCollectiveRoleMakerTest(unittest.TestCase):
self
.
createRoleMaker
,
worker_endpoints
=
[
"127.0.0.1:8080"
,
"127.0.0.1:8080"
]
)
# element in worker_endpoints can't be duplicate
#
#
test all invalid current_id
# test all invalid current_id
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=
"0"
)
# current_id must be as int
...
...
python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
浏览文件 @
0b429a22
...
...
@@ -21,9 +21,9 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker
class
TestCloudRoleMaker
(
unittest
.
TestCase
):
def
setUp
(
self
):
os
.
environ
[
"PADDLE_PORT"
]
=
"36001"
os
.
environ
[
"PADDLE_PSERVERS"
]
=
"127.0.0.1,127.0.0.2"
os
.
environ
[
"PADDLE_TRAINERS_NUM"
]
=
"2"
os
.
environ
[
"PADDLE_PSERVERS_IP_PORT_LIST"
]
=
"127.0.0.1:36001,127.0.0.2:36001"
def
test_tr_rolemaker
(
self
):
os
.
environ
[
"TRAINING_ROLE"
]
=
"TRAINER"
...
...
@@ -39,6 +39,7 @@ class TestCloudRoleMaker(unittest.TestCase):
def
test_ps_rolemaker
(
self
):
os
.
environ
[
"TRAINING_ROLE"
]
=
"PSERVER"
os
.
environ
[
"POD_IP"
]
=
"127.0.0.1"
os
.
environ
[
"PADDLE_PORT"
]
=
"36001"
ro
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
False
)
ro
.
generate_role
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录