Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
0b429a22
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
0b429a22
编写于
11月 01, 2019
作者:
C
Chengmo
提交者:
GitHub
11月 01, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Cherry-pick]Cherry pick paddle cloud role maker (#20947)
* Fix Paddle Cloud role maker (#20860)
上级
ad867398
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
47 addition
and
21 deletion
+47
-21
paddle/fluid/operators/distributed/rpc_client.cc
paddle/fluid/operators/distributed/rpc_client.cc
+1
-1
python/paddle/fluid/incubate/fleet/base/role_maker.py
python/paddle/fluid/incubate/fleet/base/role_maker.py
+7
-12
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
.../fleet/parameter_server/distribute_transpiler/__init__.py
+4
-0
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
+32
-6
python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
+3
-2
未找到文件。
paddle/fluid/operators/distributed/rpc_client.cc
浏览文件 @
0b429a22
...
...
@@ -17,7 +17,7 @@
// default to 3min to avoid temprary network failures.
DEFINE_int32
(
rpc_deadline
,
180000
,
"deadline timeouts for rpc"
);
DEFINE_int32
(
rpc_retry_times
,
0
,
"retry times for rpc"
);
DEFINE_int32
(
rpc_retry_times
,
3
,
"retry times for rpc"
);
namespace
paddle
{
namespace
operators
{
...
...
python/paddle/fluid/incubate/fleet/base/role_maker.py
浏览文件 @
0b429a22
...
...
@@ -335,18 +335,13 @@ class PaddleCloudRoleMaker(RoleMakerBase):
if
not
self
.
_role_is_generated
:
if
not
self
.
_is_collective
:
try
:
port
=
os
.
environ
[
"PADDLE_PORT"
]
pserver_ips
=
os
.
environ
[
"PADDLE_PSERVERS"
].
split
(
","
)
if
","
in
port
:
ports
=
port
.
split
(
","
)
else
:
ports
=
[
port
]
*
len
(
pserver_ips
)
eplist
=
[]
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port), eg. 127.0.0.1:6001
eplist
=
os
.
environ
[
"PADDLE_PSERVERS_IP_PORT_LIST"
].
split
(
","
)
# note that, we usually assign the same port to different ips
# if we run parameter server training in local mode
# port should be different in environment variables
for
i
,
ip
in
enumerate
(
pserver_ips
):
eplist
.
append
(
':'
.
join
([
ip
,
ports
[
i
]]))
trainers_num
=
int
(
os
.
environ
[
"PADDLE_TRAINERS_NUM"
])
training_role
=
os
.
environ
[
"TRAINING_ROLE"
]
...
...
@@ -361,9 +356,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
elif
training_role
==
"PSERVER"
:
role
=
Role
.
SERVER
cur_ip
=
os
.
environ
[
"POD_IP"
]
cur
_idx
=
pserver_ips
.
index
(
cur_ip
)
curr
ent_id
=
eplist
.
index
(
":"
.
join
(
[
cur_ip
,
ports
[
cur_idx
]])
)
cur
r_port
=
os
.
environ
[
"PADDLE_PORT"
]
curr
_endpoint
=
":"
.
join
([
cur_ip
,
curr_port
])
current_id
=
eplist
.
index
(
curr_endpoint
)
else
:
raise
ValueError
(
"TRAINING_ROLE must be PSERVER or TRAINER"
)
...
...
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
浏览文件 @
0b429a22
...
...
@@ -152,6 +152,10 @@ class DistributedTranspiler(Fleet):
if
not
isinstance
(
optimizer
,
Optimizer
):
raise
ValueError
(
"optimizer must be an instance of Optimizer"
)
if
not
fleet
.
_is_initialized
:
raise
ValueError
(
"use fleet.init(role) to initialize the role before use fleet.distributed_optimizer()"
)
self
.
_optimizer
=
TranspilerOptimizer
(
optimizer
,
strategy
)
return
self
.
_optimizer
...
...
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
浏览文件 @
0b429a22
...
...
@@ -20,9 +20,11 @@ from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerCo
from
paddle.fluid.incubate.fleet.base.role_maker
import
UserDefinedRoleMaker
from
paddle.fluid.incubate.fleet.base.role_maker
import
UserDefinedCollectiveRoleMaker
from
paddle.fluid.incubate.fleet.base.role_maker
import
Role
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
fleet
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
TranspilerOptimizer
from
paddle.fluid.incubate.fleet.collective
import
CollectiveOptimizer
from
dist_simnet_bow
import
train_network
class
DistributeTranspilerConfigTest
(
unittest
.
TestCase
):
...
...
@@ -97,6 +99,30 @@ class FleetTest(unittest.TestCase):
main_program
=
compiled_prog
)
self
.
assertRaises
(
Exception
,
fleet
.
_transpile
,
"config"
)
def
set_program
(
self
,
avg_cost
,
strategy
):
optimizer
=
fluid
.
optimizer
.
SGD
(
0.1
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
)
optimizer
.
minimize
(
avg_cost
)
def
test_init_role
(
self
):
role
=
role_maker
.
UserDefinedRoleMaker
(
current_id
=
0
,
role
=
role_maker
.
Role
.
SERVER
,
worker_num
=
2
,
server_endpoints
=
[
"127.0.0.1:36011"
,
"127.0.0.1:36012"
])
# for test optimizer without init(role)
# fleet.init(role)
batch_size
=
128
is_sparse
=
True
is_distribute
=
False
strategy
=
DistributeTranspilerConfig
()
strategy
.
sync_mode
=
False
strategy
.
geo_sgd_mode
=
True
strategy
.
geo_sgd_need_push_nums
=
5
avg_cost
,
_
,
_
=
train_network
(
batch_size
,
is_distribute
,
is_sparse
)
self
.
assertRaises
(
Exception
,
self
.
set_program
,
avg_cost
,
strategy
)
class
TranspilerOptimizerTest
(
unittest
.
TestCase
):
def
testInvalidInputs
(
self
):
...
...
@@ -124,7 +150,7 @@ class UserDefinedRoleMakerTest(unittest.TestCase):
def
testRoleMaker
(
self
):
self
.
createRoleMaker
()
#
#
test all invalid server_endpoints
# test all invalid server_endpoints
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
server_endpoints
=
None
)
# server_endpoints must be as list
...
...
@@ -140,7 +166,7 @@ class UserDefinedRoleMakerTest(unittest.TestCase):
self
.
createRoleMaker
,
server_endpoints
=
[
"127.0.0.1:8080"
,
"127.0.0.1:8080"
]
)
# element in server_endpoints can't be duplicate
#
# test all invalid current_id
#
test all invalid current_id
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=
"0"
)
# current_id must be as int
...
...
@@ -154,14 +180,14 @@ class UserDefinedRoleMakerTest(unittest.TestCase):
role
=
Role
.
SERVER
,
server_endpoints
=
[
"127.0.0.1:8080"
]
)
# if role is server, current_id must be less than len(server_endpoints)
#
#
test all invalid worker_num
# test all invalid worker_num
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_num
=
"1"
)
# worker_num must be as int
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_num
=
0
)
# worker_num must be greater than 0
#
#
test all invalid role
# test all invalid role
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
role
=
3
)
# role must be as Role(Role.WORKER=1, Role.SERVER=2)
...
...
@@ -174,7 +200,7 @@ class UserDefinedCollectiveRoleMakerTest(unittest.TestCase):
def
testRoleMaker
(
self
):
self
.
createRoleMaker
()
#
#
test all invalid worker_endpoints
# test all invalid worker_endpoints
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_endpoints
=
None
)
# worker_endpoints must be as list
...
...
@@ -190,7 +216,7 @@ class UserDefinedCollectiveRoleMakerTest(unittest.TestCase):
self
.
createRoleMaker
,
worker_endpoints
=
[
"127.0.0.1:8080"
,
"127.0.0.1:8080"
]
)
# element in worker_endpoints can't be duplicate
#
#
test all invalid current_id
# test all invalid current_id
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=
"0"
)
# current_id must be as int
...
...
python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
浏览文件 @
0b429a22
...
...
@@ -21,9 +21,9 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker
class
TestCloudRoleMaker
(
unittest
.
TestCase
):
def
setUp
(
self
):
os
.
environ
[
"PADDLE_PORT"
]
=
"36001"
os
.
environ
[
"PADDLE_PSERVERS"
]
=
"127.0.0.1,127.0.0.2"
os
.
environ
[
"PADDLE_TRAINERS_NUM"
]
=
"2"
os
.
environ
[
"PADDLE_PSERVERS_IP_PORT_LIST"
]
=
"127.0.0.1:36001,127.0.0.2:36001"
def
test_tr_rolemaker
(
self
):
os
.
environ
[
"TRAINING_ROLE"
]
=
"TRAINER"
...
...
@@ -39,6 +39,7 @@ class TestCloudRoleMaker(unittest.TestCase):
def
test_ps_rolemaker
(
self
):
os
.
environ
[
"TRAINING_ROLE"
]
=
"PSERVER"
os
.
environ
[
"POD_IP"
]
=
"127.0.0.1"
os
.
environ
[
"PADDLE_PORT"
]
=
"36001"
ro
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
False
)
ro
.
generate_role
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录