PaddlePaddle / PaddleDetection

Commit 565d3095 (unverified)
Authored May 09, 2019 by tangwei12; committed via GitHub on May 09, 2019
Reformat fleet API (#17135)
* fix some logic in distributed transpiler, test=develop
* reformat fleet API, test=develop
Parent: a88a1faa
Showing 7 changed files with 383 additions and 323 deletions (+383 −323)
python/paddle/fluid/incubate/fleet/base/fleet_base.py  +84 −74
python/paddle/fluid/incubate/fleet/base/role_maker.py  +94 −61
python/paddle/fluid/incubate/fleet/collective/__init__.py  +14 −32
python/paddle/fluid/incubate/fleet/parameter_server/distributed_transpiler/__init__.py  +97 −68
python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py  +91 −71
python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py  +2 −2
python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py  +1 −15
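The net effect of the reformat is easiest to see from the caller's side: the executor is handed to `fleet.init()` once, role information comes from a role maker, and endpoints and indices are queried through methods instead of attributes. Below is a minimal sketch of that call pattern reconstructed from the new signatures in this diff; the endpoint values and IDs are placeholders, and the worker/server lifecycle steps are only indicated in comments.

```python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker, Role
from paddle.fluid.incubate.fleet.parameter_server.distributed_transpiler import fleet

exe = fluid.Executor(fluid.CPUPlace())

# The new init() takes the executor first; the role maker carries the
# worker/server assignment (values below are placeholders).
role = UserDefinedRoleMaker(
    current_id=0,
    role=Role.WORKER,
    worker_num=2,
    server_endpoints=["127.0.0.1:6170", "127.0.0.1:6171"])
fleet.init(exe, role)

# Role and topology are now queried through methods rather than attributes.
print(fleet.is_worker())                       # True
print(fleet.worker_index())                    # 0
print(fleet.server_num())                      # 2
print(fleet.server_endpoints(to_string=True))  # "127.0.0.1:6170,127.0.0.1:6171"

# After DistributedOptimizer.minimize(), a worker would call fleet.init_worker()
# and a server fleet.init_server()/fleet.run_server(); none of these take an
# executor any more.
```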
python/paddle/fluid/incubate/fleet/base/fleet_base.py

```diff
@@ -15,18 +15,21 @@
 from __future__ import print_function
 import abc
-import sys
 from enum import Enum
 
 from paddle.fluid.optimizer import SGD
+from paddle.fluid.executor import Executor
 
-from role_maker import RoleMakerBase, Role
+from role_maker import RoleMakerBase
 from role_maker import MPISymetricRoleMaker
 from role_maker import UserDefinedRoleMaker
 
 
 class Mode(Enum):
+    """
+    There are various mode for fleet, each of them is designed for different model.
+    """
     TRANSPILER = 1,
     PSLIB = 2,
     COLLECTIVE = 3
@@ -46,17 +49,11 @@ class Fleet(object):
     def __init__(self, mode):
         assert isinstance(mode, Mode)
-        self.is_initialized = False
-        self.mode = mode
-        self.workers = 0
-        self.servers = 0
-        self.worker_endpoints = []
-        self.server_endpoints = []
-        self.role = Role.WORKER
-        self.current_endpoint = None
-        self.current_id = 0
-        self.optimizer = None
-        self.role_maker_ = None
+        self._is_initialized = False
+        self._mode = mode
+        self._optimizer = None
+        self._role_maker = None
+        self._executor = None
 
     def is_first_worker(self):
         """
@@ -66,25 +63,25 @@ class Fleet(object):
             bool: True if this is the first node of worker,
                   False if not.
         """
-        return self.is_worker() and self.current_id == 0
+        return self._role_maker.is_first_worker()
 
-    def worker_id(self):
+    def worker_index(self):
         """
-        Get current worker id.
+        Get current worker index.
 
         Returns:
             int: node id
         """
-        return self.current_id
+        return self._role_maker.worker_index()
 
-    def get_workers(self):
+    def worker_num(self):
         """
         Get current total worker number.
 
         Returns:
             int: worker number
         """
-        return self.workers
+        return len(self._role_maker.get_trainer_endpoints())
 
     def is_worker(self):
         """
@@ -94,7 +91,51 @@ class Fleet(object):
             bool: True if this is a node of worker,
                   False if not.
         """
-        return self.role == Role.WORKER
+        return self._role_maker.is_worker()
+
+    def worker_endpoints(self, to_string=False):
+        """
+        Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"].
+
+        Returns:
+            list/string: server endpoints
+        """
+        if to_string:
+            return ",".join(self._role_maker.get_trainer_endpoints())
+        else:
+            return self._role_maker.get_trainer_endpoints()
+
+    def server_num(self):
+        """
+        Get current total worker number.
+
+        Returns:
+            int: server number
+        """
+        return len(self._role_maker.get_pserver_endpoints())
+
+    def server_index(self):
+        """
+        Get current server index.
+
+        Returns:
+            int: node id
+        """
+        return self._role_maker.server_index()
+
+    def server_endpoints(self, to_string=False):
+        """
+        Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"].
+
+        Returns:
+            list/string: server endpoints
+        """
+        if to_string:
+            return ",".join(self._role_maker.get_pserver_endpoints())
+        else:
+            return self._role_maker.get_pserver_endpoints()
 
     def is_server(self):
         """
@@ -104,7 +145,7 @@ class Fleet(object):
             bool: True if this is a node of server,
                   False if not.
         """
-        return self.role == Role.SERVER
+        return self._role_maker.is_server()
 
     def split_files(self, files):
         """
@@ -119,8 +160,8 @@ class Fleet(object):
             list: files belongs to this worker.
         """
         file_num = len(files)
-        trainer_id = self.worker_id()
-        trainer_num = self.get_workers()
+        trainer_id = self.worker_index()
+        trainer_num = self.worker_num()
         if trainer_num > file_num:
             raise ValueError("trainer_num should be <= file_num : "
                              "%s > %s" % (trainer_num, file_num))
@@ -132,66 +173,49 @@ class Fleet(object):
             end += length
         return files[start:end]
 
-    def init(self, role_maker=None):
+    def init(self, executor, role_maker=None):
         """
         should be called only once in user's python scripts,
         init() will initialize RoleMaker which is used for identifying
             current node's role, e.g. worker, server, etc.
 
         Args:
+            executor(Executor): The executor to run fleet.
             role_maker(RoleMakerBase): subclass of RoleMakerBase.
 
         Returns:
             None
         """
+        if not isinstance(executor, Executor):
+            raise ValueError("executor must be an instance of Executor")
+
         if role_maker and not isinstance(role_maker, RoleMakerBase):
             raise ValueError("role_maker must be an instance of RoleMakerBase")
 
-        self.role_maker_ = role_maker
-
         if isinstance(role_maker, MPISymetricRoleMaker):
-            self.role_maker_._generate_role()
-            self.role = Role.WORKER if role_maker._is_worker() else Role.SERVER
-            self.workers = role_maker._worker_num()
-            self.servers = role_maker._server_num()
-            self.server_endpoints = role_maker._get_pserver_endpoints()
-            self.worker_endpoints = role_maker._get_trainer_endpoints()
-            self.current_id = role_maker._worker_index(
-            ) if role_maker._is_worker() else role_maker._server_index()
-            self.current_endpoint = self.worker_endpoints[self.current_id] \
-                if role_maker._is_worker() else self.server_endpoints[self.current_id]
+            self._role_maker = role_maker
+            self._role_maker.generate_role()
 
         elif isinstance(role_maker, UserDefinedRoleMaker):
-            self.current_id = role_maker.current_id
-            self.current_endpoint = role_maker.current_endpoint
-            self.workers = role_maker.workers
-            self.worker_endpoints = role_maker.worker_endpoints
-            self.servers = role_maker.servers
-            self.server_endpoints = role_maker.server_endpoints
-            self.role = role_maker.role
+            self._role_maker = role_maker
 
         else:
             raise ValueError(
                 "role_maker must be an instance of UserDefinedRoleMaker/MPISymetricRoleMaker"
            )
 
-        self.is_initialized = True
+        self._is_initialized = True
 
     @abc.abstractmethod
-    def init_worker(self, executor):
+    def init_worker(self):
         pass
 
     @abc.abstractmethod
-    def run_worker(self, executor, main_program=None):
+    def init_server(self, model_dir=None):
         pass
 
     @abc.abstractmethod
-    def init_server(self, executor, model_dir=None):
-        pass
-
-    @abc.abstractmethod
-    def run_server(self, executor):
+    def run_server(self, ):
         pass
 
     @abc.abstractmethod
@@ -199,7 +223,7 @@ class Fleet(object):
         pass
 
     @abc.abstractmethod
-    def stop(self, executor):
+    def stop(self):
         pass
 
     @abc.abstractmethod
@@ -208,7 +232,6 @@ class Fleet(object):
     @abc.abstractmethod
     def save_inference_model(self,
-                             executor,
                              dirname,
                              feeded_var_names,
                              target_vars,
@@ -217,21 +240,9 @@ class Fleet(object):
         pass
 
     @abc.abstractmethod
-    def save_persistables(self, executor, dirname, main_program=None):
+    def save_persistables(self, dirname, main_program=None):
         pass
 
-    def to_string(self):
-        infos = """
-        mode = {}
-        workers = {}
-        server_endpoints = {}
-        role = {}
-        current_endpoint = {}
-        current_id = {}
-        """.format(self.mode, self.workers, self.server_endpoints, self.role,
-                   self.current_endpoint, self.current_id)
-        return infos
-
 
 class DistributedOptimizer(object):
     """
@@ -245,7 +256,7 @@ class DistributedOptimizer(object):
     Args:
         optimizer(Optimizer): subclass of Optimizer.
-        strategy(dict): the user define config for Optimizer.
+        strategy(any): the user define config for Optimizer.
 
     Returns:
         None
@@ -257,9 +268,6 @@ class DistributedOptimizer(object):
         if not isinstance(optimizer, SGD.__bases__):
             raise ValueError("optimizer must be an instance of Optimizer")
 
-        if strategy and not isinstance(strategy, dict):
-            raise ValueError("strategy must be an instance of Dict")
-
         self._optimizer = optimizer
         self._strategy = strategy
@@ -317,8 +325,9 @@ class DistributedOptimizer(object):
     @abc.abstractmethod
     def minimize(self,
-                 loss,
-                 startup_program=None,
+                 losses,
+                 scopes=None,
+                 startup_programs=None,
                  parameter_list=None,
                  no_grad_set=None):
         """
@@ -328,8 +337,9 @@ class DistributedOptimizer(object):
         `apply_gradients()` into one.
 
         Args:
-            loss (Variable): loss variable to run optimizations.
-            startup_program (Program): startup_program for initializing parameters
+            losses (Variable|Variable List): loss variable to run optimizations.
+            scopes (Scope| Scope List): scope instance.
+            startup_programs (Program|Program List): startup_program for initializing parameters
                 in `parameter_list`.
             parameter_list (list): list of Variables to update.
             no_grad_set (set|None): set of Variables should be ignored.
```
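The abstract `DistributedOptimizer.minimize()` contract now takes `losses`, `scopes` and `startup_programs`, each either a single object or a list. The toy subclass below is written only to illustrate the shape of that contract; it is not part of Paddle, and it assumes the base class has no abstract methods beyond `backward`, `apply_gradients` and `minimize`.

```python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer


class PassthroughOptimizer(DistributedOptimizer):
    """Illustrative only: delegates to the wrapped single-node optimizer."""

    def backward(self, loss, startup_program=None, parameter_list=None,
                 no_grad_set=None, callbacks=None):
        return self._optimizer.backward(loss, startup_program, parameter_list,
                                        no_grad_set, callbacks)

    def apply_gradients(self, params_grads):
        return self._optimizer.apply_gradients(params_grads)

    def minimize(self,
                 losses,
                 scopes=None,
                 startup_programs=None,
                 parameter_list=None,
                 no_grad_set=None):
        # the reformatted signature: a single loss or a list of losses
        if not isinstance(losses, list):
            losses = [losses]
        return [self._optimizer.minimize(loss) for loss in losses]


# Wraps any fluid Optimizer, e.g.:
# opt = PassthroughOptimizer(fluid.optimizer.SGD(learning_rate=0.1))
```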
python/paddle/fluid/incubate/fleet/base/role_maker.py

```diff
@@ -11,10 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
+from __future__ import print_function
 from enum import Enum
 
+__all__ = [
+    'Role', 'RoleMakerBase', 'MPISymetricRoleMaker', 'UserDefinedRoleMaker'
+]
+
 
 class Role(Enum):
     WORKER = 1,
@@ -30,47 +34,62 @@ class RoleMakerBase(object):
     """
 
     def __init__(self):
-        self._trainer_endpoints = []
-        self._pserver_endpoints = []
+        self._worker_endpoints = []
+        self._server_endpoints = []
         self._role_is_generated = False
+        self._role = None
+        self._current_id = -1
 
-    def _is_worker(self):
+    def is_worker(self):
         """
         return is_worker() of current process
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def _is_server(self):
+    def is_server(self):
         """
         return is_server() of current process
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def _get_local_ip(self):
+    def is_first_worker(self):
         """
-        return get local ip
+        Check whether the node is the first instance of worker.
+        Returns:
+            bool: True if this is the first node of worker,
+                  False if not.
         """
-        import socket
-        self._ip = socket.gethostbyname(socket.gethostname())
-        return self._ip
+        raise NotImplementedError("Please implement this method in child class")
 
-    def _get_trainer_endpoints(self):
+    def worker_index(self):
         """
-        return trainer endpoints
+        Get current worker id.
+        Returns:
+            int: node id
         """
-        return self._trainer_endpoints
+        raise NotImplementedError("Please implement this method in child class")
 
-    def _get_pserver_endpoints(self):
+    def server_index(self):
         """
-        return pserver endpoints
+        Get current server id.
+        Returns:
+            int: node id
         """
-        return self._pserver_endpoints
+        raise NotImplementedError("Please implement this method in child class")
 
-    def _generate_role(self):
+    def get_trainer_endpoints(self):
         """
-        generate_role() should be called to identify current process's role
+        return trainer endpoints
         """
-        raise NotImplementedError("Please implement this method in child class")
+        return self._worker_endpoints
+
+    def get_pserver_endpoints(self):
+        """
+        return pserver endpoints
+        """
+        return self._server_endpoints
 
 
 class MPIRoleMaker(RoleMakerBase):
@@ -82,9 +101,11 @@ class MPIRoleMaker(RoleMakerBase):
     def __init__(self):
         super(MPIRoleMaker, self).__init__()
         from mpi4py import MPI
-        self._comm = MPI.COMM_WORLD
         self.MPI = MPI
+        self._comm = MPI.COMM_WORLD
+        self._node_type_comm = None
         self._ips = None
+        self._ip = None
 
     def _get_rank(self):
         """
@@ -111,7 +132,7 @@ class MPIRoleMaker(RoleMakerBase):
         """
         worker_gather(obj) will call MPI's allgather function
         """
-        if self._is_worker():
+        if self.is_worker():
             self._node_type_comm.barrier()
             return self._node_type_comm.allgather(obj)
         return None
@@ -122,19 +143,33 @@ class MPIRoleMaker(RoleMakerBase):
         """
         self._comm.barrier()
 
+    def _finalize(self):
+        """
+        finalize the current MPI instance.
+        """
+        pass
+
     def _get_ips(self):
         """
         collect current distributed job's ip list
         """
-        if self._ips == None:
-            self._ips = self._comm.allgather(self._get_local_ip())
+        if not self._ips:
+            self._ips = self._comm.allgather(self.get_local_ip())
         return self._ips
 
-    def _finalize(self):
+    def get_local_ip(self):
         """
-        finalize the current MPI instance.
+        return get local ip
         """
-        pass
+        import socket
+        self._ip = socket.gethostbyname(socket.gethostname())
+        return self._ip
+
+    def generate_role(self):
+        """
+        generate_role() should be called to identify current process's role
+        """
+        raise NotImplementedError("Please implement this method in child class")
 
 
 class MPISymetricRoleMaker(MPIRoleMaker):
@@ -151,20 +186,18 @@ class MPISymetricRoleMaker(MPIRoleMaker):
     def _check_role_generation(self):
         if not self._role_is_generated:
-            sys.stderr.write("generate_role() should be called first")
-            sys.exit(-1)
-            return False
+            raise NameError("generate_role() should be called first")
         return True
 
-    def _is_first_worker(self):
+    def is_first_worker(self):
         """
         return whether current process is the first worker assigned by role maker
         """
         if self._check_role_generation():
-            return self._is_worker() and 0 == self._worker_index()
+            return self.is_worker() and 0 == self.worker_index()
         return False
 
-    def _is_worker(self):
+    def is_worker(self):
         """
         return whether current process is worker assigned by role maker
         """
@@ -172,7 +205,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
             return self._node_type == 1
         return False
 
-    def _is_server(self):
+    def is_server(self):
         """
         return whether current process is server assigned by role maker
         """
@@ -185,7 +218,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return the current number of worker
         """
         if self._check_role_generation():
-            if self._is_worker():
+            if self.is_worker():
                 return self._get_size() / 2
         return 0
@@ -194,11 +227,11 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return the current number of server
         """
         if self._check_role_generation():
-            if self._is_server():
+            if self.is_server():
                 return self._get_size() / 2
         return 0
 
-    def _worker_index(self):
+    def worker_index(self):
         """
         return the index of worker
         """
@@ -206,7 +239,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
             return self._rank / self._proc_per_node
         return 0
 
-    def _server_index(self):
+    def server_index(self):
         """
         return the index of server
         """
@@ -219,7 +252,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         barrier all workers in current distributed job
         """
         if self._check_role_generation():
-            if self._is_worker():
+            if self.is_worker():
                 self._node_type_comm.barrier()
 
     def _barrier_server(self):
@@ -227,17 +260,17 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         barrier all servers in current distributed job
         """
         if self._check_role_generation():
-            if self._is_server():
+            if self.is_server():
                 self._node_type_comm.barrier()
 
-    def _generate_role(self):
+    def generate_role(self):
         """
         generate currently process's role
         """
         if not self._role_is_generated:
             # TODO(guru4elephant): only allow to be called once
-            self._trainer_endpoints = self._get_ips()
-            self._pserver_endpoints = self._get_ips()
+            self._worker_endpoints = self._get_ips()
+            self._server_endpoints = self._get_ips()
             if 0 == self._get_rank() % self._proc_per_node % 2:
                 self._node_type = 0
@@ -250,12 +283,9 @@ class MPISymetricRoleMaker(MPIRoleMaker):
 
 class UserDefinedRoleMaker(RoleMakerBase):
     def __init__(self,
                  current_id=0,
-                 current_endpoint=None,
-                 workers=0,
-                 worker_endpoints=None,
-                 servers=0,
-                 server_endpoints=None,
-                 role=Role.WORKER):
+                 role=Role.WORKER,
+                 worker_num=0,
+                 server_endpoints=None):
         """
         UserDefinedRoleMaker is designed for worker and server assignment
         under manual. Typically, a worker and a server node will be appointed
@@ -263,19 +293,22 @@ class UserDefinedRoleMaker(RoleMakerBase):
         """
         super(UserDefinedRoleMaker, self).__init__()
-        self.current_id = current_id
-        self.current_endpoint = current_endpoint
-        self.workers = workers
-        self.worker_endpoints = worker_endpoints
-        self.servers = servers
-        self.server_endpoints = server_endpoints
-        self.role = role
+        self._current_id = current_id
+        self._role = role
+        self._worker_num = worker_num
+        self._server_endpoints = server_endpoints
+
+    def is_worker(self):
+        return self._role == Role.WORKER
+
+    def is_server(self):
+        return self._role == Role.SERVER
 
-    def _is_worker(self):
-        return self.role == Role.WORKER
+    def is_first_worker(self):
+        return self._role == Role.WORKER and self._current_id == 0
 
-    def _is_server(self):
-        return self.role == Role.SERVER
+    def worker_index(self):
+        return self._current_id
 
-    def _generate_role(self):
-        self.role_is_generated_ = True
+    def server_index(self):
+        return self._current_id
```
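With the renamed public methods, a role maker can be exercised on its own. The sketch below only uses what this diff defines; the endpoints and IDs are placeholders.

```python
from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker, Role

# Describe "server 1 of a 2-worker / 2-server job" by hand.
rm = UserDefinedRoleMaker(
    current_id=1,
    role=Role.SERVER,
    worker_num=2,
    server_endpoints=["127.0.0.1:6170", "127.0.0.1:6171"])

print(rm.is_server())              # True
print(rm.is_first_worker())        # False (not a worker at all)
print(rm.server_index())           # 1
print(rm.get_pserver_endpoints())  # ['127.0.0.1:6170', '127.0.0.1:6171']
```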
python/paddle/fluid/incubate/fleet/collective/__init__.py

```diff
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-import sys
 import logging
 
 import paddle.fluid as fluid
@@ -26,37 +25,21 @@ from ..base.fleet_base import DistributedOptimizer
 class Collective(Fleet):
     def __init__(self):
         super(Collective, self).__init__(Mode.COLLECTIVE)
-        self.local_ip_ = 0
+        self._local_ip = 0
 
-    def init(self, role_maker=None):
-        """
-        should be called only once in user's python scripts,
-        init() will initialize RoleMaker which is used for identifying
-            current node's role, e.g. worker, server, etc.
-
-        Args:
-            role_maker(RoleMakerBase): subclass of RoleMakerBase.
-
-        Returns:
-            None
-        """
-        super(Collective, self).init(role_maker)
-        self._role_maker._generate_role()
-
-    def init_worker(self, executor):
+    def init_worker(self):
         logging.warn(
             "You should not call 'init_worker' method for collective mode.")
 
-    def run_worker(self, executor, main_program=None):
+    def run_worker(self, main_programs=None, scopes=None):
         logging.warn(
             "You should not call 'run_worker' method for collective mode.")
 
-    def init_server(self, executor, model_dir=None):
+    def init_server(self, model_dir=None):
         logging.warn(
             "You should not call 'init_server' method for collective mode.")
 
-    def run_server(self, executor):
+    def run_server(self):
         logging.warn(
             "You should not call 'run_server' method for collective mode.")
@@ -64,29 +47,28 @@ class Collective(Fleet):
         logging.warn(
             "You should not call 'stop_worker' method for collective mode.")
 
-    def stop(self, executor):
+    def stop(self):
         """
         stop(): will be called after a user finishes his/her training task.
         """
         logging.warn("You should not call 'stop' method for collective mode.")
 
     def distributed_optimizer(self, optimizer, strategy=None):
-        self.optimizer = CollectiveOptimizer(optimizer, strategy)
-        return self.optimizer
+        self._optimizer = CollectiveOptimizer(optimizer, strategy)
+        return self._optimizer
 
     def save_inference_model(self,
-                             executor,
                              dirname,
                              feeded_var_names=None,
                              target_vars=None,
                              main_program=None,
                              export_for_deployment=True):
         io.save_inference_model(dirname, feeded_var_names, target_vars,
-                                executor, main_program, None, None,
+                                self._executor, main_program, None, None,
                                 export_for_deployment)
 
-    def save_persistables(self, executor, dirname, main_program=None):
-        io.save_persistables(executor, dirname, main_program, None)
+    def save_persistables(self, dirname, main_program=None):
+        io.save_persistables(self._executor, dirname, main_program, None)
 
 
 fleet = Collective()
@@ -143,9 +125,9 @@ class CollectiveOptimizer(DistributedOptimizer):
         optimize_ops, param_grads = self._optimizer.minimize(
             loss, startup_program, parameter_list, no_grad_set)
 
-        worker_endpoints = fleet.worker_endpoints
-        trainer_id = fleet.current_id
-        current_endpoint = fleet.current_endpoint
+        worker_endpoints = fleet.worker_endpoints()
+        trainer_id = fleet.worker_index()
+        current_endpoint = fleet.worker_endpoints()[trainer_id]
         startup_program = startup_program if startup_program else \
             fluid.framework.default_startup_program
```
python/paddle/fluid/incubate/fleet/parameter_server/distributed_transpiler/__init__.py

```diff
@@ -12,12 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-import sys
-
-from paddle.fluid.executor import Executor
-from paddle.fluid.framework import Program
-from paddle.fluid.framework import default_main_program
 from paddle.fluid.framework import default_startup_program
 from paddle.fluid.optimizer import Optimizer
@@ -27,7 +22,6 @@ import paddle.fluid.io as io
 from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
 from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspiler as OriginTranspiler
-from ...base.role_maker import Role
 from ...base.fleet_base import Fleet
 from ...base.fleet_base import Mode
 from ...base.fleet_base import DistributedOptimizer
@@ -44,101 +38,75 @@ class DistributedTranspiler(Fleet):
         self._startup_program = None
         self._main_program = None
 
-    def init_worker(self, executor):
+    def init_worker(self):
         """
         `init_worker` has many many functions to do before training,
         first, wait for all parameter servers launch completely.
         second, run executor to initialize startup program
         third, wait for all worker initialize completely.
 
-        Args:
-            executor(Executor): The executor to run for init startup program.
-
         Returns:
             None
         """
-        if not isinstance(executor, Executor):
-            raise ValueError("executor must be an instance of Executor")
-
-        if not self._startup_program:
-            raise ValueError(
-                "startup_program is None, need invoke DistributedOptimizer.minimize first"
-            )
-
-        executor.run(self._startup_program)
+        pass
 
-    def run_worker(self, executor, main_program=None):
+    def run_worker(self, main_programs=None, scopes=None):
         pass
 
-    def init_server(self, executor, model_dir=None):
+    def init_server(self, model_dir=None):
         """
         `init_server` has many many functions to do before start pserver,
         first, run executor to initialize startup program,
         second, if the `model_dir` is not empty, it will load parameters from it for increment training.
 
         Args:
-            executor(Executor): The executor to run for init server.
             model_dir(str): The directory path.
 
         Returns:
             None
         """
-        if not isinstance(executor, Executor):
-            raise ValueError("executor must be an instance of Executor")
-
         if not self._startup_program:
             raise ValueError(
                 "startup_program is None, need invoke DistributedOptimizer.minimize first"
             )
 
-        executor.run(self._startup_program)
+        self._executor.run(self._startup_program)
 
         if model_dir:
             if not os.path.isdir(model_dir):
                 raise ValueError("There is no directory named '%s'", model_dir)
 
-            io.load_persistables(executor, model_dir, self._startup_program)
+            io.load_persistables(self._executor, model_dir,
                                  self._startup_program)
 
-    def run_server(self, executor):
+    def run_server(self):
         """
         `run_server` execute executor to start pserver main program.
 
-        Args:
-            executor(Executor): The executor to run for init server.
-
         Returns:
             None
         """
-        if not isinstance(executor, Executor):
-            raise ValueError("executor must be an instance of Executor")
-
         if not self._main_program:
             raise ValueError(
                 "main_program is None, need invoke DistributedOptimizer.minimize first"
             )
 
-        executor.run(self._main_program)
+        self._executor.run(self._main_program)
 
     def stop_worker(self):
         pass
 
-    def stop(self, executor):
+    def stop(self):
         """
         Close this executor.
 
         For the distributed training, this method would free the resource on PServers related to
         the current Trainer.
 
-        Args:
-            executor(Executor): The executor to run for init server.
-
         Returns:
             None
         """
-        if not isinstance(executor, Executor):
-            raise ValueError("executor must be an instance of Executor")
-        executor.close()
+        self._executor.close()
 
     def distributed_optimizer(self, optimizer, strategy=None):
         """
@@ -157,11 +125,10 @@ class DistributedTranspiler(Fleet):
         if not isinstance(optimizer, Optimizer):
             raise ValueError("optimizer must be an instance of Optimizer")
 
-        self.optimizer = TranspilerOptimizer(optimizer, strategy)
-        return self.optimizer
+        self._optimizer = TranspilerOptimizer(optimizer, strategy)
+        return self._optimizer
 
     def save_inference_model(self,
-                             executor,
                              dirname,
                              feeded_var_names,
                              target_vars,
@@ -172,10 +139,10 @@ class DistributedTranspiler(Fleet):
         and then save it and all related parameters to given `dirname` by the `executor`.
         """
         io.save_inference_model(dirname, feeded_var_names, target_vars,
-                                executor, main_program, None, None,
+                                self._executor, main_program, None, None,
                                 export_for_deployment)
 
-    def save_persistables(self, executor, dirname, main_program=None):
+    def save_persistables(self, dirname, main_program=None):
         """
         This function filters out all variables with `persistable==True` from the
         give `main_program` and then saves these variables to the folder `dirname`
@@ -186,38 +153,56 @@ class DistributedTranspiler(Fleet):
         files, set `filename` None; if you would like to save all variables in a
         single file, use `filename` to specify the file name.
         """
-        io.save_persistables(executor, dirname, main_program, None)
+        io.save_persistables(self._executor, dirname, main_program, None)
 
     def _transpile(self, config):
-        if not isinstance(config, DistributeTranspilerConfig):
-            raise ValueError(
-                "config must be an instance of DistributeTranspilerConfig")
-
         self._transpiler = OriginTranspiler(config)
         self._transpiler.transpile(
-            trainer_id=fleet.worker_id(),
-            pservers=fleet.server_endpoints,
+            trainer_id=fleet.worker_index(),
+            pservers=fleet.server_endpoints(to_string=True),
             trainers=fleet.worker_num())
 
-        if self.role == Role.WORKER:
+        if self.is_worker():
             self._main_program = self._transpiler.get_trainer_program()
             self._startup_program = default_startup_program()
         else:
             self._main_program, self._startup_program = \
-                self._transpiler.get_pserver_programs(self.current_endpoint)
+                self._transpiler.get_pserver_programs(
+                    self.server_endpoints(self.server_index()))
 
 
 fleet = DistributedTranspiler()
 
 
 class TranspilerOptimizer(DistributedOptimizer):
+    """
+    DistributedOptimizer is a wrapper for paddle.fluid.optimizer
+    A user should pass a paddle.fluid.optimizer to DistributedOptimizer
+    minimize() function is implemented.
+    DistributedOptimizer is the starting point for a user who wants to
+    run distributed training. The optimized information will be stored in
+    Fleet() instance who holds the global information about current distributed
+    training.
+
+    Args:
+        optimizer(Optimizer): subclass of Optimizer.
+        strategy(DistributeTranspilerConfig): instance of DistributeTranspilerConfig.
+
+    Returns:
+        None
+    """
+
     def __init__(self, optimizer, strategy=None):
         super(TranspilerOptimizer, self).__init__(optimizer, strategy)
 
-        if strategy and not isinstance(strategy, DistributeTranspilerConfig):
-            raise ValueError(
-                "In {} mode, strategy must be an instance of DistributeTranspilerConfig".
-                format(fleet.mode))
+        if strategy:
+            if not isinstance(strategy, DistributeTranspilerConfig):
+                raise ValueError(
+                    "In {} mode, strategy must be an instance of DistributeTranspilerConfig".
+                    format(fleet._mode))
+            else:
+                self._strategy = strategy
+        else:
+            self._strategy = DistributeTranspilerConfig()
 
     def backward(self,
                  loss,
@@ -225,24 +210,68 @@ class TranspilerOptimizer(DistributedOptimizer):
                  parameter_list=None,
                  no_grad_set=None,
                  callbacks=None):
+        """
+        First part of `minimize`, do auto-diff to append backward ops for
+        the current program.
+
+        Args:
+            loss (Variable): loss variable to run optimizations.
+            startup_program (Program): startup_program for initializing parameters
+                in `parameter_list`.
+            parameter_list (list): list of Variables to update.
+            no_grad_set (set|None): set of Variables should be ignored.
+            callbacks (list|None): list of callables to run when appending backward
+                operator for one parameter.
+
+        Return:
+            list: list of (param, grad) pair, grad is the output of backward.
+
+        Examples:
+            See examples in `apply_gradients`.
+        """
         return self._optimizer.backward(loss, startup_program, parameter_list,
                                         no_grad_set, callbacks)
 
     def apply_gradients(self, params_grads):
+        """
+        Second part of `minimize`, appending optimization operators for
+        given `params_grads` pairs.
+
+        Args:
+            params_grads (list): list of (param, grad) pair to do optimization.
+
+        Returns:
+            list: A list of operators appended to the current program.
+
+        Examples:
+            .. code-block:: python
+
+                loss = network()
+                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+                params_grads = optimizer.backward(loss)
+                # you may append operations for params_grads here
+                # ...
+                optimizer.apply_gradients(params_grads)
+        """
         return self._optimizer.apply_gradients(params_grads)
 
     def minimize(self,
                  loss,
+                 scope=None,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
-        optimize_ops, params_grads = self._optimizer.minimize(
-            loss, startup_program, parameter_list, no_grad_set)
-        self.transpile()
-        return optimize_ops, params_grads
-
-    def transpile(self):
-        if self._strategy is None:
-            self._strategy = DistributeTranspilerConfig()
+        if isinstance(loss, list):
+            raise ValueError(
+                "DistributedTranspiler's minimize can not accept loss with list")
 
-        fleet._transpile(config=self._strategy)
+        if isinstance(startup_program, list):
+            raise ValueError(
+                "DistributedTranspiler's minimize can not accept program with list")
+
+        optimize_ops, params_grads = self._optimizer.minimize(
+            loss, startup_program, parameter_list, no_grad_set)
+
+        fleet._transpile(config=self._strategy)
+
+        return optimize_ops, params_grads
```
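One behavioural consequence of the reworked `TranspilerOptimizer.__init__` is that the strategy is validated and defaulted up front, and `minimize()` now refuses lists of losses or programs. A small hedged sketch of configuring it (the learning rate and the config field are just examples):

```python
import paddle.fluid as fluid
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
from paddle.fluid.incubate.fleet.parameter_server.distributed_transpiler import fleet

config = DistributeTranspilerConfig()
config.slice_var_up = False  # any DistributeTranspilerConfig field works here

# Passing strategy=None would fall back to a default DistributeTranspilerConfig().
optimizer = fleet.distributed_optimizer(
    fluid.optimizer.SGD(learning_rate=0.01), strategy=config)

# optimizer.minimize(loss) expects a single loss Variable and a single startup
# program; passing a list for either now raises ValueError.
```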
python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py

```diff
@@ -28,63 +28,56 @@ class PSLib(Fleet):
     def __init__(self):
         super(PSLib, self).__init__(Mode.PSLIB)
         self._opt_info = None
-        self.local_ip_ = 0
+        self._local_ip = 0
         self._fleet_ptr = None
+        self._main_programs = []
+        self._scopes = []
 
-    def init(self, role_maker=None):
-        super(PSLib, self).init(MPISymetricRoleMaker())
+    def init(self, executor, role_maker=None):
+        super(PSLib, self).init(executor, MPISymetricRoleMaker())
         self._fleet_ptr = fluid.core.Fleet()
 
-    def init_worker(self, executor):
-        pass
-
-    def run_worker(self, executor, main_program=None):
+    def init_worker(self):
         """
         init_worker(): will be called by user. When a user knows current process is_server(), he/she
             should call init_worker() to initialize global information about worker and connect
             worker with pserver. You should run startup program before init_worker.
 
         Args:
-            executor(Executor): The executor to run for init server.
-            programs(Program|None): The program that need to run.
+            programs(Program|list): a Program or a list of Programs
+            scopes(Scope|list): a Scope or a list of Scopes, default None.
         """
-        if not isinstance(main_program, Program):
-            raise ValueError("main_program must be an instance of Program")
-        programs = [main_program]
-        scopes = [fluid.global_scope()] * len(programs)
-        if len(scopes) != len(programs):
-            print(
-                "You should make sure len(scopes) == len(programs) or set scopes None"
-            )
-            sys.exit(-1)
+        if len(self._main_programs) == 0:
+            raise ValueError(
+                "You should run DistributedOptimizer.minimize() first")
 
         if self._opt_info:
             if "fleet_desc" in self._opt_info:
                 self._dist_desc_str = text_format.MessageToString(
                     self._opt_info["fleet_desc"])
                 self._dist_desc = self._opt_info["fleet_desc"]
             else:
-                print("You should run DistributedOptimizer.minimize() first")
-                sys.exit(-1)
+                raise Exception(
                    "You should run DistributedOptimizer.minimize() first")
             # barrier_all for init_server, wait for server starts
-            self.role_maker_._barrier_all()
-            self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
+            self._role_maker._barrier_all()
+            self.all_ips_ = self._role_maker._all_gather(self._local_ip)
             self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
-                                        self.role_maker_._get_size(),
-                                        self.role_maker_._get_rank())
+                                        self._role_maker._get_size(),
+                                        self._role_maker._get_rank())
             # barrier_all for init_worker
-            self.role_maker_._barrier_all()
+            self._role_maker._barrier_all()
             # prepare for client to client communication
             info = self._fleet_ptr.get_clients_info()
-            all_info = self.role_maker_._worker_gather(info[0])
+            all_info = self._role_maker._worker_gather(info[0])
             self._fleet_ptr.gather_clients(all_info)
             self._fleet_ptr.create_client2client_connection()
             # barrier for init model
-            self.role_maker_._barrier_worker()
-            if self.role_maker_._is_first_worker():
+            self._role_maker._barrier_worker()
+            if self._role_maker.is_first_worker():
                 tables = self._dist_desc.trainer_param.dense_table
-                for prog, scope in zip(programs, scopes):
+                for prog, scope in zip(self._main_programs, self._scopes):
                     prog_id = str(id(prog))
                     prog_conf = self._opt_info['program_configs'][prog_id]
                     prog_tables = {}
@@ -100,24 +93,23 @@ class PSLib(Fleet):
                     for i in range(0, len(table.dense_variable_name)):
                         var_name = table.dense_variable_name[i]
                         if scope.find_var(var_name) is None:
-                            print("var " + var_name +
-                                  " not found in scope, "
-                                  "you should run startup program first")
-                            sys.exit(-1)
+                            raise ValueError(
+                                "var " + var_name + " not found in scope, " +
+                                "you should run startup program first")
                         var_name_list.append(var_name)
                     self._fleet_ptr.init_model(scope,
                                                int(table.table_id),
                                                var_name_list)
             # barrier for init model done
-            self.role_maker_._barrier_worker()
+            self._role_maker._barrier_worker()
         else:
             raise NameError(
                 "You should run DistributedOptimizer.minimize() first")
 
-    def init_server(self, executor, model_dir=None):
+    def init_server(self, model_dir=None):
         pass
 
-    def run_server(self, executor):
+    def run_server(self):
         """
         init_pserver(): will be called by user. When a user knows current process is_worker(), he/she
             should call init_pserver() to initialize global information about parameter server
@@ -128,22 +120,22 @@ class PSLib(Fleet):
                     self._opt_info["fleet_desc"])
                 self._dist_desc = self._opt_info["fleet_desc"]
             else:
-                print("You should run DistributedOptimizer.minimize() first")
-                sys.exit(-1)
+                raise Exception(
                    "You should run DistributedOptimizer.minimize() first")
             self._fleet_ptr.init_server(self._dist_desc_str,
-                                        self.role_maker_._get_rank())
-            self.local_ip_ = self._fleet_ptr.run_server()
+                                        self._role_maker._get_rank())
+            self._local_ip = self._fleet_ptr.run_server()
             # barrier_all for init_server
-            self.role_maker_._barrier_all()
-            self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
+            self._role_maker._barrier_all()
+            self.all_ips_ = self._role_maker._all_gather(self._local_ip)
             self._fleet_ptr.gather_servers(self.all_ips_,
-                                           self.role_maker_._get_size())
+                                           self._role_maker._get_size())
             # barrier_all for init_worker, wait all workers start
-            self.role_maker_._barrier_all()
+            self._role_maker._barrier_all()
         else:
-            raise NameError(
+            raise Exception(
                 "You should run DistributedOptimizer.minimize() first")
 
     def stop_worker(self):
@@ -151,31 +143,30 @@ class PSLib(Fleet):
         stop(): will be called after a user finishes his/her training task. Fleet instance will be
             destroyed when stop() is called.
         """
-        self.role_maker_._barrier_worker()
-        if self.role_maker_._is_first_worker():
+        self._role_maker._barrier_worker()
+        if self._role_maker.is_first_worker():
             self._fleet_ptr.stop_server()
-        self.role_maker_._barrier_worker()
-        self.role_maker_._barrier_all()
-        self.role_maker_._finalize()
+        self._role_maker._barrier_worker()
+        self._role_maker._barrier_all()
+        self._role_maker._finalize()
 
-    def stop(self, executor):
+    def stop(self):
         """
         stop(): will be called after a user finishes his/her training task. Fleet instance will be
             destroyed when stop() is called.
         """
-        self.role_maker_._barrier_worker()
-        if self.role_maker_._is_first_worker():
+        self._role_maker._barrier_worker()
+        if self._role_maker.is_first_worker():
             self._fleet_ptr.stop_server()
-        self.role_maker_._barrier_worker()
-        self.role_maker_._barrier_all()
-        self.role_maker_._finalize()
+        self._role_maker._barrier_worker()
+        self._role_maker._barrier_all()
+        self._role_maker._finalize()
 
     def distributed_optimizer(self, optimizer, strategy=None):
-        self.optimizer = DownpourOptimizer(optimizer, strategy)
-        return self.optimizer
+        self._optimizer = DownpourOptimizer(optimizer, strategy)
+        return self._optimizer
 
     def save_inference_model(self,
-                             executor,
                              dirname,
                              feeded_var_names=None,
                              target_vars=None,
@@ -186,7 +177,7 @@ class PSLib(Fleet):
         """
         self._fleet_ptr.save_model(dirname)
 
-    def save_persistables(self, executor, dirname, main_program=None):
+    def save_persistables(self, dirname, main_program=None):
         self._fleet_ptr.save_model(dirname)
 
     def _set_opt_info(self, opt_info):
@@ -208,6 +199,13 @@ class DownpourOptimizer(DistributedOptimizer):
     run distributed training. The optimized information will be stored in
     Fleet() instance who holds the global information about current distributed
     training.
+
+    Args:
+        optimizer(Optimizer): subclass of Optimizer.
+        strategy(any): config for DownpourOptimizer.
+
+    Returns:
+        None
     """
 
     def __init__(self, optimizer, strategy=None):
@@ -242,32 +240,54 @@ class DownpourOptimizer(DistributedOptimizer):
         raise NotImplementedError()
 
     def minimize(self,
-                 loss,
-                 startup_program=None,
+                 losses,
+                 scopes=None,
+                 startup_programs=None,
                  parameter_list=None,
                  no_grad_set=None):
         """
-        minimize a program through loss, loss can be a list in DistributedOptimizer
+        minimize a program through loss, loss can be a list in DistributedOptimizer.
+        Note that in parameter server mode, a worker will not get anything about optimize_os
+        Because optmizer algorithms run on pserver side. We will make this usable in pserver
+        process, but currently the optimization part is written into Fleet(). A user does not
+        need to care about how to startup a pserver node.
 
         Args:
-            loss (Variable|Variable List): loss variable or loss variable list to run optimization.
-            startup_program (Program): startup_program for initializing parameters
+            losses (Variable|Variable List): loss variable or loss variable list to run optimization.
+            scopes (Scope| Scope List): scope instance.
+            startup_programs (Program|Program List): startup_program for initializing parameters
                 in `parameter_list`.
             parameter_list (list): list of Variables to update.
             no_grad_set (set|None): set of Variables should be ignored.
 
         Returns:
             tuple: (optimize_ops, params_grads) which are, list of operators appended;
             and list of (param, grad) Variables pair for optimization.
-
-        Note that in parameter server mode, a worker will not get anything about optimize_os
-        Because optmizer algorithms run on pserver side. We will make this usable in pserver
-        process, but currently the optimization part is written into Fleet(). A user does not
-        need to care about how to startup a pserver node.
         """
+        if not isinstance(losses, list):
+            losses = [losses]
+
         optimize_ops, param_grads, opt_info = \
             self._distributed_optimizer._minimize(
-                loss,
-                startup_program,
+                losses,
+                startup_programs,
                 parameter_list,
                 no_grad_set)
 
         fleet._set_opt_info(opt_info)
+
+        programs = [loss.block.program for loss in losses]
+
+        if scopes is None:
+            scopes = [fluid.global_scope()] * len(programs)
+
+        if len(scopes) != len(programs):
+            raise ValueError(
+                "You should make sure len(scopes) == len(programs) or set scopes None")
+
+        fleet._main_programs = programs
+        fleet._scopes = scopes
+
         return [optimize_ops, param_grads]
```
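For PSLib the reformat mostly changes how the pieces are wired together: `minimize()` now records the programs and scopes on the fleet singleton, so `init_worker()` no longer needs them (or an executor) as arguments. The outline below is a sketch of the intended call sequence under MPI, not a standalone runnable script; `exe`, `losses` and `adam` are assumed to be built by the caller.

```python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet


def train_with_pslib(exe, losses, adam):
    # PSLib.init() now takes the executor and always uses MPISymetricRoleMaker.
    fleet.init(exe)

    optimizer = fleet.distributed_optimizer(adam)
    # A single loss or a list of losses; programs/scopes are stored on `fleet`.
    optimizer.minimize(losses)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        # the docstring still asks callers to run the startup program first
        exe.run(fluid.default_startup_program())
        fleet.init_worker()
        # ... training loop over fleet._main_programs ...
        fleet.stop_worker()
```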
python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py

```diff
@@ -94,7 +94,7 @@ class DownpourServer(Server):
         Returns:
             return None
         """
-        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.table_class = "DownpourDenseTable"
         table.type = pslib.PS_DENSE_TABLE
@@ -169,7 +169,7 @@ class DownpourWorker(Worker):
         Returns:
             return None
         """
-        table = self.worker_.sparse_table.add()
+        table = self._worker.sparse_table.add()
         table.table_id = table_id
         table.slot_key.extend([var.name for var in slot_key_vars])
         table.slot_value.extend([var.name for var in slot_value_vars])
```
python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py

```diff
@@ -66,8 +66,6 @@ class DistributedAdam(DistributedOptimizerImplBase):
         Returns:
             [optimize_ops, grads_and_weights]
         """
-        if not isinstance(losses, list):
-            losses = [losses]
-
         table_name = find_distributed_lookup_table(losses[0].block.program)
         prefetch_slots = find_distributed_lookup_table_inputs(
@@ -77,7 +75,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
         ps_param = pslib.PSParameter()
         server = DownpourServer()
-        worker = DownpourWorker(self.window_)
+        worker = DownpourWorker(self._window)
         sparse_table_index = 0
         server.add_sparse_table(sparse_table_index, self._learning_rate,
                                 prefetch_slots, prefetch_slots_emb)
@@ -88,17 +86,12 @@ class DistributedAdam(DistributedOptimizerImplBase):
         param_grads_list = []
         for loss_index in range(len(losses)):
-            #program_config = ps_param.trainer_param.program_config.add()
-            #program_config.program_id = str(
-            #    id(losses[loss_index].block.program))
             program_id = str(id(losses[loss_index].block.program))
             program_configs[program_id] = {
                 "pull_sparse": [sparse_table_index],
                 "push_sparse": [sparse_table_index]
             }
-            #program_config.pull_sparse_table_id.extend([sparse_table_index])
-            #program_config.push_sparse_table_id.extend([sparse_table_index])
 
             params_grads = sorted(
                 fluid.backward.append_backward(losses[loss_index],
                                                parameter_list, no_grad_set),
@@ -130,8 +123,6 @@ class DistributedAdam(DistributedOptimizerImplBase):
                                        params, grads)
             program_configs[program_id]["pull_dense"] = [dense_table_index]
             program_configs[program_id]["push_dense"] = [dense_table_index]
-            #program_config.pull_dense_table_id.extend([dense_table_index])
-            #program_config.push_dense_table_id.extend([dense_table_index])
             if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
                 dense_table_index += 1
                 server.add_data_norm_table(dense_table_index,
@@ -139,18 +130,13 @@ class DistributedAdam(DistributedOptimizerImplBase):
                                            data_norm_params, data_norm_grads)
                 worker.add_dense_table(dense_table_index, self._learning_rate,
                                        data_norm_params, data_norm_grads)
-                #program_config.pull_dense_table_id.extend([dense_table_index])
-                #program_config.push_dense_table_id.extend([dense_table_index])
                 program_configs[program_id]["pull_dense"].extend(
                     [dense_table_index])
                 program_configs[program_id]["push_dense"].extend(
                     [dense_table_index])
             dense_table_index += 1
-            #program_configs.append(program_config)
         ps_param.server_param.CopyFrom(server.get_desc())
         ps_param.trainer_param.CopyFrom(worker.get_desc())
-        #for program_config in program_configs:
-        #    ps_param.trainer_param.program_config.extend([program_config])
         # Todo(guru4elephant): figure out how to support more sparse parameters
         # currently only support lookup_table
         worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
```