Commit ceac9df8 ("fix code style for incubator")
Authored April 10, 2019 by dongdaxiang
Parent: aa46caf3
Showing 6 changed files, with 130 additions and 130 deletions (+130 -130).
python/paddle/fluid/device_worker.py  (+18 -18)
python/paddle/fluid/incubate/fleet/base/role_maker.py  (+39 -39)
python/paddle/fluid/incubate/fleet/parameter_server/__init__.py  (+34 -34)
python/paddle/fluid/incubate/fleet/parameter_server/node.py  (+13 -13)
python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py  (+10 -10)
python/paddle/fluid/trainer_desc.py  (+16 -16)
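The whole commit is a mechanical rename: internal attributes written with a trailing underscore (self.program_, self.fleet_desc_, self.role_maker_, ...) become leading-underscore names (self._program, self._fleet_desc, self._role_maker, ...), which is the usual PEP 8 convention for non-public attributes. A minimal before/after sketch of the pattern, using the two attributes from DeviceWorker in the first diff below (the *Old/*New class names here are illustrative only, not part of the commit):

# Before this commit: trailing-underscore "private" attributes.
class DeviceWorkerOld(object):
    def __init__(self):
        self.program_ = None
        self.infer_ = None

# After this commit: PEP 8 style leading-underscore non-public attributes.
class DeviceWorkerNew(object):
    def __init__(self):
        self._program = None
        self._infer = None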
python/paddle/fluid/device_worker.py

@@ -26,8 +26,8 @@ class DeviceWorker(object):
         """
         Init.
         """
-        self.program_ = None
-        self.infer_ = None
+        self._program = None
+        self._infer = None

     def _set_infer(self, infer=False):
         """
@@ -36,7 +36,7 @@ class DeviceWorker(object):
         Args:
             infer(bool): whether to do inference
         """
-        self.infer_ = infer
+        self._infer = infer

     def _set_fleet_desc(self, fleet_desc):
         """
@@ -45,7 +45,7 @@ class DeviceWorker(object):
         Args:
             fleet_desc(PSParameter): pslib.PSParameter object
         """
-        self.fleet_desc_ = fleet_desc
+        self._fleet_desc = fleet_desc

     def _set_program(self, program):
         """
@@ -54,7 +54,7 @@ class DeviceWorker(object):
         Args:
             program(Program): a Program object
         """
-        self.program_ = program
+        self._program = program

     def _gen_worker_desc(self, trainer_desc):
         """
@@ -88,7 +88,7 @@ class Hogwild(DeviceWorker):
             trainer_desc(TrainerDesc): a TrainerDesc object
         """
         trainer_desc.device_worker_name = "HogwildWorker"
-        if self.infer_:
+        if self._infer:
             # just ignore feed op for inference model
             trainer_desc.hogwild_param.skip_ops.extend(["feed"])
@@ -113,11 +113,11 @@ class DownpourSGD(DeviceWorker):
             trainer_desc(TrainerDesc): a TrainerDesc object
         """
         dense_table_set = set()
-        program_id = str(id(self.program_))
-        if self.program_ == None:
+        program_id = str(id(self._program))
+        if self._program == None:
             print("program of current device worker is not configured")
             exit(-1)
-        opt_info = self.program_._fleet_opt
+        opt_info = self._program._fleet_opt
         program_configs = opt_info["program_configs"]

         downpour = trainer_desc.downpour_param
@@ -140,7 +140,7 @@ class DownpourSGD(DeviceWorker):
         trainer_desc.device_worker_name = "DownpourWorker"
         pull_thread = trainer_desc.pull_dense_param
         pull_thread.device_num = trainer_desc.thread_num
-        for i in self.fleet_desc_.trainer_param.dense_table:
+        for i in self._fleet_desc.trainer_param.dense_table:
             if i.table_id in dense_table_set:
                 dense_table = pull_thread.dense_table.add()
                 dense_table.dense_value_name.extend(i.dense_variable_name)
@@ -148,29 +148,29 @@ class DownpourSGD(DeviceWorker):
                     i.table_id
         sparse_table = downpour.sparse_table.add()
         sparse_table.table_id = \
-            self.fleet_desc_.trainer_param.sparse_table[0].table_id
+            self._fleet_desc.trainer_param.sparse_table[0].table_id
         sparse_table.sparse_key_name.extend(
-            self.fleet_desc_.trainer_param.sparse_table[0].slot_key)
+            self._fleet_desc.trainer_param.sparse_table[0].slot_key)
         sparse_table.sparse_value_name.extend(
-            self.fleet_desc_.trainer_param.sparse_table[0].slot_value)
+            self._fleet_desc.trainer_param.sparse_table[0].slot_value)
         sparse_table.sparse_grad_name.extend(
-            self.fleet_desc_.trainer_param.sparse_table[0].slot_gradient)
+            self._fleet_desc.trainer_param.sparse_table[0].slot_gradient)
         sparse_table.emb_dim = \
-            self.fleet_desc_.server_param.downpour_server_param.downpour_table_param[
+            self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
                 0].accessor.fea_dim - 2
         sparse_table.fea_dim = sparse_table.emb_dim + 2
         # TODO(guru4elephant): hard code here, need to improve
         sparse_table.label_var_name = "click"

-        for i in self.fleet_desc_.trainer_param.dense_table:
+        for i in self._fleet_desc.trainer_param.dense_table:
             if i.table_id in dense_table_set:
                 dense_table = downpour.dense_table.add()
                 dense_table.table_id = i.table_id
                 dense_table.dense_value_name.extend(i.dense_variable_name)
                 dense_table.dense_grad_name.extend(
                     i.dense_gradient_variable_name)
-        downpour.skip_ops.extend(self.fleet_desc_.trainer_param.skip_op)
-        if self.infer_:
+        downpour.skip_ops.extend(self._fleet_desc.trainer_param.skip_op)
+        if self._infer:
             downpour.push_dense = False
             downpour.push_sparse = False
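The +18/-18 in this file, like the +130/-130 overall, are pure renames of this kind. Such a rename can be approximated with a single regex pass; the snippet below is only a sketch of that idea (it is not the tool used in this commit, and rename_attrs is a made-up helper name), mapping self.name_ to self._name on attribute accesses through self:

import re

# Matches trailing-underscore attribute accesses on self (e.g. self.program_,
# self.node_type_comm_) and rewrites them to the leading-underscore form.
_ATTR = re.compile(r"self\.([a-z][a-z0-9_]*)_\b")

def rename_attrs(source):
    # \1 is the attribute name without its trailing underscore.
    return _ATTR.sub(r"self._\1", source)

print(rename_attrs("opt_info = self.program_._fleet_opt"))
# -> opt_info = self._program._fleet_opt

A pass like this only rewrites accesses through self, so uses of the same attribute from other objects or in strings would still need a manual check.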
python/paddle/fluid/incubate/fleet/base/role_maker.py

@@ -23,10 +23,10 @@ class RoleMakerBase(object):
     """

     def __init__(self):
-        self.role_maker_name_ = ""
-        self.trainer_endpoints_ = []
-        self.pserver_endpoints_ = []
-        self.role_is_generated_ = False
+        self._role_maker_name = ""
+        self._trainer_endpoints = []
+        self._pserver_endpoints = []
+        self._role_is_generated = False

     def _is_worker(self):
         """
@@ -45,20 +45,20 @@ class RoleMakerBase(object):
         return get local ip
         """
         import socket
-        self.ip_ = socket.gethostbyname(socket.gethostname())
-        return self.ip_
+        self._ip = socket.gethostbyname(socket.gethostname())
+        return self._ip

     def _get_trainer_endpoints(self):
         """
         return trainer endpoints
         """
-        return self.trainer_endpoints_
+        return self._trainer_endpoints

     def _get_pserver_endpoints(self):
         """
         return pserver endpoints
         """
-        return self.pserver_endpoints_
+        return self._pserver_endpoints

     def _generate_role(self):
         """
@@ -76,59 +76,59 @@ class MPIRoleMaker(RoleMakerBase):
     def __init__(self):
         super(MPIRoleMaker, self).__init__()
         from mpi4py import MPI
-        self.comm_ = MPI.COMM_WORLD
+        self._comm = MPI.COMM_WORLD
         self.MPI = MPI
-        self.ips_ = None
+        self._ips = None

     def _get_rank(self):
         """
         return rank
         """
-        self.rank_ = self.comm_.Get_rank()
-        return self.rank_
+        self._rank = self._comm.Get_rank()
+        return self._rank

     def _get_size(self):
         """
         return size
         """
-        self.size_ = self.comm_.Get_size()
-        return self.size_
+        self._size = self._comm.Get_size()
+        return self._size

     def _all_gather(self, obj):
         """
         all_gather(obj) will call MPI's allgather function
         """
         self._barrier_all()
-        return self.comm_.allgather(obj)
+        return self._comm.allgather(obj)

     def _worker_gather(self, obj):
         """
         worker_gather(obj) will call MPI's allgather function
         """
         if self._is_worker():
-            self.node_type_comm_.barrier()
-            return self.node_type_comm_.allgather(obj)
+            self._node_type_comm.barrier()
+            return self._node_type_comm.allgather(obj)
         return None

     def _barrier_all(self):
         """
         barrier_all() will call MPI's barrier_all function
         """
-        self.comm_.barrier()
+        self._comm.barrier()

     def _get_ips(self):
         """
         collect current distributed job's ip list
         """
-        if self.ips_ == None:
-            self.ips_ = self.comm_.allgather(self._get_local_ip())
-        return self.ips_
+        if self._ips == None:
+            self._ips = self._comm.allgather(self._get_local_ip())
+        return self._ips

     def _finalize(self):
         """
         finalize the current MPI instance.
         """
-        self.comm_.finalize()
+        self._comm.finalize()


 class MPISymetricRoleMaker(MPIRoleMaker):
@@ -140,11 +140,11 @@ class MPISymetricRoleMaker(MPIRoleMaker):
     def __init__(self):
         super(MPISymetricRoleMaker, self).__init__()
-        self.node_type_ = None
-        self.proc_per_node_ = 2
+        self._node_type = None
+        self._proc_per_node = 2

     def _check_role_generation(self):
-        if not self.role_is_generated_:
+        if not self._role_is_generated:
             sys.stderr.write("generate_role() should be called first")
             sys.exit(-1)
             return False
@@ -163,7 +163,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return whether current process is worker assigned by role maker
         """
         if self._check_role_generation():
-            return self.node_type_ == 1
+            return self._node_type == 1
         return False

     def _is_server(self):
@@ -171,7 +171,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return whether current process is server assigned by role maker
         """
         if self._check_role_generation():
-            return self.node_type_ == 0
+            return self._node_type == 0
         return False

     def _worker_num(self):
@@ -197,7 +197,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return the index of worker
         """
         if self._check_role_generation():
-            return self.rank_ / self.proc_per_node_
+            return self._rank / self._proc_per_node
         return 0

     def _server_index(self):
@@ -205,7 +205,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return the index of server
         """
         if self._check_role_generation():
-            return self.rank_ / self.proc_per_node_
+            return self._rank / self._proc_per_node
         return 0

     def _barrier_worker(self):
@@ -214,7 +214,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         """
         if self._check_role_generation():
             if self._is_worker():
-                self.node_type_comm_.barrier()
+                self._node_type_comm.barrier()

     def _barrier_server(self):
         """
@@ -222,20 +222,20 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         """
         if self._check_role_generation():
             if self._is_server():
-                self.node_type_comm_.barrier()
+                self._node_type_comm.barrier()

     def _generate_role(self):
         """
         generate currently process's role
         """
-        if not self.role_is_generated_:
+        if not self._role_is_generated:
             # TODO(guru4elephant): only allow to be called once
-            self.trainer_endpoints_ = self._get_ips()
-            self.pserver_endpoints_ = self._get_ips()
-            if 0 == self._get_rank() % self.proc_per_node_ % 2:
-                self.node_type_ = 0
+            self._trainer_endpoints = self._get_ips()
+            self._pserver_endpoints = self._get_ips()
+            if 0 == self._get_rank() % self._proc_per_node % 2:
+                self._node_type = 0
             else:
-                self.node_type_ = 1
-            self.node_type_comm_ = self.comm_.Split(self.node_type_)
-            self.role_is_generated_ = True
+                self._node_type = 1
+            self._node_type_comm = self._comm.Split(self._node_type)
+            self._role_is_generated = True
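MPISymetricRoleMaker._generate_role() above gives every even slot within a node (rank % proc_per_node % 2 == 0) the server role and every odd slot the worker role, then builds a per-role communicator with Split. A standalone sketch of that assignment, assuming mpi4py is installed and the script is launched under mpirun; the print statement is only for illustration:

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
proc_per_node = 2  # same default as MPISymetricRoleMaker above

# Even slots become servers (node_type 0), odd slots become workers (node_type 1).
node_type = 0 if rank % proc_per_node % 2 == 0 else 1

# Split COMM_WORLD so that servers and workers each get their own communicator,
# mirroring self._node_type_comm = self._comm.Split(self._node_type).
node_type_comm = comm.Split(node_type)

print("rank %d acts as %s (group rank %d)" %
      (rank, "server" if node_type == 0 else "worker", node_type_comm.Get_rank()))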
python/paddle/fluid/incubate/fleet/parameter_server/__init__.py

@@ -64,9 +64,9 @@ class Fleet(object):
     def __init__(self):
         self._opt_info = None  # for fleet only
-        self.role_maker_ = None
-        self.local_ip_ = 0
-        self.is_initialized_ = False
+        self._role_maker = None
+        self._local_ip = 0
+        self._is_initialized = False

     def init(self):
         # TODO(guru4elephant)
@@ -78,22 +78,22 @@ class Fleet(object):
         current node's role, e.g. worker, server, etc.
         """
-        if not self.is_initialized_:
-            self.role_maker_ = MPISymetricRoleMaker()
-            self.role_maker_._generate_role()
+        if not self._is_initialized:
+            self._role_maker = MPISymetricRoleMaker()
+            self._role_maker._generate_role()
             self._fleet_ptr = fluid.core.Fleet()
-            self.is_initialized_ = True
+            self._is_initialized = True

     def stop(self):
         """
         stop(): will be called after a user finishes his/her training task. Fleet instance will be
         destroyed when stop() is called.
         """
-        self.role_maker_._barrier_worker()
-        if self.role_maker_._is_first_worker():
+        self._role_maker._barrier_worker()
+        if self._role_maker._is_first_worker():
             self._fleet_ptr.stop_server()
-        self.role_maker_._barrier_worker()
-        self.role_maker_._barrier_all()
-        self.role_maker_._finalize()
+        self._role_maker._barrier_worker()
+        self._role_maker._barrier_all()
+        self._role_maker._finalize()

     def init_pserver(self):
         """
@@ -110,15 +110,15 @@ class Fleet(object):
                 sys.exit(-1)
             self._fleet_ptr.init_server(self._dist_desc_str,
                                         self.role_maker_._get_rank())
-            self.local_ip_ = self._fleet_ptr.run_server()
+            self._local_ip = self._fleet_ptr.run_server()
             # barrier_all for init_server
-            self.role_maker_._barrier_all()
-            self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
+            self._role_maker._barrier_all()
+            self._all_ips = self._role_maker._all_gather(self.local_ip_)
-            self._fleet_ptr.gather_servers(self.all_ips_,
-                                           self.role_maker_._get_size())
+            self._fleet_ptr.gather_servers(self._all_ips,
+                                           self._role_maker._get_size())
             # barrier_all for init_worker, wait all workers start
-            self.role_maker_._barrier_all()
+            self._role_maker._barrier_all()
         else:
             print("You should run DistributedOptimizer.minimize() first")
             sys.exit(-1)
@@ -151,21 +151,21 @@ class Fleet(object):
                 print("You should run DistributedOptimizer.minimize() first")
                 sys.exit(-1)
             # barrier_all for init_server, wait for server starts
-            self.role_maker_._barrier_all()
-            self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
-            self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
-                                        self.role_maker_._get_size(),
-                                        self.role_maker_._get_rank())
+            self._role_maker._barrier_all()
+            self._all_ips = self._role_maker._all_gather(self.local_ip_)
+            self._fleet_ptr.init_worker(self._dist_desc_str, self._all_ips,
+                                        self._role_maker._get_size(),
+                                        self._role_maker._get_rank())
             # barrier_all for init_worker
-            self.role_maker_._barrier_all()
+            self._role_maker._barrier_all()
             # prepare for client to client communication
             info = self._fleet_ptr.get_clients_info()
-            all_info = self.role_maker_._worker_gather(info[0])
+            all_info = self._role_maker._worker_gather(info[0])
             self._fleet_ptr.gather_clients(all_info)
             self._fleet_ptr.create_client2client_connection()
             # barrier for init model
-            self.role_maker_._barrier_worker()
-            if self.role_maker_._is_first_worker():
+            self._role_maker._barrier_worker()
+            if self._role_maker._is_first_worker():
                 tables = self._dist_desc.trainer_param.dense_table
                 for prog, scope in zip(programs, scopes):
                     prog_id = str(id(prog))
@@ -192,7 +192,7 @@ class Fleet(object):
                             int(table.table_id), var_name_list)
             # barrier for init model done
-            self.role_maker_._barrier_worker()
+            self._role_maker._barrier_worker()
         else:
             print("You should run DistributedOptimizer.minimize() first")
             sys.exit(-1)
@@ -201,39 +201,39 @@ class Fleet(object):
         """
         return the number of current job's worker num
         """
-        return self.role_maker_._worker_num()
+        return self._role_maker._worker_num()

     def get_server_num(self):
         """
         return the number of current job's server num
         """
-        return self.role_maker_._server_num()
+        return self._role_maker._server_num()

     def get_worker_index(self):
         """
         return the mpi rank of current worker
         """
-        return self.role_maker_._worker_index()
+        return self._role_maker._worker_index()

     def is_worker(self):
         """
         return whether current node is a worker
         """
-        return self.role_maker_._is_worker()
+        return self._role_maker._is_worker()

     def is_server(self):
         """
         return whether current node is pserver
         """
-        return self.role_maker_._is_server()
+        return self._role_maker._is_server()

     def init_pserver_model(self):
         """
         init pserver model called from pserver
         """
-        if self.role_maker_._is_first_worker():
+        if self._role_maker._is_first_worker():
             self._fleet_ptr.init_model()
-        self.role_maker_._barrier_worker()
+        self._role_maker._barrier_worker()

     def save_pserver_model(self, save_path):
         """
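The barrier_all()/barrier_worker() calls in init_pserver() and init_worker() above imply a fixed start-up order between server and worker processes. The function below is a hypothetical driver that only illustrates that order; the fleet argument stands for an instance of the Fleet class above (this diff does not show how the module exposes one), and the programs/scopes arguments of init_worker are inferred from the loop inside it:

def run_distributed_job(fleet, programs, scopes):
    # Hypothetical driver, not part of the repo: shows the call order implied
    # by the barriers in the Fleet methods above.
    fleet.init()                      # pick a role via MPISymetricRoleMaker
    if fleet.is_server():
        fleet.init_pserver()          # start the pserver and publish its IP
    elif fleet.is_worker():
        fleet.init_worker(programs, scopes)  # connect workers to all pservers
        # ... training loop would go here ...
    fleet.stop()                      # first worker stops servers, then MPI finalizes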
python/paddle/fluid/incubate/fleet/parameter_server/node.py

@@ -42,13 +42,13 @@ class DownpourServer(Server):
     """

     def __init__(self):
-        self.server_ = pslib.ServerParameter()
-        self.server_.downpour_server_param.service_param.start_server_port = 0
-        self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
-        self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
-        self.server_.downpour_server_param.service_param.service_class = "DownpourPsService"
-        self.server_.downpour_server_param.service_param.start_server_port = 0
-        self.server_.downpour_server_param.service_param.server_thread_num = 12
+        self._server = pslib.ServerParameter()
+        self._server.downpour_server_param.service_param.start_server_port = 0
+        self._server.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
+        self._server.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
+        self._server.downpour_server_param.service_param.service_class = "DownpourPsService"
+        self._server.downpour_server_param.service_param.start_server_port = 0
+        self._server.downpour_server_param.service_param.server_thread_num = 12

     def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
                          slot_value_var):
@@ -62,7 +62,7 @@ class DownpourServer(Server):
         Returns:
             return None
         """
-        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.table_class = "DownpourSparseTable"
         table.type = pslib.PS_SPARSE_TABLE
@@ -123,7 +123,7 @@ class DownpourServer(Server):
         Returns:
             return None
         """
-        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.table_class = "DownpourDenseTable"
         table.type = pslib.PS_DENSE_TABLE
@@ -140,7 +140,7 @@ class DownpourServer(Server):
         """
         Return downpour server program_desc
         """
-        return self.server_
+        return self._server


 class DownpourWorker(Worker):
@@ -155,7 +155,7 @@ class DownpourWorker(Worker):
     def __init__(self, window):
         self.window = window
-        self.worker_ = pslib.DownpourTrainerParameter()
+        self._worker = pslib.DownpourTrainerParameter()

     def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
                          slot_value_vars):
@@ -187,7 +187,7 @@ class DownpourWorker(Worker):
         Returns:
             return None
         """
-        table = self.worker_.dense_table.add()
+        table = self._worker.dense_table.add()
         table.table_id = table_id
         table.dense_variable_name.extend(
             filter(lambda x: x.find("embedding") == -1,
@@ -200,4 +200,4 @@ class DownpourWorker(Worker):
         """
         Return downpour worker program_desc
         """
-        return self.worker_
+        return self._worker
python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py

@@ -24,9 +24,9 @@ from .node import DownpourWorker, DownpourServer
 class DistributedOptimizerImplBase(object):
     def __init__(self, optimizer):
-        self.optimizer_ = optimizer
-        self.learning_rate_ = optimizer._learning_rate
-        self.regularization_ = optimizer.regularization
+        self._optimizer = optimizer
+        self._learning_rate = optimizer._learning_rate
+        self._regularization = optimizer.regularization

     def minimize(self,
                  losses,
@@ -41,7 +41,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
         # todo(guru4elephant): add more optimizers here as argument
         # todo(guru4elephant): make learning_rate as a variable
         super(DistributedAdam, self).__init__(optimizer)
-        self.window_ = 1
+        self._window = 1
         self.type = "downpour"
         self.data_norm_name = [
             ".batch_size", ".batch_square_sum", ".batch_sum",
@@ -79,9 +79,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
         server = DownpourServer()
         worker = DownpourWorker(self.window_)
         sparse_table_index = 0
-        server.add_sparse_table(sparse_table_index, self.learning_rate_,
+        server.add_sparse_table(sparse_table_index, self._learning_rate,
                                 prefetch_slots, prefetch_slots_emb)
-        worker.add_sparse_table(sparse_table_index, self.learning_rate_,
+        worker.add_sparse_table(sparse_table_index, self._learning_rate,
                                 prefetch_slots, prefetch_slots_emb)
         dense_table_index = 1
         program_configs = {}
@@ -124,9 +124,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
                         data_norm_grads.append(i[1])
                 if not is_data_norm_data:
                     grads.append(i[1])
-            server.add_dense_table(dense_table_index, self.learning_rate_,
+            server.add_dense_table(dense_table_index, self._learning_rate,
                                    params, grads)
-            worker.add_dense_table(dense_table_index, self.learning_rate_,
+            worker.add_dense_table(dense_table_index, self._learning_rate,
                                    params, grads)
             program_configs[program_id]["pull_dense"] = [dense_table_index]
             program_configs[program_id]["push_dense"] = [dense_table_index]
@@ -135,9 +135,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
             if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
                 dense_table_index += 1
                 server.add_data_norm_table(dense_table_index,
-                                           self.learning_rate_,
+                                           self._learning_rate,
                                            data_norm_params, data_norm_grads)
-                worker.add_dense_table(dense_table_index, self.learning_rate_,
+                worker.add_dense_table(dense_table_index, self._learning_rate,
                                        data_norm_params, data_norm_grads)
                 #program_config.pull_dense_table_id.extend([dense_table_index])
                 #program_config.push_dense_table_id.extend([dense_table_index])
python/paddle/fluid/trainer_desc.py

@@ -28,10 +28,10 @@ class TrainerDesc(object):
         import multiprocessing as mp
         # set default thread num == cpu count
         self.proto_desc.thread_num = mp.cpu_count()
-        self.fleet_desc_ = None
-        self.device_worker_ = None
-        self.program_ = None
-        self.infer_ = False
+        self._fleet_desc = None
+        self._device_worker = None
+        self._program = None
+        self._infer = False

     def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
         for i, v in enumerate(fetch_vars):
@@ -47,19 +47,19 @@ class TrainerDesc(object):
         self.proto_desc.thread_num = thread_num

     def _set_device_worker(self, device_worker):
-        self.device_worker_ = device_worker
+        self._device_worker = device_worker

     def _set_infer(self, infer):
-        self.infer_ = infer
+        self._infer = infer

     def _set_fleet_desc(self, fleet_desc):
-        self.fleet_desc_ = fleet_desc
+        self._fleet_desc = fleet_desc

     def _gen_trainer_desc(self):
         pass

     def _set_program(self, program):
-        self.program_ = program
+        self._program = program

     def _desc(self):
         from google.protobuf import text_format
@@ -73,13 +73,13 @@ class MultiTrainer(TrainerDesc):
     def _set_program(self, program):
         super(MultiTrainer, self)._set_program(program)
-        self.program_ = program
+        self._program = program

     def _gen_trainer_desc(self):
         super(MultiTrainer, self)._gen_trainer_desc()
         self.proto_desc.class_name = "MultiTrainer"
-        self.device_worker_._set_infer(self.infer_)
-        self.device_worker_._gen_worker_desc(self.proto_desc)
+        self._device_worker._set_infer(self.infer_)
+        self._device_worker._gen_worker_desc(self.proto_desc)


 class DistMultiTrainer(TrainerDesc):
@@ -89,13 +89,13 @@ class DistMultiTrainer(TrainerDesc):
     def _set_program(self, program):
         super(DistMultiTrainer, self)._set_program(program)
-        self.program_ = program
+        self._program = program

     def _gen_trainer_desc(self):
         super(DistMultiTrainer, self)._gen_trainer_desc()
         self.proto_desc.class_name = "DistMultiTrainer"
-        if self.program_ == None:
+        if self._program == None:
             raise RuntimeError("None Program")
-        self.device_worker_._set_infer(self.infer_)
-        self.device_worker_._set_program(self.program_)
-        self.device_worker_._gen_worker_desc(self.proto_desc)
+        self._device_worker._set_infer(self.infer_)
+        self._device_worker._set_program(self.program_)
+        self._device_worker._gen_worker_desc(self.proto_desc)
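DistMultiTrainer._gen_trainer_desc() above requires that a program, a device worker, and (for DownpourSGD) a fleet descriptor have been set before the proto can be generated. The function below is a hypothetical wiring example using only the setters visible in these diffs; DownpourSGD comes from device_worker.py above, and how the trainer is actually constructed by the executor is not part of this commit:

def build_dist_trainer_desc(main_program, fleet_desc):
    # Hypothetical wiring of the pieces renamed in this commit.
    worker = DownpourSGD()                 # device worker from device_worker.py
    trainer = DistMultiTrainer()           # trainer from trainer_desc.py
    trainer._set_program(main_program)     # must be set, else "None Program" is raised
    trainer._set_device_worker(worker)
    trainer._set_fleet_desc(fleet_desc)
    worker._set_fleet_desc(fleet_desc)     # DownpourSGD reads self._fleet_desc itself
    trainer._gen_trainer_desc()            # fills proto_desc via the device worker
    return trainer._desc()                 # serializes proto_desc (uses protobuf text_format)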