Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
06213b79
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
06213b79
编写于
12月 04, 2018
作者:
D
dongdaxiang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add hadoop helper function for distributed training
上级
49130f9b
变更
6
展开全部
隐藏空白更改
内联
并排
Showing
6 changed file
with
134 addition
and
69 deletion
+134
-69
python/paddle/fluid/async_executor.py
python/paddle/fluid/async_executor.py
+7
-2
python/paddle/fluid/distribute_lookup_table.py
python/paddle/fluid/distribute_lookup_table.py
+4
-2
python/paddle/fluid/distributed/downpour.py
python/paddle/fluid/distributed/downpour.py
+36
-9
python/paddle/fluid/distributed/helper.py
python/paddle/fluid/distributed/helper.py
+24
-0
python/paddle/fluid/distributed/node.py
python/paddle/fluid/distributed/node.py
+0
-1
python/paddle/fluid/distributed/ps_pb2.py
python/paddle/fluid/distributed/ps_pb2.py
+63
-55
未找到文件。
python/paddle/fluid/async_executor.py
浏览文件 @
06213b79
...
...
@@ -150,8 +150,13 @@ class AsyncExecutor(object):
data_feed
.
desc
(),
filelist
,
thread_num
,
fetch_var_names
,
debug
)
def
config_ps
(
self
,
dist_desc
,
host_sign_list
,
node_num
,
index
):
self
.
executor
.
config_pslib
(
dist_desc
,
host_sign_list
,
node_num
,
index
)
def
config_distributed_nodes
(
self
,
dist_opt
):
# get total rank
# get rank index
# get iplists
# get hadoop info
return
def
start_server
(
self
):
self
.
executor
.
start_server
()
...
...
python/paddle/fluid/distribute_lookup_table.py
浏览文件 @
06213b79
...
...
@@ -21,7 +21,8 @@ def find_distributed_lookup_table_inputs(program, table_name):
for
op
in
program
.
global_block
().
ops
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
:
if
table_name
==
op
.
input
(
"W"
)[
0
]:
inputs
.
extend
([
local_vars
[
name
]
for
name
in
op
.
input
(
"Ids"
)])
inputs
.
extend
(
[
local_vars
[
name
]
for
name
in
op
.
input
(
"Ids"
)])
return
inputs
def
find_distributed_lookup_table_outputs
(
program
,
table_name
):
...
...
@@ -30,7 +31,8 @@ def find_distributed_lookup_table_outputs(program, table_name):
for
op
in
program
.
global_block
().
ops
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
:
if
table_name
==
op
.
input
(
"W"
)[
0
]:
outputs
.
extend
([
local_vars
[
name
]
for
name
in
op
.
output
(
"Out"
)])
outputs
.
extend
(
[
local_vars
[
name
]
for
name
in
op
.
output
(
"Out"
)])
return
outputs
def
find_distributed_lookup_table
(
program
):
...
...
python/paddle/fluid/distributed/downpour.py
浏览文件 @
06213b79
...
...
@@ -8,30 +8,57 @@ from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_o
from
google.protobuf
import
text_format
class
DownpourSGD
(
object
):
"""
Distributed optimizer of downpour stochastic gradient descent
Standard implementation of Google's Downpour SGD
in Large Scale Distributed Deep Networks
Args:
learning_rate (float): the learning rate used to update parameters.
\
Can be a float value
Examples:
.. code-block:: python
downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2)
downpour_sgd.minimize(cost)
"""
def
__init__
(
self
,
learning_rate
=
0.001
,
window
=
1
):
# todo(guru4elephant): if optimizer is not None, will warning here
# todo(guru4elephant): add more optimizers here as argument
# todo(guru4elephant): make learning_rate as a variable
self
.
learning_rate_
=
learning_rate
self
.
window_
=
window
self
.
type
=
"downpour"
def
minimize
(
self
,
loss
,
startup_program
=
None
,
parameter_list
=
None
,
no_grad_set
=
None
):
params_grads
=
sorted
(
append_backward
(
loss
),
key
=
lambda
x
:
x
[
0
].
name
)
params_grads
=
sorted
(
append_backward
(
loss
,
parameter_list
,
no_grad_set
),
key
=
lambda
x
:
x
[
0
].
name
)
table_name
=
find_distributed_lookup_table
(
loss
.
block
.
program
)
prefetch_slots
=
find_distributed_lookup_table_inputs
(
loss
.
block
.
program
,
table_name
)
prefetch_slots_emb
=
find_distributed_lookup_table_outputs
(
loss
.
block
.
program
,
table_name
)
server
=
DownpourServer
()
# window is communication strategy
worker
=
DownpourWorker
(
self
.
window_
)
server
.
add_sparse_table
(
0
,
self
.
learning_rate_
,
# Todo(guru4elephant): support multiple tables definitions
# currently support one big sparse table
sparse_table_index
=
0
# currently merge all dense parameters into one dense table
dense_table_index
=
1
server
.
add_sparse_table
(
sparse_table_index
,
self
.
learning_rate_
,
prefetch_slots
,
prefetch_slots_emb
)
server
.
add_dense_table
(
1
,
self
.
learning_rate_
,
params_grads
[
0
],
params_grads
[
1
])
worker
.
add_sparse_table
(
0
,
self
.
learning_rate_
,
server
.
add_dense_table
(
dense_table_index
,
self
.
learning_rate_
,
params_grads
[
0
],
params_grads
[
1
])
worker
.
add_sparse_table
(
sparse_table_index
,
self
.
learning_rate_
,
prefetch_slots
,
prefetch_slots_emb
)
worker
.
add_dense_table
(
1
,
self
.
learning_rate_
,
params_grads
[
0
],
params_grads
[
1
])
worker
.
add_dense_table
(
dense_table_index
,
self
.
learning_rate_
,
params_grads
[
0
],
params_grads
[
1
])
ps_param
=
pslib
.
PSParameter
()
ps_param
.
server_param
.
CopyFrom
(
server
.
get_desc
())
#ps_param.worker_param.CopyFrom(worker.get_desc())
ps_param
.
worker_param
.
CopyFrom
(
worker
.
get_desc
())
# Todo(guru4elephant): figure out how to support more sparse parameters
# currently only support lookup_table
worker_skipped_ops
=
[
"lookup_table"
,
"lookup_table_grad"
]
ps_param_str
=
text_format
.
MessageToString
(
ps_param
)
return
[
ps_param_str
,
worker_skipped_ops
,
text_format
.
MessageToString
(
worker
.
get_desc
())
]
return
[
ps_param_str
,
worker_skipped_ops
]
python/paddle/fluid/distributed/helper.py
浏览文件 @
06213b79
from
mpi4py
import
MPI
class
FileSystem
(
object
):
def
__init__
(
self
,
fs_type
=
"afs"
,
uri
=
"afs://tianqi.afs.baidu.com:9902"
,
user
=
None
,
passwd
=
None
,
hadoop_bin
=
""
,
afs_conf
=
None
):
assert
user
not
None
assert
passwd
not
None
assert
hadoop_bin
not
None
fs_client
=
pslib
.
FsClientParameter
()
if
fs_type
==
"afs"
:
fs_client
.
fs_type
=
pslib
.
FsApiType
.
AFS
else
:
fs_client
.
fs_type
=
pslib
.
FsApiType
.
HDFS
fs_client
.
uri
=
uri
fs_client
.
user
=
user
fs_client
.
passwd
=
passwd
fs_client
.
buffer_size
=
0
fs_client
.
afs_conf
=
afs_conf
if
not
afs_conf
else
""
class
MPIHelper
(
object
):
def
__init__
(
self
):
self
.
comm
=
MPI
.
COMM_WORLD
...
...
@@ -18,3 +40,5 @@ class MPIHelper(object):
def
get_hostname
(
self
):
import
socket
return
socket
.
gethostname
()
python/paddle/fluid/distributed/node.py
浏览文件 @
06213b79
...
...
@@ -12,7 +12,6 @@ class Worker(object):
class
DownpourServer
(
Server
):
def
__init__
(
self
):
#self.server_ = pslib.ServerParameter().downpour_server_param
self
.
server_
=
pslib
.
ServerParameter
()
def
add_sparse_table
(
self
,
table_id
,
learning_rate
,
...
...
python/paddle/fluid/distributed/ps_pb2.py
浏览文件 @
06213b79
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录