Commit ff87698a: "refactor downpour optimization"
Authored Mar 12, 2019 by dongdaxiang
Parent: b66f0074

Showing 7 changed files with 70 additions and 47 deletions (+70 -47)
paddle/fluid/framework/trainer.h            +2  -5
python/paddle/fluid/device_worker.py        +2  -3
python/paddle/fluid/distributed/downpour.py +15 -1
python/paddle/fluid/distributed/fleet.py    +19 -7
python/paddle/fluid/executor.py             +1  -0
python/paddle/fluid/trainer_desc.py         +16 -23
python/paddle/fluid/trainer_factory.py      +15 -8
paddle/fluid/framework/trainer.h

The two-line Initialize declarations collapse to one line each, and DistMultiTrainer drops its own FleetWrapper member (fleet_ptr_).

```diff
@@ -61,8 +61,7 @@ class MultiTrainer : public TrainerBase {
  public:
   MultiTrainer() {}
   virtual ~MultiTrainer() {}
-  virtual void Initialize(const TrainerDesc& trainer_desc,
-                          Dataset* data_set);
+  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
   virtual void InitTrainerEnv(const ProgramDesc& main_program,
                               const platform::Place& place);
   virtual void InitOtherEnv(const ProgramDesc& main_program) {}
@@ -80,14 +79,12 @@ class DistMultiTrainer : public MultiTrainer {
  public:
   DistMultiTrainer() {}
   virtual ~DistMultiTrainer() {}
-  virtual void Initialize(const TrainerDesc& trainer_desc,
-                          Dataset* data_set);
+  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
   virtual void InitOtherEnv(const ProgramDesc& main_program);
   virtual void Finalize();

  protected:
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
-  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
 };
 }  // namespace framework
```
python/paddle/fluid/device_worker.py

The Downpour device worker is renamed to DownpourSGD, the sparse table gains a hard-coded label_var_name, and two debug prints are removed from DeviceWorkerFactory. (Note that the super() call still names the old Downpour class.)

```diff
@@ -29,7 +29,7 @@ class Hogwild(DeviceWorker):
         trainer_desc.device_worker_name = "HogwildWorker"


-class Downpour(DeviceWorker):
+class DownpourSGD(DeviceWorker):
     def __init__(self):
         super(Downpour, self).__init__()
@@ -55,6 +55,7 @@ class Downpour(DeviceWorker):
         sparse_table.emb_dim = fleet_desc.server_param.downpour_server_param.downpour_table_param[
             0].accessor.fea_dim - 2
         sparse_table.fea_dim = sparse_table.emb_dim + 2
         # TODO(guru4elephant): hard code here, need to improve
+        sparse_table.label_var_name = "click"
         dense_table = downpour.dense_table.add()
@@ -70,6 +71,4 @@ class Downpour(DeviceWorker):
 class DeviceWorkerFactory(object):
     def create_device_worker(self, worker_type):
         classname = worker_type.capitalize()
-        print("------------")
-        print(classname)
         return globals()[classname]()
```
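The factory resolves a worker-type string to a class by capitalizing it and indexing the module's globals(). Below is a minimal standalone sketch of that pattern (Hogwild here is a stub, not the real DeviceWorker subclass). One caveat worth noting: str.capitalize() lower-cases everything after the first character, so this scheme cannot produce a camel-cased name like "DownpourSGD"; trainer_factory.py sidesteps that by looking classes up under their exact names.

```python
# Standalone sketch of the name-based lookup used by DeviceWorkerFactory.
# Hogwild is a stub; in Paddle it subclasses DeviceWorker.

class Hogwild(object):
    pass


class DeviceWorkerFactory(object):
    def create_device_worker(self, worker_type):
        # "hogwild" -> "Hogwild"; the class must be defined in this module
        classname = worker_type.capitalize()
        return globals()[classname]()


worker = DeviceWorkerFactory().create_device_worker("hogwild")
print(type(worker).__name__)  # Hogwild

# Caveat: "downpoursgd".capitalize() == "Downpoursgd", which would raise a
# KeyError here; exact-name lookup (globals()["DownpourSGD"]) avoids this.
```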
python/paddle/fluid/distributed/downpour.py

Instead of a [ps_param, worker_skipped_ops] pair, the distributed optimizer now returns a single opt_info dict that names the trainer, device worker, and optimizer classes and carries the fleet description.

```diff
@@ -142,4 +142,18 @@ class DownpourSGD(object):
         # currently only support lookup_table
         worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
         ps_param.trainer_param.skip_op.extend(worker_skipped_ops)
-        return [ps_param, worker_skipped_ops]
+        # all fleet operations should be defined in operators in the future
+        # we want to return an object here containing:
+        # 1) worker execution strategy
+        # 2) pserver execution strategy
+        # 3) fleet configurations
+        # 4) skipped operators in runtime
+        # 5) distributed optimization
+        opt_info = {}
+        opt_info["trainer"] = "DistMultiTrainer"
+        opt_info["device_worker"] = "DownpourSGD"
+        opt_info["optimizer"] = "DownpourSGD"
+        opt_info["fleet_desc"] = ps_param
+        opt_info["worker_skipped_ops"] = worker_skipped_ops
+        return opt_info
```
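For illustration, here is a hand-built stand-in showing the shape of the contract this dict establishes; the real "fleet_desc" value is the ps_param protobuf, which None replaces below:

```python
# Stand-in for the opt_info contract introduced by this commit. String
# keys name classes that trainer_factory.py resolves via globals();
# "fleet_desc" carries the server description that fleet.py serializes.

opt_info = {
    "trainer": "DistMultiTrainer",    # trainer class, resolved by exact name
    "device_worker": "DownpourSGD",   # device-worker class, likewise
    "optimizer": "DownpourSGD",
    "fleet_desc": None,               # placeholder for the ps_param protobuf
    "worker_skipped_ops": ["lookup_table", "lookup_table_grad"],
}

# Downstream code dispatches purely on these keys:
print(opt_info["trainer"])             # DistMultiTrainer
print(opt_info["worker_skipped_ops"])  # ['lookup_table', 'lookup_table_grad']
```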
python/paddle/fluid/distributed/fleet.py

init_pserver() and init_worker() now accept the opt_info dict rather than a raw dist_desc, serialize opt_info["fleet_desc"] themselves, and exit if the key is absent.

```diff
@@ -10,6 +10,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
 from .. import core
 from . import ps_instance
@@ -33,9 +34,15 @@ class Fleet(object):
         self.instance_.barrier_all()
         self.instance.finalize()

-    def init_pserver(self, dist_desc):
-        self.dist_desc_str_ = text_format.MessageToString(dist_desc)
-        self.dist_desc = dist_desc
+    def init_pserver(self, opt_info):
+        if "fleet_desc" in opt_info:
+            self.dist_desc_str_ = text_format.MessageToString(
+                opt_info["fleet_desc"])
+            self.dist_desc_ = opt_info["fleet_desc"]
+        else:
+            print(
+                "You should run distributed optimization to get opt_info first")
+            sys.exit(-1)
         self.fleet_.init_server(self.dist_desc_str_)
         ip = self.fleet_.start_server()
         self.instance_.set_ip(ip)
@@ -44,10 +51,15 @@ class Fleet(object):
         self.fleet.gather_servers(ips, self.instance_.get_node_cnt())
         self.instance_.barrier_all()

-    def init_worker(self, dist_desc):
-        self.dist_desc_str_ = text_format.MessageToString(dist_desc)
-        self.dist_desc_ = dist_desc
+    def init_worker(self, opt_info):
+        if "fleet_desc" in opt_info:
+            self.dist_desc_str_ = text_format.MessageToString(
+                opt_info["fleet_desc"])
+            self.dist_desc_ = opt_info["fleet_desc"]
+        else:
+            print(
+                "You should run distributed optimization to get opt_info first")
+            sys.exit(-1)
         self.instance_.barrier_all()
         ips = self.instance.gather_ips()
         self.fleet_.init_worker(self.dist_desc_str_, ips,
```
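As a usage sketch, the launcher below is invented for illustration; it assumes fleet is an already-constructed Fleet instance and opt_info is the dict returned by the distributed optimizer:

```python
# Hypothetical launcher showing the new calling convention: both roles
# receive the same opt_info dict instead of a pre-serialized dist_desc.

def launch(fleet, opt_info, role):
    # fleet: a paddle.fluid.distributed.fleet.Fleet instance (assumed ready)
    # opt_info: the dict produced by the DownpourSGD optimizer
    if role == "pserver":
        fleet.init_pserver(opt_info)  # serializes opt_info["fleet_desc"] itself
    else:
        fleet.init_worker(opt_info)   # either call exits if the key is missing
```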
python/paddle/fluid/executor.py

The executor now generates the trainer descriptor itself, just before preparing the dataset.

```diff
@@ -630,6 +630,7 @@ class Executor(object):
             trainer.set_thread(dataset.thread_num)
         else:
             trainer.set_thread(thread)
+        trainer.gen_trainer_desc()
         dataset._prepare_to_run()
         print("run_from_dataset called")
         self._default_executor.run_from_dataset(program.desc, scope,
```
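For illustration, a stub trainer (invented here; the real method configures far more state) makes the ordering this one-line change enforces explicit:

```python
# Stub illustrating the call order in the executor's run-from-dataset path
# after this commit: thread count first, then descriptor generation, then
# dataset preparation.

class StubTrainer(object):
    def set_thread(self, n):
        self.thread_num = n

    def gen_trainer_desc(self):
        # previously driven elsewhere; now invoked by the executor itself
        self.desc_ready = True


trainer = StubTrainer()
trainer.set_thread(4)        # thread count fixed first
trainer.gen_trainer_desc()   # descriptor generated before the dataset runs
assert trainer.desc_ready
```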
python/paddle/fluid/trainer_desc.py

TrainerDesc now carries its device worker and fleet description as state (set_device_worker / set_fleet_desc), and gen_trainer_desc() loses its parameters: subclasses build the proto from stored state instead. (Note that MultiTrainer's new gen_trainer_desc passes a bare fleet_desc_ rather than self.fleet_desc_, a latent NameError.)

```diff
@@ -32,19 +32,19 @@ class TrainerDesc(object):
         import multiprocessing as mp
         # set default thread num == cpu count
         self.proto_desc.thread_num = mp.cpu_count()
+        self.fleet_desc_ = None
+        self.device_worker_ = None

     def set_thread(self, thread_num):
         self.proto_desc.thread_num = thread_num

     def set_filelist(self, filelist):
         self.proto_desc.filelist.extend(filelist)
         self.proto_desc.thread_num = min(
             len(filelist), self.proto_desc.thread_num)

+    def set_device_worker(self, device_worker):
+        self.device_worker_ = device_worker
+
     def set_data_feed(self, datafeed):
         self.proto_desc.data_desc.CopyFrom(datafeed.proto_desc)

+    def set_fleet_desc(self, fleet_desc):
+        self.fleet_desc_ = fleet_desc
+
-    def gen_trainer_desc(self, dataset=None, fleet_desc=None, worker=None):
+    def gen_trainer_desc(self):
         pass

     def _desc(self):
@@ -52,17 +52,14 @@ class TrainerDesc(object):
 class MultiTrainer(TrainerDesc):
-    def __init__(self, dataset=None, worker="Hogwild"):
+    def __init__(self):
         super(MultiTrainer, self).__init__()
-        if worker == "Hogwild":
-            self.proto_desc.device_worker_name = worker + "Worker"
-            self.proto_desc.class_name = "MultiTrainer"
-        else:
-            raise ValueError('ValueError: DeviceWorker %s '
-                             'is not supported in MultiTrainer' % worker)
+        pass

-    def gen_trainer_desc(self, dataset=None, fleet_desc=None, worker="Hogwild"):
-        super(MultiTrainer, self).gen_trainer_desc(fleet_desc, worker)
+    def gen_trainer_desc(self):
+        super(MultiTrainer, self).gen_trainer_desc()
+        self.proto_desc.class_name = "MultiTrainer"
+        self.device_worker_.gen_worker_desc(self.proto_desc, fleet_desc_)


 class DistMultiTrainer(TrainerDesc):
@@ -70,14 +67,10 @@ class DistMultiTrainer(TrainerDesc):
         super(DistMultiTrainer, self).__init__()
         pass

-    def gen_trainer_desc(self, dataset=None, fleet_desc=None, worker="Downpour"):
-        super(DistMultiTrainer, self).gen_trainer_desc(fleet_desc, worker)
+    def gen_trainer_desc(self):
+        super(DistMultiTrainer, self).gen_trainer_desc()
         self.proto_desc.class_name = "DistMultiTrainer"
-        self.proto_desc.data_desc.CopyFrom(dataset.proto_desc)
-        worker_builder = DeviceWorkerFactory()
-        device_worker = worker_builder.create_device_worker("Downpour")
-        device_worker.gen_worker_desc(self.proto_desc, fleet_desc)
+        self.device_worker_.gen_worker_desc(self.proto_desc, self.fleet_desc_)

     def set_program_config(self, fleet_desc, program_id):
         for program_config in fleet_desc.trainer_param.program_config:
```
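A minimal sketch with stub classes (a plain dict stands in for the protobuf) of the protocol this refactor establishes: inject state through setters, then call the zero-argument gen_trainer_desc():

```python
# Stub version of the setter-then-generate protocol in trainer_desc.py.

class StubWorker(object):
    def gen_worker_desc(self, proto_desc, fleet_desc):
        proto_desc["device_worker"] = type(self).__name__


class StubTrainerDesc(object):
    def __init__(self):
        self.proto_desc = {}        # stands in for the real protobuf
        self.device_worker_ = None
        self.fleet_desc_ = None

    def set_device_worker(self, worker):
        self.device_worker_ = worker

    def set_fleet_desc(self, desc):
        self.fleet_desc_ = desc

    def gen_trainer_desc(self):
        # state was injected beforehand, so no parameters are needed
        self.proto_desc["class_name"] = "DistMultiTrainer"
        self.device_worker_.gen_worker_desc(self.proto_desc, self.fleet_desc_)


t = StubTrainerDesc()
t.set_device_worker(StubWorker())
t.set_fleet_desc({"tables": []})
t.gen_trainer_desc()
print(t.proto_desc)  # {'class_name': 'DistMultiTrainer', 'device_worker': 'StubWorker'}
```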
python/paddle/fluid/trainer_factory.py

The DownpourSGD special case is gone: the factory instantiates the classes named by opt_info["trainer"] and opt_info["device_worker"] via globals() and wires them together. (Note that the final gen_trainer_desc call still passes fleet_desc=, which the new zero-argument signature does not accept.)

```diff
@@ -20,13 +20,20 @@ class TrainerFactory(object):
         pass

     def create_trainer(self, opt_info=None):
         trainer = None
+        device_worker = None
         if opt_info == None:
-            return MultiTrainer()
+            # default is MultiTrainer + Hogwild
+            trainer = MultiTrainer()
+            device_worker = Hogwild()
+            trainer.set_device_worker(device_worker)
+            trainer.gen_trainer_desc()
         else:
-            if opt_info["optimizer"] == "DownpourSGD":
-                trainer = DistMultiTrainer()
-                trainer.gen_trainer_desc(
-                    fleet_desc=opt_info["fleet"], worker="downpour")
-                return trainer
-            else:
-                print("Currently only support DownpourSGD")
+            trainer_class = opt_info["trainer"]
+            device_worker_class = opt_info["device_worker"]
+            trainer = globals()[trainer_class]()
+            device_worker = globals()[device_worker_class]()
+            trainer.set_device_worker(device_worker)
+            trainer.set_fleet_desc(opt_info["fleet_desc"])
+            trainer.gen_trainer_desc(fleet_desc=opt_info["fleet_desc"])
+        return trainer
```
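A hedged usage sketch of how the refactored factory is meant to be driven; the import path matches this commit's tree, but the driver code and the None placeholder for the protobuf are invented:

```python
# Driving the refactored factory. In real use opt_info comes from the
# DownpourSGD optimizer in distributed/downpour.py.
from paddle.fluid.trainer_factory import TrainerFactory  # assumed import path

factory = TrainerFactory()

# No opt_info: MultiTrainer wired with a Hogwild device worker.
local_trainer = factory.create_trainer()

# With opt_info: trainer and worker classes are resolved by exact name.
opt_info = {
    "trainer": "DistMultiTrainer",
    "device_worker": "DownpourSGD",
    "fleet_desc": None,  # placeholder for the ps_param protobuf
}
dist_trainer = factory.create_trainer(opt_info)
```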