Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
6af697ad
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6af697ad
编写于
3月 15, 2019
作者:
D
dongdaxiang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add trainfileswithprofiler for downpour worker
上级
2644b886
变更
7
显示空白变更内容
内联
并排
Showing
7 changed file
with
203 addition
and
10 deletion
+203
-10
paddle/fluid/framework/dist_multi_trainer.cc
paddle/fluid/framework/dist_multi_trainer.cc
+12
-0
paddle/fluid/framework/downpour_worker.cc
paddle/fluid/framework/downpour_worker.cc
+168
-6
paddle/fluid/framework/hogwild_worker.cc
paddle/fluid/framework/hogwild_worker.cc
+1
-1
paddle/fluid/framework/trainer.h
paddle/fluid/framework/trainer.h
+1
-0
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+6
-1
python/paddle/fluid/incubate/fleet/base/role_maker.py
python/paddle/fluid/incubate/fleet/base/role_maker.py
+2
-2
python/paddle/fluid/trainer_desc.py
python/paddle/fluid/trainer_desc.py
+13
-0
未找到文件。
paddle/fluid/framework/dist_multi_trainer.cc
浏览文件 @
6af697ad
...
...
@@ -53,6 +53,18 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc& main_program) {
VLOG
(
3
)
<<
"init other env done."
;
}
void
DistMultiTrainer
::
Run
()
{
for
(
int
thidx
=
0
;
thidx
<
thread_num_
;
++
thidx
)
{
if
(
!
debug_
)
{
threads_
.
push_back
(
std
::
thread
(
&
DeviceWorker
::
TrainFiles
,
workers_
[
thidx
].
get
()));
}
else
{
threads_
.
push_back
(
std
::
thread
(
&
DeviceWorker
::
TrainFilesWithProfiler
,
workers_
[
thidx
].
get
()));
}
}
}
void
DistMultiTrainer
::
Finalize
()
{
for
(
auto
&
th
:
threads_
)
{
th
.
join
();
...
...
paddle/fluid/framework/downpour_worker.cc
浏览文件 @
6af697ad
...
...
@@ -82,14 +82,10 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) {
auto
&
feature
=
features_
[
table_id
];
auto
&
feature_label
=
feature_labels_
[
table_id
];
feature_label
.
resize
(
feature
.
size
());
VLOG
(
3
)
<<
"going to get label_var_name "
<<
label_var_name_
[
table_id
];
Variable
*
var
=
thread_scope_
->
FindVar
(
label_var_name_
[
table_id
]);
VLOG
(
3
)
<<
"going to get tensor"
;
LoDTensor
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
VLOG
(
3
)
<<
"going to get ptr"
;
int64_t
*
label_ptr
=
tensor
->
data
<
int64_t
>
();
VLOG
(
3
)
<<
"lele"
;
int
global_index
=
0
;
for
(
size_t
i
=
0
;
i
<
sparse_key_names_
[
table_id
].
size
();
++
i
)
{
VLOG
(
3
)
<<
"sparse_key_names_["
<<
i
...
...
@@ -98,7 +94,6 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) {
LoDTensor
*
tensor
=
fea_var
->
GetMutable
<
LoDTensor
>
();
int64_t
*
ids
=
tensor
->
data
<
int64_t
>
();
int
fea_idx
=
0
;
VLOG
(
3
)
<<
"Haha"
;
// tensor->lod()[0].size() == batch_size + 1
for
(
auto
lod_idx
=
1u
;
lod_idx
<
tensor
->
lod
()[
0
].
size
();
++
lod_idx
)
{
for
(;
fea_idx
<
tensor
->
lod
()[
0
][
lod_idx
];
++
fea_idx
)
{
...
...
@@ -110,7 +105,6 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) {
static_cast
<
float
>
(
label_ptr
[
lod_idx
-
1
]);
}
}
VLOG
(
3
)
<<
"EE"
;
}
CHECK
(
global_index
==
feature
.
size
())
<<
"expect fea info size:"
<<
feature
.
size
()
<<
" real:"
<<
global_index
;
...
...
@@ -163,6 +157,174 @@ void DownpourWorker::FillSparseValue(size_t table_idx) {
void
DownpourWorker
::
TrainFilesWithProfiler
()
{
VLOG
(
3
)
<<
"Begin to train files with profiler"
;
platform
::
SetNumThreads
(
1
);
device_reader_
->
Start
();
std
::
vector
<
double
>
op_total_time
;
std
::
vector
<
std
::
string
>
op_name
;
for
(
auto
&
op
:
ops_
)
{
bool
need_skip
=
false
;
for
(
auto
t
=
0u
;
t
<
skip_ops_
.
size
();
++
t
)
{
if
(
op
->
Type
().
find
(
skip_ops_
[
t
])
!=
std
::
string
::
npos
)
{
need_skip
=
true
;
break
;
}
}
if
(
!
need_skip
)
{
op_name
.
push_back
(
op
->
Type
());
}
}
VLOG
(
3
)
<<
"op name size: "
<<
op_name
.
size
();
op_total_time
.
resize
(
op_name
.
size
());
for
(
size_t
i
=
0
;
i
<
op_total_time
.
size
();
++
i
)
{
op_total_time
[
i
]
=
0.0
;
}
platform
::
Timer
timeline
;
double
total_time
=
0.0
;
double
read_time
=
0.0
;
double
pull_sparse_time
=
0.0
;
double
collect_label_time
=
0.0
;
double
fill_sparse_time
=
0.0
;
double
push_sparse_time
=
0.0
;
double
push_dense_time
=
0.0
;
int
cur_batch
;
int
batch_cnt
=
0
;
timeline
.
Start
();
while
((
cur_batch
=
device_reader_
->
Next
())
>
0
)
{
timeline
.
Pause
();
read_time
+=
timeline
.
ElapsedSec
();
total_time
+=
timeline
.
ElapsedSec
();
VLOG
(
3
)
<<
"program config size: "
<<
param_
.
program_config_size
();
for
(
size_t
i
=
0
;
i
<
param_
.
program_config
(
0
).
pull_sparse_table_id_size
();
++
i
)
{
uint64_t
tid
=
static_cast
<
uint64_t
>
(
param_
.
program_config
(
0
).
pull_sparse_table_id
(
i
));
TableParameter
table
;
for
(
auto
i
:
param_
.
sparse_table
())
{
if
(
i
.
table_id
()
==
tid
)
{
table
=
i
;
break
;
}
}
timeline
.
Start
();
fleet_ptr_
->
PullSparseVarsSync
(
*
thread_scope_
,
tid
,
sparse_key_names_
[
tid
],
&
features_
[
tid
],
&
feature_values_
[
tid
],
table
.
fea_dim
());
timeline
.
Pause
();
pull_sparse_time
+=
timeline
.
ElapsedSec
();
CollectLabelInfo
(
i
);
timeline
.
Pause
();
collect_label_time
+=
timeline
.
ElapsedSec
();
timeline
.
Start
();
FillSparseValue
(
i
);
timeline
.
Pause
();
fill_sparse_time
+=
timeline
.
ElapsedSec
();
}
VLOG
(
3
)
<<
"Fill sparse value for all sparse table done."
;
int
run_op_idx
=
0
;
for
(
auto
&
op
:
ops_
)
{
bool
need_skip
=
false
;
for
(
auto
t
=
0u
;
t
<
skip_ops_
.
size
();
++
t
)
{
if
(
op
->
Type
().
find
(
skip_ops_
[
t
])
!=
std
::
string
::
npos
)
{
need_skip
=
true
;
break
;
}
}
if
(
!
need_skip
)
{
timeline
.
Start
();
op
->
Run
(
*
thread_scope_
,
place_
);
timeline
.
Pause
();
op_total_time
[
run_op_idx
++
]
+=
timeline
.
ElapsedSec
();
total_time
+=
timeline
.
ElapsedSec
();
}
}
for
(
size_t
i
=
0
;
i
<
param_
.
program_config
(
0
).
push_sparse_table_id_size
();
++
i
)
{
uint64_t
tid
=
static_cast
<
uint64_t
>
(
param_
.
program_config
(
0
).
push_sparse_table_id
(
i
));
TableParameter
table
;
for
(
auto
i
:
param_
.
sparse_table
())
{
if
(
i
.
table_id
()
==
tid
)
{
table
=
i
;
break
;
}
}
timeline
.
Start
();
fleet_ptr_
->
PushSparseVarsWithLabelAsync
(
*
thread_scope_
,
tid
,
features_
[
tid
],
feature_labels_
[
tid
],
sparse_key_names_
[
tid
],
sparse_grad_names_
[
tid
],
table
.
emb_dim
(),
&
feature_grads_
[
tid
],
&
push_sparse_status_
);
timeline
.
Pause
();
push_sparse_time
+=
timeline
.
ElapsedSec
();
}
timeline
.
Start
();
for
(
size_t
i
=
0
;
i
<
param_
.
program_config
(
0
).
push_dense_table_id_size
();
++
i
)
{
uint64_t
tid
=
static_cast
<
uint64_t
>
(
param_
.
program_config
(
0
).
push_dense_table_id
(
i
));
fleet_ptr_
->
PushDenseVarsAsync
(
*
thread_scope_
,
tid
,
dense_grad_names_
[
tid
],
&
push_sparse_status_
);
}
timeline
.
Pause
();
push_dense_time
+=
timeline
.
ElapsedSec
();
VLOG
(
3
)
<<
"push sparse and dense gradient done."
;
int32_t
tmp_push_dense_wait_times
=
-
1
;
int32_t
tmp_push_sparse_wait_times
=
-
1
;
static
uint32_t
push_dense_wait_times
=
static_cast
<
uint32_t
>
(
tmp_push_dense_wait_times
);
static
uint32_t
push_sparse_wait_times
=
static_cast
<
uint32_t
>
(
tmp_push_sparse_wait_times
);
if
(
push_dense_status_
.
size
()
>=
push_dense_wait_times
)
{
for
(
auto
&
t
:
push_dense_status_
)
{
t
.
wait
();
}
push_dense_status_
.
resize
(
0
);
}
if
(
tmp_push_dense_wait_times
==
-
1
)
{
push_dense_status_
.
resize
(
0
);
}
if
(
push_sparse_status_
.
size
()
>=
push_sparse_wait_times
)
{
for
(
auto
&
t
:
push_sparse_status_
)
{
t
.
wait
();
}
push_sparse_status_
.
resize
(
0
);
}
if
(
tmp_push_sparse_wait_times
==
-
1
)
{
push_sparse_status_
.
resize
(
0
);
}
VLOG
(
3
)
<<
"going to increase thread version"
;
VLOG
(
3
)
<<
"push dense table id size: "
<<
param_
.
program_config
(
0
).
push_dense_table_id_size
();
for
(
size_t
i
=
0
;
i
<
param_
.
program_config
(
0
).
push_dense_table_id_size
();
++
i
)
{
uint64_t
tid
=
static_cast
<
uint64_t
>
(
param_
.
program_config
(
0
).
push_dense_table_id
(
i
));
pull_dense_worker_
->
IncreaseThreadVersion
(
thread_id_
,
tid
);
}
thread_scope_
->
DropKids
();
++
batch_cnt
;
if
(
thread_id_
==
0
)
{
// should be configured here
if
(
batch_cnt
>
0
&&
batch_cnt
%
100
==
0
)
{
for
(
size_t
i
=
0
;
i
<
op_total_time
.
size
();
++
i
)
{
fprintf
(
stderr
,
"op_name:[%zu][%s], op_mean_time:[%fs]
\n
"
,
i
,
op_name
[
i
].
c_str
(),
op_total_time
[
i
]
/
batch_cnt
);
}
fprintf
(
stderr
,
"mean read time: %fs
\n
"
,
read_time
/
batch_cnt
);
fprintf
(
stderr
,
"IO percent: %f
\n
"
,
read_time
/
total_time
*
100
);
}
}
}
}
void
DownpourWorker
::
TrainFiles
()
{
...
...
paddle/fluid/framework/hogwild_worker.cc
浏览文件 @
6af697ad
...
...
@@ -90,7 +90,7 @@ void HogwildWorker::TrainFilesWithProfiler() {
int
batch_cnt
=
0
;
timeline
.
Start
();
while
((
cur_batch
=
device_reader_
->
Next
())
>
0
)
{
LOG
(
WARNING
)
<<
"read a batch in thread "
<<
thread_id_
;
VLOG
(
3
)
<<
"read a batch in thread "
<<
thread_id_
;
timeline
.
Pause
();
read_time
+=
timeline
.
ElapsedSec
();
total_time
+=
timeline
.
ElapsedSec
();
...
...
paddle/fluid/framework/trainer.h
浏览文件 @
6af697ad
...
...
@@ -83,6 +83,7 @@ class DistMultiTrainer : public MultiTrainer {
virtual
~
DistMultiTrainer
()
{}
virtual
void
Initialize
(
const
TrainerDesc
&
trainer_desc
,
Dataset
*
data_set
);
virtual
void
InitOtherEnv
(
const
ProgramDesc
&
main_program
);
virtual
void
Run
();
virtual
void
Finalize
();
protected:
...
...
python/paddle/fluid/executor.py
浏览文件 @
6af697ad
...
...
@@ -627,7 +627,7 @@ class Executor(object):
fetch_list
=
None
,
scope
=
None
,
thread
=
0
,
opt_info
=
Non
e
):
debug
=
Fals
e
):
if
scope
is
None
:
scope
=
global_scope
()
if
fetch_list
is
None
:
...
...
@@ -636,6 +636,8 @@ class Executor(object):
if
not
compiled
:
trainer
=
TrainerFactory
().
create_trainer
(
program
.
_fleet_opt
)
trainer
.
set_program
(
program
)
with
open
(
"fleet_desc.prototxt"
,
"w"
)
as
fout
:
fout
.
write
(
str
(
program
.
_fleet_opt
[
"fleet_desc"
]))
else
:
trainer
=
TrainerFactory
().
create_trainer
(
program
.
program
.
_fleet_opt
)
...
...
@@ -644,8 +646,11 @@ class Executor(object):
trainer
.
set_thread
(
dataset
.
thread_num
)
else
:
trainer
.
set_thread
(
thread
)
trainer
.
set_debug
(
debug
)
trainer
.
gen_trainer_desc
()
dataset
.
_prepare_to_run
()
with
open
(
"trainer_desc.prototxt"
,
"w"
)
as
fout
:
fout
.
write
(
trainer
.
_desc
())
self
.
_default_executor
.
run_from_dataset
(
program
.
desc
,
scope
,
dataset
.
dataset
,
trainer
.
_desc
())
python/paddle/fluid/incubate/fleet/base/role_maker.py
浏览文件 @
6af697ad
...
...
@@ -101,10 +101,10 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return
self
.
get_size
()
def
worker_index
(
self
):
return
self
.
rank
/
self
.
proc_per_node_
return
self
.
rank
_
/
self
.
proc_per_node_
def
server_index
(
self
):
return
self
.
rank
/
self
.
proc_per_node_
return
self
.
rank
_
/
self
.
proc_per_node_
def
barrier_worker
(
self
):
if
self
.
is_worker
():
...
...
python/paddle/fluid/trainer_desc.py
浏览文件 @
6af697ad
...
...
@@ -36,6 +36,9 @@ class TrainerDesc(object):
self
.
device_worker_
=
None
self
.
program_
=
None
def
set_debug
(
self
,
debug
):
self
.
proto_desc
.
debug
=
debug
def
set_thread
(
self
,
thread_num
):
self
.
proto_desc
.
thread_num
=
thread_num
...
...
@@ -60,6 +63,10 @@ class MultiTrainer(TrainerDesc):
super
(
MultiTrainer
,
self
).
__init__
()
pass
def
set_program
(
self
,
program
):
super
(
MultiTrainer
,
self
).
set_program
(
program
)
self
.
program_
=
program
def
gen_trainer_desc
(
self
):
super
(
MultiTrainer
,
self
).
gen_trainer_desc
()
self
.
proto_desc
.
class_name
=
"MultiTrainer"
...
...
@@ -71,8 +78,14 @@ class DistMultiTrainer(TrainerDesc):
super
(
DistMultiTrainer
,
self
).
__init__
()
pass
def
set_program
(
self
,
program
):
super
(
DistMultiTrainer
,
self
).
set_program
(
program
)
self
.
program_
=
program
def
gen_trainer_desc
(
self
):
super
(
DistMultiTrainer
,
self
).
gen_trainer_desc
()
self
.
proto_desc
.
class_name
=
"DistMultiTrainer"
if
self
.
program_
==
None
:
print
(
"None program"
)
self
.
device_worker_
.
set_program
(
self
.
program_
)
self
.
device_worker_
.
gen_worker_desc
(
self
.
proto_desc
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录