Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
6c6a7a14
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6c6a7a14
编写于
9月 02, 2019
作者:
X
xiexionghang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
mpi control && trainer-net update && bug fix
上级
50e6bfc0
变更
36
隐藏空白更改
内联
并排
Showing
36 changed file
with
498 addition
and
106 deletion
+498
-106
paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc
...rain/custom_trainer/feed/accessor/dense_input_accessor.cc
+43
-2
paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc
...luid/train/custom_trainer/feed/accessor/epoch_accessor.cc
+58
-22
paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h
...fluid/train/custom_trainer/feed/accessor/epoch_accessor.h
+8
-2
paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h
.../train/custom_trainer/feed/accessor/input_data_accessor.h
+2
-1
paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc
...ain/custom_trainer/feed/accessor/sparse_input_accessor.cc
+62
-8
paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc
...e/fluid/train/custom_trainer/feed/common/pslib_warpper.cc
+1
-0
paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc
...d/train/custom_trainer/feed/common/runtime_environment.cc
+17
-2
paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
...id/train/custom_trainer/feed/common/runtime_environment.h
+21
-0
paddle/fluid/train/custom_trainer/feed/conf/env.conf
paddle/fluid/train/custom_trainer/feed/conf/env.conf
+19
-0
paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml
paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml
+2
-2
paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc
...le/fluid/train/custom_trainer/feed/dataset/data_reader.cc
+1
-0
paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc
...ain/custom_trainer/feed/executor/multi_thread_executor.cc
+29
-13
paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h
...rain/custom_trainer/feed/executor/multi_thread_executor.h
+23
-0
paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc
.../fluid/train/custom_trainer/feed/io/hadoop_file_system.cc
+1
-1
paddle/fluid/train/custom_trainer/feed/io/shell.cc
paddle/fluid/train/custom_trainer/feed/io/shell.cc
+1
-0
paddle/fluid/train/custom_trainer/feed/main.cc
paddle/fluid/train/custom_trainer/feed/main.cc
+2
-0
paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.cc
...le/fluid/train/custom_trainer/feed/monitor/auc_monitor.cc
+8
-1
paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h
paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h
+2
-2
paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.cc
...e/fluid/train/custom_trainer/feed/monitor/cost_monitor.cc
+4
-2
paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h
...le/fluid/train/custom_trainer/feed/monitor/cost_monitor.h
+1
-3
paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
+3
-2
paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
...luid/train/custom_trainer/feed/process/learner_process.cc
+20
-23
paddle/fluid/train/custom_trainer/feed/process/learner_process.h
...fluid/train/custom_trainer/feed/process/learner_process.h
+0
-2
paddle/fluid/train/custom_trainer/feed/scripts/compake_runable_package.sh
...in/custom_trainer/feed/scripts/compake_runable_package.sh
+44
-0
paddle/fluid/train/custom_trainer/feed/scripts/join.py
paddle/fluid/train/custom_trainer/feed/scripts/join.py
+21
-10
paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program
...train/custom_trainer/feed/scripts/model/join/main_program
+0
-0
paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml
...d/train/custom_trainer/feed/scripts/model/join/model.yaml
+1
-1
paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program
...in/custom_trainer/feed/scripts/model/join/startup_program
+0
-0
paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program
...train/custom_trainer/feed/scripts/model/join/test_program
+0
-0
paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program
...ain/custom_trainer/feed/scripts/model/update/main_program
+0
-0
paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program
.../custom_trainer/feed/scripts/model/update/startup_program
+0
-0
paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program
...ain/custom_trainer/feed/scripts/model/update/test_program
+0
-0
paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh
...d/train/custom_trainer/feed/scripts/start_feed_trainer.sh
+50
-2
paddle/fluid/train/custom_trainer/feed/scripts/submit_mpi.sh
paddle/fluid/train/custom_trainer/feed/scripts/submit_mpi.sh
+32
-0
paddle/fluid/train/custom_trainer/feed/scripts/update.py
paddle/fluid/train/custom_trainer/feed/scripts/update.py
+20
-5
paddle/fluid/train/custom_trainer/feed/trainer_context.h
paddle/fluid/train/custom_trainer/feed/trainer_context.h
+2
-0
未找到文件。
paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc
浏览文件 @
6c6a7a14
#include <sstream>
#include "gflags/gflags.h"
#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h"
namespace
paddle
{
namespace
custom_trainer
{
namespace
feed
{
// Command-line flag (gflags), empty by default. When it is non-empty, the
// dense accessor logs at VLOG(2) the pulled values and pushed gradients of the
// variable whose name equals the flag value.
DEFINE_string(feed_trainer_debug_dense_name, "",
              "open dense debug for specif layer_name");
int
DenseInputAccessor
::
initialize
(
YAML
::
Node
config
,
std
::
shared_ptr
<
TrainerContext
>
context_ptr
)
{
...
...
@@ -85,7 +89,6 @@ int32_t DenseInputAccessor::forward(SampleInstance* samples, size_t num,
}
_pull_mutex
.
unlock
();
}
size_t
data_buffer_idx
=
0
;
for
(
auto
&
variable
:
_x_variables
)
{
auto
*
shape_ptr
=
&
(
variable
.
shape
[
0
]);
...
...
@@ -97,6 +100,26 @@ int32_t DenseInputAccessor::forward(SampleInstance* samples, size_t num,
memcpy
(
var_data
,
_data_buffer
+
data_buffer_idx
,
variable
.
dim
*
sizeof
(
float
));
data_buffer_idx
+=
variable
.
dim
;
}
if
(
!
FLAGS_feed_trainer_debug_dense_name
.
empty
())
{
data_buffer_idx
=
0
;
std
::
stringstream
ssm
;
for
(
auto
&
variable
:
_x_variables
)
{
if
(
variable
.
name
!=
FLAGS_feed_trainer_debug_dense_name
)
{
data_buffer_idx
+=
variable
.
dim
;
continue
;
}
ssm
.
str
(
""
);
auto
&
tensor
=
ScopeHelper
::
var
<
paddle
::
framework
::
LoDTensor
>
(
scope
,
variable
.
name
);
const
auto
*
var_data
=
tensor
.
data
<
float
>
();
for
(
size_t
data_idx
=
0
;
data_idx
<
variable
.
dim
;
++
data_idx
)
{
if
(
data_idx
>
0
)
ssm
<<
","
;
ssm
<<
_data_buffer
[
data_buffer_idx
+
data_idx
];
}
data_buffer_idx
+=
variable
.
dim
;
VLOG
(
2
)
<<
"[DEBUG]pull_dense: "
<<
ssm
.
str
();
}
}
if
(
_need_async_pull
)
{
++
_pull_request_num
;
}
...
...
@@ -118,7 +141,25 @@ int32_t DenseInputAccessor::backward(SampleInstance* samples, size_t num,
}
auto
*
ps_client
=
_trainer_context
->
pslib
->
ps_client
();
auto
push_status
=
ps_client
->
push_dense
(
regions
.
data
(),
regions
.
size
(),
_table_id
);
//return push_status.get();
//push_status.get();
if
(
!
FLAGS_feed_trainer_debug_dense_name
.
empty
())
{
std
::
stringstream
ssm
;
for
(
auto
&
variable
:
_x_variables
)
{
ssm
.
str
(
""
);
if
(
variable
.
name
!=
FLAGS_feed_trainer_debug_dense_name
)
{
continue
;
}
auto
&
tensor
=
scope
->
Var
(
variable
.
gradient_name
)
->
Get
<
paddle
::
framework
::
LoDTensor
>
();
const
auto
*
var_data
=
tensor
.
data
<
float
>
();
for
(
size_t
data_idx
=
0
;
data_idx
<
variable
.
dim
;
++
data_idx
)
{
if
(
data_idx
>
0
)
ssm
<<
","
;
ssm
<<
var_data
[
data_idx
];
}
VLOG
(
2
)
<<
"[DEBUG]push_dense: "
<<
ssm
.
str
();
}
}
return
0
;
}
...
...
paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc
浏览文件 @
6c6a7a14
...
...
@@ -15,22 +15,22 @@ namespace feed {
return
-
1
;
}
auto
fs
=
_trainer_context
->
file_system
.
get
();
if
(
config
[
"donefile"
])
{
_done_file_path
=
fs
->
path_join
(
_model_root_path
,
config
[
"donefile"
].
as
<
std
::
string
>
());
}
else
{
_done_file_path
=
fs
->
path_join
(
_model_root_path
,
"epoch_donefile.txt"
);
}
_done_file_path
=
fs
->
path_join
(
_model_root_path
,
config
[
"donefile"
].
as
<
std
::
string
>
(
"epoch_donefile.txt"
));
if
(
!
fs
->
exists
(
_done_file_path
))
{
VLOG
(
0
)
<<
"missing done file, path:"
<<
_done_file_path
;
return
-
1
;
}
std
::
string
done_text
=
fs
->
tail
(
_done_file_path
);
_done_status
=
paddle
::
string
::
split_string
(
done_text
,
std
::
string
(
"
\t
"
));
_current_epoch_id
=
get_status
<
uint64_t
>
(
EpochStatusFiled
::
EpochIdField
);
_last_checkpoint_epoch_id
=
get_status
<
uint64_t
>
(
EpochStatusFiled
::
CheckpointIdField
);
_last_checkpoint_path
=
get_status
<
std
::
string
>
(
EpochStatusFiled
::
CheckpointPathField
);
_inference_base_model_key
=
get_status
<
uint64_t
>
(
EpochStatusFiled
::
InferenceBaseKeyField
);
_inference_model_path
=
fs
->
path_join
(
_model_root_path
,
config
[
"inference_model_dir"
].
as
<
std
::
string
>
(
"xbox"
));
_inference_model_base_done_path
=
fs
->
path_join
(
_inference_model_path
,
config
[
"inference_base_done_name"
].
as
<
std
::
string
>
(
"xbox_base_done.txt"
));
_inference_model_delta_done_path
=
fs
->
path_join
(
_inference_model_path
,
config
[
"inference_delta_done_name"
].
as
<
std
::
string
>
(
"xbox_delta_done.txt"
));
return
0
;
}
...
...
@@ -46,31 +46,64 @@ namespace feed {
set_status
(
EpochStatusFiled
::
CheckpointIdField
,
_last_checkpoint_epoch_id
);
set_status
(
EpochStatusFiled
::
CheckpointPathField
,
_last_checkpoint_path
);
set_status
(
EpochStatusFiled
::
DateField
,
format_timestamp
(
epoch_id
,
"%Y%m%d"
));
// 非主节点不做状态持久化
if
(
!
_trainer_context
->
environment
->
is_master_node
(
EnvironmentRole
::
WORKER
))
{
set_status
(
EpochStatusFiled
::
InferenceBaseKeyField
,
_inference_base_model_key
);
return
0
;
}
int
EpochAccessor
::
update_model_donefile
(
uint64_t
epoch_id
,
ModelSaveWay
save_way
)
{
auto
*
env
=
_trainer_context
->
environment
.
get
();
// 非主节点不做done状态持久化
if
(
!
env
->
is_master_node
(
EnvironmentRole
::
WORKER
))
{
return
0
;
}
auto
fs
=
_trainer_context
->
file_system
.
get
();
std
::
string
done_str
=
paddle
::
string
::
join_strings
(
_done_status
,
'\t'
);
std
::
string
done_str
;
std
::
string
donefile
;
auto
model_path
=
model_save_path
(
epoch_id
,
save_way
);
std
::
string
inference_done_format
(
"{
\"
id
\"
:
\"
%lu
\"
,
\"
key
\"
:
\"
%lu
\"
,
\"
input
\"
:
\"
%s/000
\"
,
\"
record_count
\"
:
\"
1
\"
,
\"
file_format
\"
:
\"
pb
\"
,
\"
schema_version
\"
:
\"
2
\"
,
\"
partition_type
\"
:
\"
1
\"
,
\"
job_name
\"
:
\"
%s
\"
,
\"
job_id
\"
:
\"
%s
\"
,
\"
mpi_size
\"
:
\"
%d
\"
,
\"
monitor_data
\"
:
\"
%s
\"
}"
);
auto
id
=
time
(
NULL
);
switch
(
save_way
)
{
case
ModelSaveWay
::
ModelSaveTrainCheckpoint
:
donefile
=
_done_file_path
;
done_str
=
paddle
::
string
::
join_strings
(
_done_status
,
'\t'
);
break
;
case
ModelSaveWay
::
ModelSaveInferenceDelta
:
donefile
=
_inference_model_delta_done_path
;
done_str
=
string
::
format_string
(
inference_done_format
.
c_str
(),
id
,
_inference_base_model_key
,
model_path
.
c_str
(),
env
->
job_name
().
c_str
(),
env
->
job_id
().
c_str
(),
env
->
node_num
(
EnvironmentRole
::
PSERVER
),
_trainer_context
->
monitor_ssm
.
str
().
c_str
());
break
;
case
ModelSaveWay
::
ModelSaveInferenceBase
:
donefile
=
_inference_model_base_done_path
;
_inference_base_model_key
=
id
;
done_str
=
string
::
format_string
(
inference_done_format
.
c_str
(),
id
,
id
,
model_path
.
c_str
(),
env
->
job_name
().
c_str
(),
env
->
job_id
().
c_str
(),
env
->
node_num
(
EnvironmentRole
::
PSERVER
),
_trainer_context
->
monitor_ssm
.
str
().
c_str
());
break
;
}
// 保留末尾1000数据
std
::
string
tail_done_info
=
paddle
::
string
::
trim_spaces
(
fs
->
tail
(
_done_file_path
,
1000
));
std
::
string
tail_done_info
;
auto
fs
=
_trainer_context
->
file_system
.
get
();
if
(
fs
->
exists
(
donefile
))
{
tail_done_info
=
paddle
::
string
::
trim_spaces
(
fs
->
tail
(
donefile
,
1000
));
}
if
(
tail_done_info
.
size
()
>
0
)
{
tail_done_info
=
tail_done_info
+
"
\n
"
+
done_str
;
}
else
{
tail_done_info
=
done_str
;
}
VLOG
(
2
)
<<
"Write
epoch donefile to "
<<
_done_file_path
<<
", str:"
<<
done_str
;
VLOG
(
2
)
<<
"Write
donefile "
<<
donefile
<<
", str:"
<<
done_str
;
bool
write_success
=
false
;
while
(
true
)
{
fs
->
remove
(
_done_file_path
);
auto
fp
=
fs
->
open_write
(
_done_file_path
,
""
);
fs
->
remove
(
donefile
);
auto
fp
=
fs
->
open_write
(
donefile
,
""
);
if
(
fwrite
(
tail_done_info
.
c_str
(),
tail_done_info
.
length
(),
1
,
&*
fp
)
==
1
)
{
break
;
}
sleep
(
10
);
}
VLOG
(
2
)
<<
"Write
epoch donefile
success"
;
VLOG
(
2
)
<<
"Write
donefile "
<<
donefile
<<
"
success"
;
return
0
;
}
...
...
@@ -126,7 +159,10 @@ namespace feed {
case
ModelSaveWay
::
ModelSaveInferenceBase
:
return
is_last_epoch
(
epoch_id
);
case
ModelSaveWay
::
ModelSaveTrainCheckpoint
:
return
delta_id
(
epoch_id
)
%
8
==
0
;
if
(
is_last_epoch
(
epoch_id
))
{
return
true
;
}
return
delta_id
(
epoch_id
)
%
24
==
0
;
}
return
false
;
}
...
...
@@ -137,11 +173,11 @@ namespace feed {
std
::
string
date_with_hour
=
format_timestamp
(
epoch_id
,
"%Y%m%d%H"
);
switch
(
save_way
)
{
case
ModelSaveWay
::
ModelSaveInferenceDelta
:
return
_trainer_context
->
file_system
->
path_join
(
_
model_root
_path
,
string
::
format_string
(
"
xbox/
%s/delta-%d"
,
date
.
c_str
(),
delta
));
return
_trainer_context
->
file_system
->
path_join
(
_
inference_model
_path
,
string
::
format_string
(
"%s/delta-%d"
,
date
.
c_str
(),
delta
));
case
ModelSaveWay
::
ModelSaveInferenceBase
:
return
_trainer_context
->
file_system
->
path_join
(
_
model_root
_path
,
string
::
format_string
(
"
xbox/
%s/base"
,
date
.
c_str
()));
return
_trainer_context
->
file_system
->
path_join
(
_
inference_model
_path
,
string
::
format_string
(
"%s/base"
,
date
.
c_str
()));
case
ModelSaveWay
::
ModelSaveTrainCheckpoint
:
return
_trainer_context
->
file_system
->
path_join
(
_model_root_path
,
string
::
format_string
(
"batch_model/%s"
,
date_with_hour
.
c_str
()));
...
...
paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h
浏览文件 @
6c6a7a14
...
...
@@ -14,7 +14,8 @@ enum class EpochStatusFiled {
TimestampField
=
1
,
CheckpointPathField
=
2
,
EpochIdField
=
3
,
CheckpointIdField
=
4
CheckpointIdField
=
4
,
InferenceBaseKeyField
=
5
};
class
EpochAccessor
:
public
Accessor
{
...
...
@@ -62,14 +63,19 @@ public:
virtual
bool
need_save_model
(
uint64_t
epoch_id
,
ModelSaveWay
save_way
)
=
0
;
virtual
std
::
string
model_save_path
(
uint64_t
epoch_id
,
ModelSaveWay
save_way
)
=
0
;
virtual
int
update_model_donefile
(
uint64_t
epoch_id
,
ModelSaveWay
save_way
);
protected:
TrainerContext
*
_trainer_context
;
std
::
string
_done_file_path
;
std
::
string
_model_root_path
;
std
::
string
_inference_model_path
;
std
::
string
_inference_model_base_done_path
;
std
::
string
_inference_model_delta_done_path
;
uint64_t
_current_epoch_id
=
0
;
std
::
string
_last_checkpoint_path
;
uint64_t
_last_checkpoint_epoch_id
=
0
;
std
::
vector
<
std
::
string
>
_done_status
;
//当前完成状态,统一存成string
std
::
vector
<
std
::
string
>
_done_status
;
// 当前完成状态,统一存成string
uint64_t
_inference_base_model_key
=
0
;
// 预估模型的base-key
};
REGIST_REGISTERER
(
EpochAccessor
);
...
...
paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h
浏览文件 @
6c6a7a14
...
...
@@ -102,7 +102,8 @@ public:
paddle
::
framework
::
Scope
*
scope
);
// SparseGradValue会被依次调用,用于整理push的梯度
virtual
void
fill_gradient
(
float
*
push_value
,
const
float
*
gradient_raw
,
paddle
::
ps
::
ValueAccessor
&
,
SparseInputVariable
&
,
SampleInstance
&
)
=
0
;
paddle
::
ps
::
ValueAccessor
&
,
SparseInputVariable
&
,
SampleInstance
&
,
FeatureItem
&
)
=
0
;
protected:
// 输入层列表
...
...
paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc
浏览文件 @
6c6a7a14
#include <math.h>
#include <vector>
#include <utility>
#include <sstream>
#include "gflags/gflags.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/fluid/train/custom_trainer/feed/common/scope_helper.h"
#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h"
// Command-line flag (gflags), 0 by default. A non-zero value makes the sparse
// accessor log at VLOG(2) every feature whose slot equals the flag value.
DEFINE_int32(feed_trainer_debug_sparse_slot, 0,
             "open sparse debug for specif slot");
namespace
paddle
{
namespace
custom_trainer
{
namespace
feed
{
...
...
@@ -99,6 +103,30 @@ int32_t BaseSparseInputAccessor::forward(SampleInstance* samples,
}
}
}
if
(
FLAGS_feed_trainer_debug_sparse_slot
)
{
std
::
stringstream
ssm
;
for
(
size_t
samp_idx
=
0
;
samp_idx
<
num
;
++
samp_idx
)
{
ssm
.
str
(
""
);
auto
&
features
=
samples
[
samp_idx
].
features
;
for
(
auto
&
feature_item
:
features
)
{
for
(
size_t
i
=
0
;
i
<
_x_variables
.
size
();
++
i
)
{
auto
&
variable
=
_x_variables
[
i
];
if
(
feature_item
.
slot
()
!=
FLAGS_feed_trainer_debug_sparse_slot
)
{
continue
;
}
if
(
variable
.
slot_idx
[
feature_item
.
slot
()]
<
0
)
{
continue
;
}
ssm
<<
"("
<<
feature_item
.
sign
()
<<
","
<<
feature_item
.
slot
();
for
(
auto
weight
:
feature_item
.
weights
)
{
ssm
<<
","
<<
weight
;
}
ssm
<<
")"
;
}
}
VLOG
(
2
)
<<
"[DEBUG][sparse_slot_pull]"
<<
ssm
.
str
();
}
}
// Variable后置处理
for
(
size_t
i
=
0
;
i
<
_x_variables
.
size
();
++
i
)
{
auto
&
variable
=
_x_variables
[
i
];
...
...
@@ -145,12 +173,37 @@ int32_t BaseSparseInputAccessor::backward(SampleInstance* samples,
const
float
*
grad_data
=
var_runtime_data
[
i
].
gradient_data
+
samp_idx
*
variable
.
total_dim
+
variable
.
slot_dim
*
slot_idx
;
fill_gradient
(
&
(
feature_item
.
gradients
[
0
]),
grad_data
,
*
value_accessor
,
variable
,
samples
[
samp_idx
]);
*
value_accessor
,
variable
,
samples
[
samp_idx
]
,
feature_item
);
keys
[
key_idx
]
=
feature_item
.
sign
();
push_values
[
key_idx
++
]
=
&
(
feature_item
.
gradients
[
0
]);
}
}
}
if
(
FLAGS_feed_trainer_debug_sparse_slot
)
{
size_t
key_idx
=
0
;
std
::
stringstream
ssm
;
for
(
size_t
samp_idx
=
0
;
samp_idx
<
num
;
++
samp_idx
)
{
ssm
.
str
(
""
);
auto
&
features
=
samples
[
samp_idx
].
features
;
for
(
auto
&
feature_item
:
features
)
{
for
(
size_t
i
=
0
;
i
<
_x_variables
.
size
();
++
i
)
{
auto
&
variable
=
_x_variables
[
i
];
if
(
feature_item
.
slot
()
!=
FLAGS_feed_trainer_debug_sparse_slot
)
{
continue
;
}
if
(
variable
.
slot_idx
[
feature_item
.
slot
()]
<
0
)
{
continue
;
}
ssm
<<
"("
<<
feature_item
.
sign
()
<<
","
<<
feature_item
.
slot
();
for
(
auto
weight
:
feature_item
.
gradients
)
{
ssm
<<
","
<<
weight
;
}
ssm
<<
")"
;
}
}
VLOG
(
2
)
<<
"[DEBUG][sparse_slot_push]"
<<
ssm
.
str
();
}
}
auto
push_status
=
ps_client
->
push_sparse
(
_table_id
,
keys
.
data
(),
(
const
float
**
)
push_values
,
key_idx
);
//auto ret = push_status.get();
...
...
@@ -180,8 +233,8 @@ public:
}
virtual
void
fill_gradient
(
float
*
push_value
,
const
float
*
gradient_raw
,
paddle
::
ps
::
ValueAccessor
&
value_accessor
,
S
parseInputVariable
&
variable
,
SampleInstance
&
sampl
e
)
{
paddle
::
ps
::
ValueAccessor
&
value_accessor
,
SparseInputVariable
&
variable
,
S
ampleInstance
&
sample
,
FeatureItem
&
featur
e
)
{
// join阶段不回填梯度
CHECK
(
false
);
return
;
...
...
@@ -207,12 +260,13 @@ public:
}
// Accumulates one feature occurrence into its push buffer.
//
// push_value   - per-feature buffer being accumulated; it is indexed up to
//                variable.slot_dim + 2, so it must hold at least
//                variable.slot_dim + 3 floats.
// gradient_raw - variable.slot_dim gradient values for this feature.
// value_accessor - not used by this implementation.
// variable     - supplies slot_dim, the number of gradient values to add.
// sample       - supplies labels[0].
// feature      - supplies the slot id written to push_value[0].
//
// Layout written: [0] slot id (overwritten on every call), [1] += 1,
// [2] += sample.labels[0], [3 .. 3 + slot_dim) += gradient_raw.
// NOTE(review): [1] and [2] are presumably show/click counters expected by the
// parameter-server table -- confirm against the table's value accessor.
virtual void fill_gradient(float* push_value, const float* gradient_raw,
        paddle::ps::ValueAccessor& value_accessor,
        SparseInputVariable& variable, SampleInstance& sample,
        FeatureItem& feature) {
    push_value[0] = feature.slot();
    push_value[1] += 1;
    push_value[2] += sample.labels[0];
    // Gradients start after the three header entries above.
    for (size_t i = 0; i < variable.slot_dim; ++i) {
        push_value[i + 3] += gradient_raw[i];
    }
}
...
...
paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc
浏览文件 @
6c6a7a14
...
...
@@ -38,6 +38,7 @@ int PSlib::init_server() {
_environment
->
rank_id
(
EnvironmentRole
::
PSERVER
));
_server_ptr
->
start
();
}
_environment
->
barrier
(
EnvironmentRole
::
ALL
);
_environment
->
ps_environment
()
->
gather_ps_servers
();
return
0
;
}
...
...
paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc
浏览文件 @
6c6a7a14
...
...
@@ -56,7 +56,6 @@ struct mpi_type_trait<unsigned long long> {
return
MPI_UNSIGNED_LONG_LONG
;
}
};
RuntimeEnvironment
::
RuntimeEnvironment
()
{}
RuntimeEnvironment
::~
RuntimeEnvironment
()
{}
bool
RuntimeEnvironment
::
is_master_node
(
EnvironmentRole
role
)
{
...
...
@@ -87,13 +86,24 @@ public:
return
0
;
}
virtual
int
wireup
()
{
int
hr
=
MPI_Init
(
NULL
,
NULL
);
int
argc
=
0
;
char
**
argv
=
NULL
;
int
hr
=
MPI_Init
(
&
argc
,
&
argv
);
if
(
MPI_SUCCESS
!=
hr
)
{
LOG
(
FATAL
)
<<
"MPI_init failed with error code"
<<
hr
;
return
-
1
;
}
_roles_node_info
.
resize
(
static_cast
<
int
>
(
EnvironmentRole
::
ALL
)
+
1
);
add_role
(
EnvironmentRole
::
ALL
);
char
*
value
=
getenv
(
"JOB_ID"
);
if
(
value
)
{
_job_id
=
value
;
}
value
=
getenv
(
"JOB_NAME"
);
if
(
value
)
{
_job_name
=
value
;
}
return
0
;
}
...
...
@@ -155,6 +165,11 @@ protected:
return
;
}
VLOG
(
static_cast
<
int
>
(
level
))
<<
log_str
;
/*
static std::mutex mtx;
std::lock_guard<std::mutex> guard(mtx);
std::err << log_str;
*/
}
inline
MpiNodeInfo
&
mpi_node_info
(
EnvironmentRole
role
)
{
...
...
paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
浏览文件 @
6c6a7a14
...
...
@@ -46,6 +46,15 @@ public:
virtual
~
RuntimeEnvironment
();
// 配置初始化
virtual
int
initialize
(
YAML
::
Node
config
)
=
0
;
// Job information.
// Returns a copy of the job id stored in _job_id.
virtual std::string job_id() {
    return _job_id;
}
// Returns a copy of the job name stored in _job_name.
virtual std::string job_name() {
    return _job_name;
}
// 设置role
virtual
int
add_role
(
EnvironmentRole
role
)
=
0
;
// 判断role
...
...
@@ -90,9 +99,21 @@ public:
protected:
virtual
void
print_log
(
EnvironmentRole
role
,
EnvironmentLogType
type
,
EnvironmentLogLevel
level
,
const
std
::
string
&
log_str
)
=
0
;
std
::
string
_job_id
=
"default_job_id"
;
std
::
string
_job_name
=
"default_job_name"
;
};
REGIST_REGISTERER
(
RuntimeEnvironment
);
// Logging shorthands. Each macro expands to the *beginning* of an
// `environment->log(...)` call: the role, the log type, a level argument and a
// trailing comma. The use site supplies the remaining arguments and the
// closing parenthesis, and a pointer-like object named `environment` must be
// in scope there.
// NOTE(review): the third argument is spelled EnvironmentLogType::NOTICE /
// EnvironmentLogType::ERROR, while print_log() declares its third parameter as
// EnvironmentLogLevel -- confirm which enum actually defines these enumerators.
#define ENVLOG_WORKER_ALL_NOTICE \
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::ALL_LOG, EnvironmentLogType::NOTICE,
#define ENVLOG_WORKER_MASTER_NOTICE \
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogType::NOTICE,
#define ENVLOG_WORKER_ALL_ERROR \
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::ALL_LOG, EnvironmentLogType::ERROR,
#define ENVLOG_WORKER_MASTER_ERROR \
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogType::ERROR,
std
::
string
format_timestamp
(
time_t
time
,
const
char
*
format
);
inline
std
::
string
format_timestamp
(
time_t
time
,
const
std
::
string
&
format
)
{
return
format_timestamp
(
time
,
format
.
c_str
());
...
...
paddle/fluid/train/custom_trainer/feed/conf/env.conf
0 → 100644
浏览文件 @
6c6a7a14
HPC_HOME
=/
home
/
work
/
xiexionghang
/
trainer
/
paddle_trainer
/
feed_muye
/
smart_client
HADOOP_HOME
=/
home
/
work
/
xiexionghang
/
trainer
/
paddle_trainer
/
feed_muye
/
hadoop
-
client
/
hadoop
/
#===============Job-related config======================
MPI_JOB_NAME
=
feed_smfw_shoubai_video_cupai_new_arch
MPI_QUEUE
=
feed5
MPI_PRIORITY
=
high
MPI_NODE_NUM
=
100
MPI_WALL_TIME
=
700
:
00
:
00
MPI_NODE_MEM
=
100000
MPI_RESOURCE
=
full
#===========MPI cluster Server(nmg-off/10g/hlan)==========
MPI_SERVER
=
yq01
-
hpc
-
lvliang01
-
smart
-
master
.
dmop
.
baidu
.
com
#===========Cluster-related (HDFS/MPI Server)==============
HDFS_ROOT
=/
user
/
feed
/
mlarch
/
mio_temp
/$(
date
+%
Y
%
m
%
d
-%
H
%
M
%
S
-%
N
)
HADOOP_FS
=
afs
://
xingtian
.
afs
.
baidu
.
com
:
9902
HADOOP_UGI
=
mlarch
,
Fv1M87
paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml
浏览文件 @
6c6a7a14
...
...
@@ -44,7 +44,7 @@ executor:
train_batch_size
:
32
input_parse_thread_num
:
10
push_gradient_thread_num
:
16
train_thread_num
:
1
6
train_thread_num
:
1
2
need_dump_all_model
:
true
-
name
:
update
class
:
SimpleExecutor
...
...
@@ -52,5 +52,5 @@ executor:
train_batch_size
:
32
input_parse_thread_num
:
10
push_gradient_thread_num
:
16
train_thread_num
:
1
6
train_thread_num
:
1
2
need_dump_all_model
:
false
paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc
浏览文件 @
6c6a7a14
...
...
@@ -469,6 +469,7 @@ public:
return
read_all
(
file_list
,
data_channel
);
}
virtual
int
read_all
(
const
std
::
vector
<
std
::
string
>&
file_list
,
::
paddle
::
framework
::
Channel
<
DataItem
>
data_channel
)
{
data_channel
->
Open
();
const
int
file_list_size
=
file_list
.
size
();
std
::
atomic
<
bool
>
is_failed
(
false
);
...
...
paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc
浏览文件 @
6c6a7a14
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
#include "paddle/fluid/train/custom_trainer/feed/monitor/monitor.h"
#include "paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h"
...
...
@@ -94,20 +95,22 @@ paddle::framework::Channel<DataItem> MultiThreadExecutor::run(
[
this
,
parser
](
DataItem
*
item
,
size_t
item_num
,
ScopePoolObj
*
scope
,
size_t
*
scope_num
,
size_t
thread_idx
)
->
int
{
*
scope_num
=
1
;
paddle
::
platform
::
Timer
timer
;
timer
.
Start
();
auto
scope_obj
=
_scope_obj_pool
->
get
();
auto
*
samples
=
new
SampleInstance
[
item_num
];
auto
*
scope_context
=
new
ScopeExecutorContext
(
item_num
);
auto
*
samples
=
scope_context
->
samples
();
for
(
size_t
i
=
0
;
i
<
item_num
;
++
i
)
{
CHECK
(
parser
->
parse_to_sample
(
item
[
i
],
samples
[
i
])
==
0
);
}
for
(
size_t
i
=
0
;
i
<
_input_accessors
.
size
();
++
i
)
{
_input_accessors
[
i
]
->
forward
(
samples
,
item_num
,
scope_obj
.
get
());
}
int64_t
data_for_scope
=
(
int64_t
)
samples
;
timer
.
Pause
();
scope_context
->
prepare_cost_ms
=
timer
.
ElapsedMS
();
int64_t
data_for_scope
=
(
int64_t
)
scope_context
;
ScopeHelper
::
fill_value
(
scope_obj
.
get
(),
_trainer_context
->
cpu_place
,
"sample_data"
,
data_for_scope
);
data_for_scope
=
(
int64_t
)
item_num
;
ScopeHelper
::
fill_value
(
scope_obj
.
get
(),
_trainer_context
->
cpu_place
,
"sample_num"
,
data_for_scope
);
"scope_context"
,
data_for_scope
);
*
scope
=
std
::
move
(
scope_obj
);
return
0
;
});
...
...
@@ -123,7 +126,14 @@ paddle::framework::Channel<DataItem> MultiThreadExecutor::run(
auto
*
executor
=
_thread_executors
[
thread_idx
].
get
();
size_t
&
out_idx
=
*
out_num
;
for
(
out_idx
=
0
;
out_idx
<
in_num
;
++
out_idx
)
{
CHECK
(
executor
->
run
(
in_items
[
out_idx
].
get
())
==
0
);
auto
*
scope
=
in_items
[
out_idx
].
get
();
auto
*
scope_ctx
=
(
ScopeExecutorContext
*
)(
*
ScopeHelper
::
get_value
<
int64_t
>
(
scope
,
_trainer_context
->
cpu_place
,
"scope_context"
));
paddle
::
platform
::
Timer
timer
;
timer
.
Start
();
CHECK
(
executor
->
run
(
scope
)
==
0
);
timer
.
Pause
();
scope_ctx
->
executor_cost_ms
=
timer
.
ElapsedMS
();
out_items
[
out_idx
]
=
std
::
move
(
in_items
[
out_idx
]);
}
return
0
;
...
...
@@ -139,20 +149,24 @@ paddle::framework::Channel<DataItem> MultiThreadExecutor::run(
int
*
out_items
,
size_t
*
out_num
,
size_t
thread_idx
)
->
int
{
size_t
&
out_idx
=
*
out_num
;
for
(
out_idx
=
0
;
out_idx
<
in_num
;
++
out_idx
)
{
paddle
::
platform
::
Timer
timer
;
timer
.
Start
();
auto
*
scope
=
in_items
[
out_idx
].
get
();
auto
sample_num
=
*
ScopeHelper
::
get_value
<
int64_t
>
(
scope
,
_trainer_context
->
cpu_place
,
"sample_num"
);
auto
*
scope_ctx
=
(
ScopeExecutorContext
*
)(
*
ScopeHelper
::
get_value
<
int64_t
>
(
scope
,
_trainer_context
->
cpu_place
,
"scope_context"
));
auto
*
samples
=
scope_ctx
->
samples
();
auto
sample_num
=
scope_ctx
->
sample_num
();
auto
*
samples
=
(
SampleInstance
*
)(
*
ScopeHelper
::
get_value
<
int64_t
>
(
scope
,
_trainer_context
->
cpu_place
,
"sample_data"
));
for
(
size_t
i
=
0
;
i
<
_input_accessors
.
size
();
++
i
)
{
out_items
[
out_idx
]
=
_input_accessors
[
i
]
->
backward
(
samples
,
sample_num
,
scope
);
}
timer
.
Pause
();
scope_ctx
->
push_gradient_cost_ms
=
timer
.
ElapsedMS
();
for
(
auto
&
monitor
:
_monitors
)
{
monitor
->
add_data
(
epoch_id
,
this
,
s
amples
,
sample_num
);
monitor
->
add_data
(
epoch_id
,
this
,
s
cope_ctx
);
}
delete
[]
samples
;
// 所有pipe完成后,再回收sample
delete
scope_ctx
;
// 所有pipe完成后,再回收sample
}
return
0
;
});
...
...
@@ -167,6 +181,8 @@ paddle::framework::Channel<DataItem> MultiThreadExecutor::run(
monitor
->
compute_result
();
VLOG
(
2
)
<<
"[Monitor]"
<<
_train_exe_name
<<
", monitor:"
<<
monitor
->
get_name
()
<<
", result:"
<<
monitor
->
format_result
();
_trainer_context
->
monitor_ssm
<<
_train_exe_name
<<
":"
<<
monitor
->
get_name
()
<<
":"
<<
monitor
->
format_result
()
<<
","
;
monitor
->
reset
();
}
}
...
...
paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h
浏览文件 @
6c6a7a14
...
...
@@ -11,6 +11,29 @@ namespace feed {
class
Monitor
;
typedef
paddle
::
ps
::
ObjectPool
<::
paddle
::
framework
::
Scope
>::
PooledObject
ScopePoolObj
;
class
ScopeExecutorContext
{
public:
ScopeExecutorContext
(
size_t
sample_num
)
{
_samples
=
new
SampleInstance
[
sample_num
];
_sample_num
=
sample_num
;
}
virtual
~
ScopeExecutorContext
()
{
delete
[]
_samples
;
}
inline
SampleInstance
*
samples
()
{
return
_samples
;
}
inline
size_t
sample_num
()
{
return
_sample_num
;
}
size_t
executor_cost_ms
=
0
;
size_t
prepare_cost_ms
=
0
;
size_t
push_gradient_cost_ms
=
0
;
private:
size_t
_sample_num
=
0
;
SampleInstance
*
_samples
=
NULL
;
};
class
MultiThreadExecutor
{
public:
MultiThreadExecutor
()
{}
...
...
paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc
浏览文件 @
6c6a7a14
...
...
@@ -73,7 +73,7 @@ public:
}
shell_execute
(
string
::
format_string
(
"%s -rmr %s &>/dev/null; true"
,
_hdfs_command
.
c_str
(),
path
.
c_str
()));
"%s -rmr %s &>/dev/null; true"
,
hdfs_command
(
path
)
.
c_str
(),
path
.
c_str
()));
}
std
::
vector
<
std
::
string
>
list
(
const
std
::
string
&
path
)
override
{
...
...
paddle/fluid/train/custom_trainer/feed/io/shell.cc
浏览文件 @
6c6a7a14
...
...
@@ -356,6 +356,7 @@ std::string shell_get_command_output(const std::string& cmd) {
return
reader
.
get
();
}
}
VLOG
(
2
)
<<
"run shell cmd:"
<<
cmd
<<
", errno:"
<<
err_no
;
}
while
(
err_no
==
-
1
);
return
""
;
#endif
...
...
paddle/fluid/train/custom_trainer/feed/main.cc
浏览文件 @
6c6a7a14
...
...
@@ -32,6 +32,7 @@ int main(int argc, char* argv[]) {
}
auto
*
environment
=
trainer_context_ptr
->
environment
.
get
();
environment
->
wireup
();
VLOG
(
2
)
<<
"node_num: "
<<
environment
->
node_num
(
EnvironmentRole
::
ALL
);
if
(
environment
->
node_num
(
EnvironmentRole
::
ALL
)
==
1
)
{
environment
->
add_role
(
EnvironmentRole
::
WORKER
);
environment
->
add_role
(
EnvironmentRole
::
PSERVER
);
...
...
@@ -42,6 +43,7 @@ int main(int argc, char* argv[]) {
}
trainer_context_ptr
->
pslib
.
reset
(
new
PSlib
());
std
::
string
ps_config
=
config
[
"environment"
][
"ps"
].
as
<
std
::
string
>
();
trainer_context_ptr
->
environment
->
barrier
(
EnvironmentRole
::
ALL
);
trainer_context_ptr
->
pslib
->
initialize
(
ps_config
,
environment
);
//VLOG(3) << "Node Start With Role:" << role;
...
...
paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.cc
浏览文件 @
6c6a7a14
#include "paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h"
#include "paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h"
namespace
paddle
{
namespace
custom_trainer
{
...
...
@@ -19,7 +20,9 @@ int AucMonitor::initialize(const YAML::Node& config, std::shared_ptr<TrainerCont
}
void
AucMonitor
::
add_data
(
int
epoch_id
,
const
MultiThreadExecutor
*
executor
,
SampleInstance
*
samples
,
size_t
num
)
{
const
MultiThreadExecutor
*
executor
,
ScopeExecutorContext
*
ctx
)
{
auto
num
=
ctx
->
sample_num
();
auto
*
samples
=
ctx
->
samples
();
CHECK
(
num
>
0
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
_mutex
);
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
...
...
@@ -80,6 +83,10 @@ std::string AucMonitor::format_result() {
}
void
AucMonitor
::
add_unlocked
(
double
pred
,
int
label
)
{
if
(
std
::
isnan
(
pred
))
{
VLOG
(
2
)
<<
"pred["
<<
pred
<<
"] outside of [0,1]"
;
continue
;
}
CHECK
(
pred
>=
0
&&
pred
<=
1
)
<<
"pred["
<<
pred
<<
"] outside of [0,1]"
;
CHECK
(
label
==
0
||
label
==
1
)
<<
"label["
<<
label
<<
"] invalid"
;
_table
[
label
][
std
::
min
(
int
(
pred
*
_table_size
),
_table_size
-
1
)]
++
;
...
...
paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h
浏览文件 @
6c6a7a14
...
...
@@ -18,8 +18,8 @@ public:
std
::
shared_ptr
<
TrainerContext
>
context_ptr
)
override
;
//添加一项记录,统计内容Monitor自行从Executor按需获取
virtual
void
add_data
(
int
epoch_id
,
const
MultiThreadExecutor
*
executor
,
SampleInstance
*
samples
,
size_t
num
);
virtual
void
add_data
(
int
epoch_id
,
const
MultiThreadExecutor
*
executor
,
ScopeExecutorContext
*
);
//是否开始结果统计
virtual
bool
need_compute_result
(
int
epoch_id
);
...
...
paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.cc
浏览文件 @
6c6a7a14
#include "paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h"
#include "paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h"
namespace
paddle
{
namespace
custom_trainer
{
...
...
@@ -12,8 +13,9 @@ int CostMonitor::initialize(const YAML::Node& config, std::shared_ptr<TrainerCon
}
void
CostMonitor
::
add_data
(
int
epoch_id
,
const
MultiThreadExecutor
*
executor
,
SampleInstance
*
samples
,
size_t
num
)
{
const
MultiThreadExecutor
*
executor
,
ScopeExecutorContext
*
ctx
)
{
auto
num
=
ctx
->
sample_num
();
auto
*
samples
=
ctx
->
samples
();
CHECK
(
executor
!=
nullptr
);
//TODO use paddle time
_total_time_ms
+=
1
;
...
...
paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h
浏览文件 @
6c6a7a14
...
...
@@ -18,9 +18,7 @@ public:
//添加一项记录,统计内容Monitor自行从Executor按需获取
virtual
void
add_data
(
int
epoch_id
,
const
MultiThreadExecutor
*
executor
,
SampleInstance
*
samples
,
size_t
num
);
const
MultiThreadExecutor
*
executor
,
ScopeExecutorContext
*
);
//是否开始结果统计
virtual
bool
need_compute_result
(
int
epoch_id
);
...
...
paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
浏览文件 @
6c6a7a14
...
...
@@ -10,6 +10,7 @@ namespace paddle {
namespace
custom_trainer
{
namespace
feed
{
class
MultiThreadExecutor
;
class
ScopeExecutorContext
;
class
Monitor
{
public:
...
...
@@ -25,8 +26,8 @@ public:
}
//添加一项记录,统计内容Monitor自行从Executor按需获取
virtual
void
add_data
(
int
epoch_id
,
const
MultiThreadExecutor
*
executor
,
SampleInstance
*
samples
,
size_t
num
)
=
0
;
virtual
void
add_data
(
int
epoch_id
,
const
MultiThreadExecutor
*
executor
,
ScopeExecutorContext
*
)
=
0
;
//是否对于当前epoch_id进行结果统计
virtual
bool
need_compute_result
(
int
epoch_id
)
=
0
;
...
...
paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
浏览文件 @
6c6a7a14
...
...
@@ -3,6 +3,7 @@
*Train样本
*/
#include <omp.h>
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
#include "paddle/fluid/train/custom_trainer/feed/dataset/dataset.h"
#include "paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h"
...
...
@@ -25,26 +26,18 @@ int LearnerProcess::initialize(std::shared_ptr<TrainerContext> context_ptr) {
return
0
;
}
std
::
future
<
int
>
LearnerProcess
::
save_model
(
uint64_t
epoch_id
,
int
table_id
,
ModelSaveWay
way
)
{
std
::
promise
<
int
>
p
;
auto
ret
=
p
.
get_future
();
auto
*
ps_client
=
_context_ptr
->
pslib
->
ps_client
();
auto
*
epoch_accessor
=
_context_ptr
->
epoch_accessor
.
get
();
if
(
epoch_accessor
->
need_save_model
(
epoch_id
,
way
))
{
VLOG
(
2
)
<<
"Start save model, table_id:"
<<
table_id
;
auto
model_dir
=
epoch_accessor
->
model_save_path
(
epoch_id
,
way
);
return
ps_client
->
save
(
table_id
,
model_dir
,
std
::
to_string
((
int
)
way
));
}
else
{
p
.
set_value
(
0
);
}
return
ret
;
}
int
LearnerProcess
::
wait_save_model
(
uint64_t
epoch_id
,
ModelSaveWay
way
)
{
auto
*
ps_client
=
_context_ptr
->
pslib
->
ps_client
();
auto
*
environment
=
_context_ptr
->
environment
.
get
();
auto
*
epoch_accessor
=
_context_ptr
->
epoch_accessor
.
get
();
if
(
!
environment
->
is_master_node
(
EnvironmentRole
::
WORKER
))
{
return
0
;
}
if
(
!
epoch_accessor
->
need_save_model
(
epoch_id
,
way
))
{
return
0
;
}
paddle
::
platform
::
Timer
timer
;
timer
.
Start
();
std
::
set
<
uint32_t
>
table_set
;
for
(
auto
&
executor
:
_executors
)
{
const
auto
&
table_accessors
=
executor
->
table_accessors
();
...
...
@@ -56,13 +49,18 @@ int LearnerProcess::wait_save_model(uint64_t epoch_id, ModelSaveWay way) {
auto
table_num
=
table_set
.
size
();
std
::
future
<
int
>
rets
[
table_num
];
for
(
auto
table_id
:
table_set
)
{
rets
[
ret_size
++
]
=
save_model
(
epoch_id
,
table_id
,
way
);
VLOG
(
2
)
<<
"Start save model, table_id:"
<<
table_id
;
auto
model_dir
=
epoch_accessor
->
model_save_path
(
epoch_id
,
way
);
rets
[
ret_size
++
]
=
ps_client
->
save
(
table_id
,
model_dir
,
std
::
to_string
((
int
)
way
));
}
int
all_ret
=
0
;
for
(
int
i
=
0
;
i
<
ret_size
;
++
i
)
{
rets
[
i
].
wait
();
all_ret
|=
rets
[
i
].
get
();
}
timer
.
Pause
();
VLOG
(
2
)
<<
"Save Model Cost(s):"
<<
timer
.
ElapsedSec
();
_context_ptr
->
epoch_accessor
->
update_model_donefile
(
epoch_id
,
way
);
return
all_ret
;
}
...
...
@@ -115,6 +113,7 @@ int LearnerProcess::run() {
while
(
true
)
{
epoch_accessor
->
next_epoch
();
_context_ptr
->
monitor_ssm
.
str
(
""
);
bool
already_dump_inference_model
=
false
;
epoch_id
=
epoch_accessor
->
current_epoch_id
();
std
::
string
epoch_log_title
=
paddle
::
string
::
format_string
(
...
...
@@ -141,6 +140,8 @@ int LearnerProcess::run() {
std
::
map
<
std
::
string
,
paddle
::
framework
::
Channel
<
DataItem
>>
backup_input_map
;
for
(
auto
&
executor
:
_executors
)
{
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
paddle
::
platform
::
Timer
timer
;
timer
.
Start
();
VLOG
(
2
)
<<
"Start executor:"
<<
executor
->
train_exe_name
();
auto
data_name
=
executor
->
train_data_name
();
paddle
::
framework
::
Channel
<
DataItem
>
input_channel
;
...
...
@@ -150,12 +151,12 @@ int LearnerProcess::run() {
input_channel
=
dataset
->
fetch_data
(
data_name
,
epoch_id
);
}
input_channel
=
executor
->
run
(
input_channel
,
dataset
->
data_parser
(
data_name
));
VLOG
(
2
)
<<
"End executor:"
<<
executor
->
train_exe_name
();
timer
.
Pause
();
VLOG
(
2
)
<<
"End executor:"
<<
executor
->
train_exe_name
()
<<
", cost"
<<
timer
.
ElapsedSec
();
// 等待异步梯度完成
_context_ptr
->
ps_client
()
->
flush
();
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
if
(
executor
->
is_dump_all_model
())
{
already_dump_inference_model
=
true
;
wait_save_model
(
epoch_id
,
ModelSaveWay
::
ModelSaveInferenceDelta
);
...
...
@@ -167,16 +168,12 @@ int LearnerProcess::run() {
//Step3. Dump Model For Delta&&Checkpoint
{
if
(
!
already_dump_inference_model
)
{
already_dump_inference_model
=
true
;
wait_save_model
(
epoch_id
,
ModelSaveWay
::
ModelSaveInferenceDelta
);
}
wait_save_model
(
epoch_id
,
ModelSaveWay
::
ModelSaveInferenceBase
);
wait_save_model
(
epoch_id
,
ModelSaveWay
::
ModelSaveTrainCheckpoint
);
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
epoch_accessor
->
epoch_done
(
epoch_id
);
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
}
//Step4. Output Monitor && RunStatus
...
...
paddle/fluid/train/custom_trainer/feed/process/learner_process.h
浏览文件 @
6c6a7a14
...
...
@@ -22,8 +22,6 @@ protected:
virtual
int
load_model
(
uint64_t
epoch_id
);
// 同步保存所有模型
virtual
int
wait_save_model
(
uint64_t
epoch_id
,
ModelSaveWay
way
);
// 异步保存指定模型
virtual
std
::
future
<
int
>
save_model
(
uint64_t
epoch_id
,
int
table_id
,
ModelSaveWay
way
);
private:
std
::
vector
<
std
::
shared_ptr
<
MultiThreadExecutor
>>
_executors
;
...
...
paddle/fluid/train/custom_trainer/feed/scripts/compake_runable_package.sh
0 → 100755
浏览文件 @
6c6a7a14
#!/bin/bash
#用于运行期的hadoop访问
TRAINER_HODOOP_HOME
=
""
#用于跟据网络脚本生成模型
TRAINER_PYTHON_HOME
=
"/home/xiexionghang/paddle/py-paddle/"
#环境准备
if
[
!
-f
${
TRAINER_PYTHON_HOME
}
/python/bin/paddle
]
;
then
echo
"Miss File:
${
TRAINER_PYTHON_HOME
}
/python/bin/paddle"
echo
"TRAINER_PYTHON_HOME:
${
TRAINER_PYTHON_HOME
}
is invalid, Fix it, or Get From here:"
echo
"wget ftp://cp01-arch-gr06.epc.baidu.com/home/xiexionghang/paddle/py-paddle.tar.gz"
echo
"Then set TRAINER_PYTHON_HOME"
exit
0
fi
TRAINER_PYTHON_BIN
=
${
TRAINER_PYTHON_HOME
}
/python/bin/python
# for bad paddle 这里需要想办法解决,paddle的前置目录太多
if
[
!
-f
../../../third_party/install/pslib/lib/libps.so
]
;
then
mkdir
-p
../../../third_party/install/pslib/lib/
ln
-s
${
TRAINER_PYTHON_HOME
}
/third_party/install/pslib/lib/libps.so ../../../third_party/install/pslib/lib/libps.so
fi
#生成模型配置
#这里按名匹配 可能会出现匹配错误&兼容性差的问题,最好是先python解析yaml文件
items
=
`
grep
" name:"
conf/trainer.yaml |
awk
-F
':'
'{print $2}'
|awk
'{sub("^ *","");sub(" *$","");print}'
`
for
item
in
${
items
[@]
}
;
do
if
[
!
-f
scripts/
${
item
}
.py
]
;
then
echo
"Missing model_net config: scripts/
${
item
}
.py, skip it
$item
"
continue
fi
rm
-rf
model/
$item
${
TRAINER_PYTHON_BIN
}
scripts/create_programs.py scripts/
${
item
}
.py
if
[
$?
-ne
0
]
;
then
echo
"Create model with scripts/
${
item
}
.py failed"
exit
1
fi
done
#输出package包
rm
-rf
package
mkdir
package
cp
-r
bin conf tool scripts model so package
cp
-r
${
TRAINER_HODOOP_HOME
}
package/hadoop-client
paddle/fluid/train/custom_trainer/feed/scripts/join.py
浏览文件 @
6c6a7a14
...
...
@@ -26,22 +26,33 @@ def inference():
# TODO: build network here
cvm_input
=
fluid
.
layers
.
data
(
name
=
'cvm_input'
,
shape
=
[
sparse_cvm_dim
(
sparse_cvm
)],
dtype
=
'float32'
,
stop_gradient
=
False
)
net
=
cvm_input
net
=
fluid
.
layers
.
data_norm
(
input
=
net
,
name
=
"bn6048"
,
epsilon
=
1e-4
,
param_attr
=
{
"batch_size"
:
1e4
,
"batch_sum_default"
:
0.0
,
"batch_square"
:
1e4
})
net
=
fluid
.
layers
.
fc
(
net
,
511
,
act
=
'relu'
,
name
=
'fc_1'
)
net
=
fluid
.
layers
.
fc
(
net
,
255
,
act
=
'relu'
,
name
=
'fc_2'
)
net
=
fluid
.
layers
.
fc
(
net
,
255
,
act
=
'relu'
,
name
=
'fc_3'
)
net
=
fluid
.
layers
.
fc
(
net
,
127
,
act
=
'relu'
,
name
=
'fc_4'
)
net
=
fluid
.
layers
.
fc
(
net
,
127
,
act
=
'relu'
,
name
=
'fc_5'
)
net
=
fluid
.
layers
.
fc
(
net
,
127
,
act
=
'relu'
,
name
=
'fc_6'
)
net
=
fluid
.
layers
.
fc
(
net
,
127
,
act
=
'relu'
,
name
=
'fc_7'
)
lr_x
=
1.0
init_range
=
0.2
fc_layers_size
=
[
511
,
255
,
255
,
127
,
127
,
127
,
127
]
fc_layers_act
=
[
"relu"
]
*
len
(
fc_layers_size
)
scales_tmp
=
[
net
.
shape
[
1
]]
+
fc_layers_size
scales
=
[]
for
i
in
range
(
len
(
scales_tmp
)):
scales
.
append
(
init_range
/
(
scales_tmp
[
i
]
**
0.5
))
for
i
in
range
(
len
(
fc_layers_size
)):
net
=
fluid
.
layers
.
fc
(
input
=
net
,
size
=
fc_layers_size
[
i
],
name
=
'fc_'
+
str
(
i
+
1
),
act
=
fc_layers_act
[
i
],
param_attr
=
\
fluid
.
ParamAttr
(
learning_rate
=
lr_x
,
\
initializer
=
fluid
.
initializer
.
NormalInitializer
(
loc
=
0.0
,
scale
=
1.0
*
scales
[
i
])),
bias_attr
=
\
fluid
.
ParamAttr
(
learning_rate
=
lr_x
,
\
initializer
=
fluid
.
initializer
.
NormalInitializer
(
loc
=
0.0
,
scale
=
1.0
*
scales
[
i
])))
ctr_output
=
fluid
.
layers
.
fc
(
net
,
1
,
act
=
'sigmoid'
,
name
=
'ctr'
)
accessors
=
[
{
"class"
:
"AbacusSparse
Update
Accessor"
,
"input"
:
"sparses"
,
"table_id"
:
0
,
"need_gradient"
:
False
},
{
"class"
:
"AbacusSparse
Join
Accessor"
,
"input"
:
"sparses"
,
"table_id"
:
0
,
"need_gradient"
:
False
},
{
"class"
:
"DenseInputAccessor"
,
"input"
:
"vars"
,
"table_id"
:
1
,
"need_gradient"
:
True
,
"async_pull"
:
True
},
{
"class"
:
"DenseInputAccessor"
,
"input"
:
"sums"
,
"table_id"
:
2
,
"need_gradient"
:
True
,
"async_pull"
:
True
},
{
"class"
:
"LabelInputAccessor"
,
"input"
:
"labels"
}
...
...
paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program
浏览文件 @
6c6a7a14
无法预览此类型文件
paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml
浏览文件 @
6c6a7a14
aa_Attention
:
Do Not Modify This File Manually, Unless You Really Know It
input_accessor
:
-
class
:
AbacusSparse
Update
Accessor
-
class
:
AbacusSparse
Join
Accessor
input
:
-
name
:
cvm_input
slot_dim
:
11
...
...
paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program
浏览文件 @
6c6a7a14
无法预览此类型文件
paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program
浏览文件 @
6c6a7a14
无法预览此类型文件
paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program
浏览文件 @
6c6a7a14
无法预览此类型文件
paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program
浏览文件 @
6c6a7a14
无法预览此类型文件
paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program
浏览文件 @
6c6a7a14
无法预览此类型文件
paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh
浏览文件 @
6c6a7a14
#!/bin/bash
export
LD_LIBRARY_PATH
=
LD_LIBRARY_PATH:./so
./bin/feed_trainer
"
$@
"
BIN_FILE
=
feed_trainer
work_dir
=
`
pwd
`
function
usage
()
{
echo
-e
"
\0
33[41mUSAGE: sh scripts/start_feed_trainer.sh [run_mode]
\0
33[0m"
echo
"run_mode=mpi, run job in mpi cluster"
echo
"run_mode=mpi_tmp, run 1 node with mpi in /tmp"
echo
"run_mode=local, run 1 node in local"
echo
"Example: sh scripts/start_feed_trainer.sh local"
exit
0
}
if
[
$#
-lt
1
]
;
then
run_mode
=
"mpi"
else
run_mode
=
"
$1
"
fi
export
PATH
=
/usr/local/openmpi/bin:
$PATH
export
LD_LIBRARY_PATH
=
$LD_LIBRARY_PATH
:/usr/local/openmpi/lib/
if
[
"
${
run_mode
}
"
=
"mpi"
]
;
then
mpirun
mv
package/
*
.
mpirun
mkdir
-p
log
export
HADOOP_HOME
=
"./hadoop-client/hadoop"
export
PATH
=
$HADOOP_HOME
/bin/:./bin:
$PATH
export
LD_LIBRARY_PATH
=
$LD_LIBRARY_PATH
:./so
mpirun
sed
-i
's/LocalRuntimeEnvironment/MPIRuntimeEnvironment/g'
conf/
*
.yaml
export
HADOOP_HOME
=
"./hadoop-client/hadoop"
export
PATH
=
$HADOOP_HOME
/bin/:/bin:
$PATH
export
LD_LIBRARY_PATH
=
$LD_LIBRARY_PATH
:./so
GLOG_logtostderr
=
0 mpirun
-npernode
2
-timestamp-output
-tag-output
--prefix
$work_dir
./bin/feed_trainer
--log_dir
=
log
elif
[
"
${
run_mode
}
"
=
"mpi_tmp"
]
;
then
mv
package/
*
.
mkdir
temp
export
HADOOP_HOME
=
"
$work_dir
/hadoop-client/hadoop"
export
PATH
=
$HADOOP_HOME
/bin/:/bin:
$PATH
export
LD_LIBRARY_PATH
=
$LD_LIBRARY_PATH
:
${
work_dir
}
/so
sed
-i
's/LocalRuntimeEnvironment/MPIRuntimeEnvironment/g'
conf/
*
.yaml
mpirun
-npernode
2
-timestamp-output
-tag-output
--prefix
$work_dir
--mca
orte_tmpdir_base
${
work_dir
}
/temp scripts/start_feed_trainer.sh random_log
elif
[
"
${
run_mode
}
"
=
"local"
]
;
then
sed
-i
's/MPIRuntimeEnvironment/LocalRuntimeEnvironment/g'
conf/
*
.yaml
export
LD_LIBRARY_PATH
=
$LD_LIBRARY_PATH
:
${
work_dir
}
/so
mkdir
log
./bin/feed_trainer
--log_dir
=
log
elif
[
"
${
run_mode
}
"
=
"random_log"
]
;
then
log_dir
=
"log/log.
${
RANDOM
}
"
./bin/feed_trainer
--log_dir
=
log
else
usage
fi
paddle/fluid/train/custom_trainer/feed/scripts/submit_mpi.sh
0 → 100755
浏览文件 @
6c6a7a14
#!/bin/bash
export
PATH
=
/bin/:
$PATH
set
-x
source
conf/env.conf
echo
"# This file is automatically generated. Don't change it."
>
conf/qsub_f.conf
echo
"SERVER=
$MPI_SERVER
"
>>
conf/qsub_f.conf
echo
"QUEUE=
$MPI_QUEUE
"
>>
conf/qsub_f.conf
echo
"PRIORITY=
$MPI_PRIORITY
"
>>
conf/qsub_f.conf
export
HADOOP_HOME
=
$HADOOP_HOME
sh scripts/compake_runable_package.sh
$HPC_HOME
/bin/qsub_f
\
-N
$MPI_JOB_NAME
\
--conf
conf/qsub_f.conf
\
--hdfs
$HADOOP_FS
\
--ugi
$HADOOP_UGI
\
--hout
$HDFS_ROOT
\
--files
package
\
-l
nodes
=
$MPI_NODE_NUM
,walltime
=
$MPI_WALL_TIME
,pmem-hard
=
$MPI_NODE_MEM
,pcpu-soft
=
180,pnetin-soft
=
1000,pnetout-soft
=
1000
\
scripts/start_feed_trainer.sh
if
[
$?
-ne
0
]
;
then
echo
-e
"qsub_f failed, please check the config or get help from abacus RD
\n
"
exit
-1
fi
exit
0
paddle/fluid/train/custom_trainer/feed/scripts/update.py
浏览文件 @
6c6a7a14
...
...
@@ -25,11 +25,26 @@ def inference():
cvm_input
=
fluid
.
layers
.
data
(
name
=
'cvm_input'
,
shape
=
[
sparse_cvm_dim
(
sparse_cvm
)],
dtype
=
'float32'
,
stop_gradient
=
False
)
net
=
cvm_input
net
=
fluid
.
layers
.
fc
(
net
,
511
,
act
=
'relu'
,
name
=
'fc_1'
)
net
=
fluid
.
layers
.
fc
(
net
,
255
,
act
=
'relu'
,
name
=
'fc_2'
)
net
=
fluid
.
layers
.
fc
(
net
,
127
,
act
=
'relu'
,
name
=
'fc_3'
)
net
=
fluid
.
layers
.
fc
(
net
,
127
,
act
=
'relu'
,
name
=
'fc_4'
)
net
=
fluid
.
layers
.
fc
(
net
,
127
,
act
=
'relu'
,
name
=
'fc_5'
)
lr_x
=
1.0
init_range
=
0.2
fc_layers_size
=
[
511
,
255
,
127
,
127
,
127
]
fc_layers_act
=
[
"relu"
]
*
len
(
fc_layers_size
)
scales_tmp
=
[
net
.
shape
[
1
]]
+
fc_layers_size
scales
=
[]
for
i
in
range
(
len
(
scales_tmp
)):
scales
.
append
(
init_range
/
(
scales_tmp
[
i
]
**
0.5
))
for
i
in
range
(
len
(
fc_layers_size
)):
net
=
fluid
.
layers
.
fc
(
input
=
net
,
size
=
fc_layers_size
[
i
],
name
=
'fc_'
+
str
(
i
+
1
),
act
=
fc_layers_act
[
i
],
param_attr
=
\
fluid
.
ParamAttr
(
learning_rate
=
lr_x
,
\
initializer
=
fluid
.
initializer
.
NormalInitializer
(
loc
=
0.0
,
scale
=
1.0
*
scales
[
i
])),
bias_attr
=
\
fluid
.
ParamAttr
(
learning_rate
=
lr_x
,
\
initializer
=
fluid
.
initializer
.
NormalInitializer
(
loc
=
0.0
,
scale
=
1.0
*
scales
[
i
])))
ctr_output
=
fluid
.
layers
.
fc
(
net
,
1
,
act
=
'sigmoid'
,
name
=
'ctr'
)
...
...
paddle/fluid/train/custom_trainer/feed/trainer_context.h
浏览文件 @
6c6a7a14
...
...
@@ -2,6 +2,7 @@
#include <string>
#include <memory>
#include <vector>
#include <sstream>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h"
#include "paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h"
...
...
@@ -48,6 +49,7 @@ public:
paddle
::
platform
::
CPUPlace
cpu_place
;
std
::
shared_ptr
<
PSlib
>
pslib
;
std
::
stringstream
monitor_ssm
;
//记录monitor信息
std
::
shared_ptr
<
Dataset
>
dataset
;
//训练样本
std
::
shared_ptr
<
FileSystem
>
file_system
;
//文件操作辅助类
std
::
shared_ptr
<
EpochAccessor
>
epoch_accessor
;
//训练轮次控制
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录