Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
38175506
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
38175506
编写于
9月 18, 2019
作者:
X
xiexionghang
浏览文件
操作
浏览文件
下载
差异文件
fix shuffler bug
上级
b23282f3
9552cf55
变更
6
显示空白变更内容
内联
并排
Showing
6 changed file
with
37 addition
and
32 deletion
+37
-32
paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc
...d/train/custom_trainer/feed/common/runtime_environment.cc
+6
-1
paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
...id/train/custom_trainer/feed/common/runtime_environment.h
+10
-10
paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc
...ain/custom_trainer/feed/executor/multi_thread_executor.cc
+3
-3
paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h
...le/fluid/train/custom_trainer/feed/monitor/cost_monitor.h
+1
-1
paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
+1
-0
paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
...luid/train/custom_trainer/feed/process/learner_process.cc
+16
-17
未找到文件。
paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc
100644 → 100755
浏览文件 @
38175506
...
...
@@ -161,7 +161,12 @@ public:
protected:
virtual
void
print_log
(
EnvironmentRole
role
,
EnvironmentLogType
type
,
EnvironmentLogLevel
level
,
const
std
::
string
&
log_str
)
{
if
(
type
==
EnvironmentLogType
::
MASTER_LOG
&&
!
is_master_node
(
role
))
{
if
(
type
==
EnvironmentLogType
::
MASTER_LOG
)
{
if
(
is_master_node
(
role
))
{
fprintf
(
stdout
,
log_str
.
c_str
());
fprintf
(
stdout
,
"
\n
"
);
fflush
(
stdout
);
}
return
;
}
VLOG
(
static_cast
<
int
>
(
level
))
<<
log_str
;
...
...
paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
100644 → 100755
浏览文件 @
38175506
...
...
@@ -75,8 +75,8 @@ public:
// 环境定制化log
template
<
class
...
ARGS
>
void
log
(
EnvironmentRole
role
,
EnvironmentLogType
type
,
EnvironmentLogLevel
level
,
const
char
*
fmt
,
ARGS
&&
...
args
)
{
print_log
(
role
,
type
,
level
,
paddle
::
string
::
format_string
(
fmt
,
args
...));
EnvironmentLogLevel
level
,
ARGS
&&
...
args
)
{
print_log
(
role
,
type
,
level
,
paddle
::
string
::
format_string
(
args
...));
}
// 多线程可调用接口 End
...
...
@@ -106,14 +106,14 @@ protected:
};
REGIST_REGISTERER
(
RuntimeEnvironment
);
#define ENVLOG_WORKER_ALL_NOTICE \
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::ALL_LOG, EnvironmentLog
Type::NOTICE,
#define ENVLOG_WORKER_MASTER_NOTICE \
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLog
Type::NOTICE,
#define ENVLOG_WORKER_ALL_ERROR \
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::ALL_LOG, EnvironmentLog
Type::ERROR,
#define ENVLOG_WORKER_MASTER_ERROR \
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLog
Type::ERROR,
#define ENVLOG_WORKER_ALL_NOTICE
(...)
\
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::ALL_LOG, EnvironmentLog
Level::NOTICE, __VA_ARGS__);
#define ENVLOG_WORKER_MASTER_NOTICE
(...)
\
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLog
Level::NOTICE, __VA_ARGS__);
#define ENVLOG_WORKER_ALL_ERROR
(...)
\
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::ALL_LOG, EnvironmentLog
Level::ERROR, __VA_ARGS__);
#define ENVLOG_WORKER_MASTER_ERROR
(...)
\
environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLog
Level::ERROR, __VA_ARGS__);
std
::
string
format_timestamp
(
time_t
time
,
const
char
*
format
);
inline
std
::
string
format_timestamp
(
time_t
time
,
const
std
::
string
&
format
)
{
...
...
paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc
100644 → 100755
浏览文件 @
38175506
...
...
@@ -137,7 +137,7 @@ paddle::framework::Channel<DataItem> MultiThreadExecutor::run(
paddle
::
framework
::
Channel
<
DataItem
>
input
,
const
DataParser
*
parser
)
{
uint64_t
epoch_id
=
_trainer_context
->
epoch_accessor
->
current_epoch_id
();
auto
*
environment
=
_trainer_context
->
environment
.
get
();
// 输入流
PipelineOptions
input_pipe_option
;
input_pipe_option
.
need_hold_input_data
=
true
;
...
...
@@ -243,8 +243,8 @@ paddle::framework::Channel<DataItem> MultiThreadExecutor::run(
for
(
auto
&
monitor
:
_monitors
)
{
if
(
monitor
->
need_compute_result
(
epoch_id
))
{
monitor
->
compute_result
();
VLOG
(
2
)
<<
"[Monitor]"
<<
_train_exe_name
<<
", monitor:"
<<
monitor
->
get_name
()
<<
", result:"
<<
monitor
->
format_result
(
);
ENVLOG_WORKER_MASTER_NOTICE
(
"[Monitor]%s, monitor:%s, result:%s"
,
_train_exe_name
.
c_str
(),
monitor
->
get_name
().
c_str
(),
monitor
->
format_result
().
c_str
()
);
_trainer_context
->
monitor_ssm
<<
_train_exe_name
<<
":"
<<
monitor
->
get_name
()
<<
":"
<<
monitor
->
format_result
()
<<
","
;
monitor
->
reset
();
...
...
paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h
100644 → 100755
浏览文件 @
38175506
paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
100644 → 100755
浏览文件 @
38175506
...
...
@@ -5,6 +5,7 @@
#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
#include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h"
#include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h"
namespace
paddle
{
namespace
custom_trainer
{
...
...
paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
浏览文件 @
38175506
...
...
@@ -155,10 +155,11 @@ int LearnerProcess::load_model(uint64_t epoch_id) {
if
(
!
environment
->
is_master_node
(
EnvironmentRole
::
WORKER
))
{
return
0
;
}
VLOG
(
2
)
<<
"Start Load Model"
;
auto
*
fs
=
_context_ptr
->
file_system
.
get
();
std
::
set
<
uint32_t
>
loaded_table_set
;
auto
model_dir
=
_context_ptr
->
epoch_accessor
->
checkpoint_path
();
paddle
::
platform
::
Timer
timer
;
timer
.
Start
();
for
(
auto
&
executor
:
_executors
)
{
const
auto
&
table_accessors
=
executor
->
table_accessors
();
for
(
auto
&
itr
:
table_accessors
)
{
...
...
@@ -172,6 +173,7 @@ int LearnerProcess::load_model(uint64_t epoch_id) {
auto
scope
=
std
::
move
(
executor
->
fetch_scope
());
CHECK
(
itr
.
second
[
0
]
->
create
(
scope
.
get
())
==
0
);
}
else
{
ENVLOG_WORKER_MASTER_NOTICE
(
"Loading model %s"
,
model_dir
.
c_str
());
auto
status
=
_context_ptr
->
ps_client
()
->
load
(
itr
.
first
,
model_dir
,
std
::
to_string
((
int
)
ModelSaveWay
::
ModelSaveTrainCheckpoint
));
CHECK
(
status
.
get
()
==
0
)
<<
"table load failed, id:"
<<
itr
.
first
;
...
...
@@ -179,7 +181,8 @@ int LearnerProcess::load_model(uint64_t epoch_id) {
loaded_table_set
.
insert
(
itr
.
first
);
}
}
VLOG
(
2
)
<<
"Finish Load Model"
;
timer
.
Pause
();
ENVLOG_WORKER_MASTER_NOTICE
(
"Finished loading model, cost:%f"
,
timer
.
ElapsedSec
());
return
0
;
}
...
...
@@ -189,9 +192,7 @@ int LearnerProcess::run() {
auto
*
epoch_accessor
=
_context_ptr
->
epoch_accessor
.
get
();
uint64_t
epoch_id
=
epoch_accessor
->
current_epoch_id
();
environment
->
log
(
EnvironmentRole
::
WORKER
,
EnvironmentLogType
::
MASTER_LOG
,
EnvironmentLogLevel
::
NOTICE
,
"Resume train with epoch_id:%d %s"
,
epoch_id
,
_context_ptr
->
epoch_accessor
->
text
(
epoch_id
).
c_str
());
ENVLOG_WORKER_MASTER_NOTICE
(
"Resume train with epoch_id:%d %s"
,
epoch_id
,
_context_ptr
->
epoch_accessor
->
text
(
epoch_id
).
c_str
());
//尝试加载模型 or 初始化
CHECK
(
load_model
(
epoch_id
)
==
0
);
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
...
...
@@ -208,19 +209,16 @@ int LearnerProcess::run() {
std
::
string
epoch_log_title
=
paddle
::
string
::
format_string
(
"train epoch_id:%d label:%s"
,
epoch_id
,
epoch_accessor
->
text
(
epoch_id
).
c_str
());
std
::
string
data_path
=
paddle
::
string
::
to_string
<
std
::
string
>
(
dataset
->
epoch_data_path
(
epoch_id
));
ENVLOG_WORKER_MASTER_NOTICE
(
" ==== begin %s ===="
,
epoch_accessor
->
text
(
epoch_id
).
c_str
());
//Step1. 等待样本ready
{
environment
->
log
(
EnvironmentRole
::
WORKER
,
EnvironmentLogType
::
MASTER_LOG
,
EnvironmentLogLevel
::
NOTICE
,
"%s, wait data ready:%s"
,
epoch_log_title
.
c_str
(),
data_path
.
c_str
());
ENVLOG_WORKER_MASTER_NOTICE
(
" %s, wait data ready:%s"
,
epoch_log_title
.
c_str
(),
data_path
.
c_str
());
while
(
dataset
->
epoch_data_status
(
epoch_id
)
!=
DatasetStatus
::
Ready
)
{
sleep
(
30
);
dataset
->
pre_detect_data
(
epoch_id
);
environment
->
log
(
EnvironmentRole
::
WORKER
,
EnvironmentLogType
::
MASTER_LOG
,
EnvironmentLogLevel
::
NOTICE
,
"data not ready, wait 30s"
);
ENVLOG_WORKER_MASTER_NOTICE
(
" epoch_id:%d data not ready, wait 30s"
,
epoch_id
);
}
environment
->
log
(
EnvironmentRole
::
WORKER
,
EnvironmentLogType
::
MASTER_LOG
,
EnvironmentLogLevel
::
NOTICE
,
"Start %s, data is ready"
,
epoch_log_title
.
c_str
());
ENVLOG_WORKER_MASTER_NOTICE
(
" Start %s, data is ready"
,
epoch_log_title
.
c_str
());
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
}
...
...
@@ -232,7 +230,7 @@ int LearnerProcess::run() {
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
paddle
::
platform
::
Timer
timer
;
timer
.
Start
();
VLOG
(
2
)
<<
"Start executor:"
<<
executor
->
train_exe_name
(
);
ENVLOG_WORKER_MASTER_NOTICE
(
"Start executor:%s"
,
executor
->
train_exe_name
().
c_str
()
);
auto
data_name
=
executor
->
train_data_name
();
paddle
::
framework
::
Channel
<
DataItem
>
input_channel
;
if
(
backup_input_map
.
count
(
data_name
))
{
...
...
@@ -242,7 +240,7 @@ int LearnerProcess::run() {
}
input_channel
=
executor
->
run
(
input_channel
,
dataset
->
data_parser
(
data_name
));
timer
.
Pause
();
VLOG
(
2
)
<<
"End executor:"
<<
executor
->
train_exe_name
()
<<
", cost:"
<<
timer
.
ElapsedSec
(
);
ENVLOG_WORKER_MASTER_NOTICE
(
"End executor:%s, cost:%f"
,
executor
->
train_exe_name
().
c_str
(),
timer
.
ElapsedSec
()
);
// 等待异步梯度完成
_context_ptr
->
ps_client
()
->
flush
();
...
...
@@ -273,21 +271,22 @@ int LearnerProcess::run() {
environment
->
is_master_node
(
EnvironmentRole
::
WORKER
))
{
paddle
::
platform
::
Timer
timer
;
timer
.
Start
();
VLOG
(
2
)
<<
"Start shrink table"
;
ENVLOG_WORKER_MASTER_NOTICE
(
"Start shrink table"
);
for
(
auto
&
executor
:
_executors
)
{
const
auto
&
table_accessors
=
executor
->
table_accessors
();
for
(
auto
&
itr
:
table_accessors
)
{
CHECK
(
itr
.
second
[
0
]
->
shrink
()
==
0
);
}
}
VLOG
(
2
)
<<
"End shrink table, cost:"
<<
timer
.
ElapsedSec
();
timer
.
Pause
();
ENVLOG_WORKER_MASTER_NOTICE
(
"End shrink table, cost:%f"
,
timer
.
ElapsedSec
());
}
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
epoch_accessor
->
epoch_done
(
epoch_id
);
environment
->
barrier
(
EnvironmentRole
::
WORKER
);
}
ENVLOG_WORKER_MASTER_NOTICE
(
" ==== end %s ===="
,
epoch_accessor
->
text
(
epoch_id
).
c_str
());
//Step4. Output Monitor && RunStatus
//TODO
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录