Commit 9042cb45
Authored Mar 06, 2020 by xiexionghang
Parent: 61b1fd00

depend on paddle with bcloud

Showing 4 changed files with 184 additions and 50 deletions:

- kagle/kagle_model.py (+8 -6)
- kagle/kagle_util.py (+106 -3)
- kagle/trainer/abacus_trainer.py (+69 -40)
- kagle/trainer/kagle_trainer.py (+1 -1)
kagle/kagle_model.py (+8 -6)

The visible changes re-wrap two long statements with backslash continuations; the other touched lines appear as unchanged context. After the commit the hunks read:

@@ -77,6 +77,7 @@ class Model(object):

        """R
        """
        pass

    @abc.abstractmethod
    def dump_inference_program(self, inference_layer, path):
        """R

@@ -101,7 +102,8 @@ class Model(object):

            if node['name'] not in self._inference_meta['dependency'][layer]:
                continue
            if 'inference_param' in self._build_param['layer_extend'][node['name']]:
                self._inference_meta['params'][layer] += \
                    self._build_param['layer_extend'][node['name']]['inference_param']['params']
        return self._inference_meta['params'][layer]

    def get_dependency(self, layer_graph, dest_layer):

@@ -192,10 +194,10 @@ class FluidModel(Model):

        metrics = params['metrics']
        for name in metrics:
            model_metrics = metrics[name]
            stat_var_names += [model_metrics[metric]['var'].name for metric in model_metrics]
        strategy['stat_var_names'] = list(set(stat_var_names))
        optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \
            '(learning_rate=' + str(optimizer_conf['learning_rate']) + ')'
        exec(optimizer_generator)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        return optimizer

@@ -233,12 +235,12 @@ class FluidModel(Model):

            fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, table['params'])
        for infernce_item in params['inference_list']:
            params_name_list = self.inference_params(infernce_item['layer_name'])
            params_var_list = [program.global_block().var(i) for i in params_name_list]
            params_file_name = infernce_item['save_file_name']
            with fluid.scope_guard(scope):
                if params['save_combine']:
                    fluid.io.save_vars(executor, "./", \
                        program, vars=params_var_list, filename=params_file_name)
                else:
                    fluid.io.save_vars(executor, params_file_name, program, vars=params_var_list)
        pass
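The inference_params hunk walks the layers a target inference layer depends on and collects their parameter names from _build_param['layer_extend']. A small self-contained sketch of that accumulation over hypothetical data (layer names and parameter names are invented):

```python
# Hypothetical structures standing in for Model._build_param / _inference_meta.
build_param = {'layer_extend': {
    'emb':  {'inference_param': {'params': ['emb.w_0']}},
    'fc1':  {'inference_param': {'params': ['fc1.w_0', 'fc1.b_0']}},
    'loss': {},                                   # no inference params for this layer
}}
inference_meta = {'dependency': {'ctr_output': ['emb', 'fc1']},
                  'params': {'ctr_output': []}}

layer = 'ctr_output'
for node in [{'name': 'emb'}, {'name': 'fc1'}, {'name': 'loss'}]:
    if node['name'] not in inference_meta['dependency'][layer]:
        continue
    if 'inference_param' in build_param['layer_extend'][node['name']]:
        inference_meta['params'][layer] += \
            build_param['layer_extend'][node['name']]['inference_param']['params']

print(inference_meta['params'][layer])   # ['emb.w_0', 'fc1.w_0', 'fc1.b_0']
```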
kagle/kagle_util.py (+106 -3)

Most of the additions are docstrings; the one import change swaps the bare "import kagle_fs" for the package-qualified "import kagle.kagle_fs". The file now begins:

"""
Util lib
"""
import os
import sys
import time
import datetime
import kagle.kagle_fs
import numpy as np
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

def get_env_value(env_name):
    """
    get os environment value
    """
    return os.popen("echo -n ${" + env_name + "}").read().strip()

def now_time_str():
    """
    get current format str_time
    """
    return "\n" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "[0]:"

def get_absolute_path(path, params):
    """R
    """
    if path.startswith('afs:') or path.startswith('hdfs:'):
        sub_path = path.split('fs:')[1]
        if ':' in sub_path:  # such as afs://xxx:prot/xxxx

@@ -23,6 +34,14 @@ def get_absolute_path(path, params):

    return path

def make_datetime(date_str, fmt=None):
    """
    create a datetime instance by date_string
    Args:
        date_str: such as 2020-01-14
        date_str_format: "%Y-%m-%d"
    Return:
        datetime
    """
    if fmt is None:
        if len(date_str) == 8:  # %Y%m%d
            return datetime.datetime.strptime(date_str, '%Y%m%d')
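Only the 8-character branch of make_datetime is visible in this hunk; a %Y%m%d%H%M branch is assumed from how init_pass_by_time calls it later in the file. A minimal standalone sketch of that length-based format selection:

```python
import datetime

def make_datetime_sketch(date_str, fmt=None):
    # Hypothetical mirror of make_datetime(); the non-8-character branch is assumed.
    if fmt is None:
        if len(date_str) == 8:        # e.g. "20200110"
            fmt = '%Y%m%d'
        else:                         # e.g. "202001100030" (assumed %Y%m%d%H%M)
            fmt = '%Y%m%d%H%M'
    return datetime.datetime.strptime(date_str, fmt)

print(make_datetime_sketch("20200110"))       # 2020-01-10 00:00:00
print(make_datetime_sketch("202001100030"))   # 2020-01-10 00:30:00
```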
@@ -32,28 +51,51 @@ def make_datetime(date_str, fmt = None):

The worker aggregation helpers gain docstrings; their bodies are unchanged:

def wroker_numric_opt(value, opt):
    """
    numric count opt for workers
    Args:
        value: value for count
        opt: count operator, SUM/MAX/MIN/AVG
    Return:
        count result
    """
    local_value = np.array([value])
    global_value = np.copy(local_value) * 0
    fleet._role_maker._node_type_comm.Allreduce(local_value, global_value, op=opt)
    return global_value[0]

def worker_numric_sum(value):
    """R
    """
    from mpi4py import MPI
    return wroker_numric_opt(value, MPI.SUM)

def worker_numric_avg(value):
    """R
    """
    return worker_numric_sum(value) / fleet.worker_num()

def worker_numric_min(value):
    """R
    """
    from mpi4py import MPI
    return wroker_numric_opt(value, MPI.MIN)

def worker_numric_max(value):
    """R
    """
    from mpi4py import MPI
    return wroker_numric_opt(value, MPI.MAX)

def rank0_print(log_str):
    """R
    """
    print_log(log_str, {'master': True})

def print_log(log_str, params):
    """R
    """
    if params['master']:
        if fleet.worker_index() == 0:
            print(log_str)
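These helpers reduce a scalar across all trainer workers through the role maker's MPI communicator. A minimal sketch of the same Allreduce pattern with plain mpi4py, where MPI.COMM_WORLD stands in for fleet._role_maker._node_type_comm (run under an MPI launcher such as mpirun):

```python
# demo.py -- hypothetical standalone illustration; run e.g. `mpirun -np 4 python demo.py`.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD

def numric_opt(value, op):
    local_value = np.array([value], dtype='float64')
    global_value = np.zeros_like(local_value)
    comm.Allreduce(local_value, global_value, op=op)   # combine the scalar across workers
    return global_value[0]

value = float(comm.Get_rank())        # per-worker metric, e.g. a local counter
total = numric_opt(value, MPI.SUM)    # what worker_numric_sum() computes
average = total / comm.Get_size()     # worker_numric_avg() divides by fleet.worker_num()
if comm.Get_rank() == 0:              # rank0_print()-style: only worker 0 logs
    print("sum=%s avg=%s" % (total, average))
```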
@@ -64,22 +106,33 @@ def print_log(log_str, params):

        params['stdout'] += str(datetime.datetime.now()) + log_str

def print_cost(cost, params):
    """R
    """
    log_str = params['log_format'] % cost
    print_log(log_str, params)
    return log_str

class CostPrinter:
    """
    For count cost time && print cost log
    """
    def __init__(self, callback, callback_params):
        """R
        """
        self.reset(callback, callback_params)
        pass

    def __del__(self):
        """R
        """
        if not self._done:
            self.done()
        pass

    def reset(self, callback, callback_params):
        """R
        """
        self._done = False
        self._callback = callback
        self._callback_params = callback_params

@@ -87,24 +140,35 @@ class CostPrinter:

        pass

    def done(self):
        """R
        """
        cost = time.time() - self._begin_time
        log_str = self._callback(cost, self._callback_params)  # cost(s)
        self._done = True
        return cost, log_str
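CostPrinter measures wall-clock time between reset() and done() and hands the elapsed seconds to a formatting callback such as print_cost. A small hypothetical usage sketch; it assumes reset() also records self._begin_time in the lines elided from this view:

```python
import time

class CostPrinterSketch(object):
    """Simplified stand-in for kagle_util.CostPrinter."""
    def __init__(self, callback, callback_params):
        self.reset(callback, callback_params)

    def reset(self, callback, callback_params):
        self._done = False
        self._callback = callback
        self._callback_params = callback_params
        self._begin_time = time.time()   # assumed: set in the elided part of reset()

    def done(self):
        cost = time.time() - self._begin_time
        log_str = self._callback(cost, self._callback_params)
        self._done = True
        return cost, log_str

def print_cost(cost, params):
    log_str = params['log_format'] % cost
    print(log_str)
    return log_str

printer = CostPrinterSketch(print_cost, {'log_format': 'save model cost %s sec'})
time.sleep(0.1)      # stands in for the timed work
printer.done()       # prints something like "save model cost 0.10... sec"
```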
class PathGenerator:
    """
    generate path with template & runtime variables
    """
    def __init__(self, config):
        """R
        """
        self._templates = {}
        self.add_path_template(config)
        pass

    def add_path_template(self, config):
        """R
        """
        if 'templates' in config:
            for template in config['templates']:
                self._templates[template['name']] = template['template']
        pass

    def generate_path(self, template_name, param):
        """R
        """
        if template_name in self._templates:
            if 'time_format' in param:
                str = param['time_format'].strftime(self._templates[template_name])
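generate_path expands a named template with datetime.strftime, so placeholders such as %Y%m%d in the configured template become the training date. A small example of that expansion with invented template names and paths:

```python
import datetime

templates = {
    'batch_model': 'afs:/user/demo/output/%Y%m%d/batch_model',   # hypothetical template
    'xbox_delta':  'afs:/user/demo/output/%Y%m%d/delta',         # hypothetical template
}

train_time = datetime.datetime.strptime("20200110", "%Y%m%d")
print(train_time.strftime(templates['batch_model']))
# afs:/user/demo/output/20200110/batch_model
```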
@@ -113,8 +177,15 @@ class PathGenerator:

        else:
            return ""

class TimeTrainPass:
    """
    timely pass
    define pass time_interval && start_time && end_time
    """
    def __init__(self, global_config):
        """R
        """
        self._config = global_config['epoch']
        if '+' in self._config['days']:
            day_str = self._config['days'].replace(' ', '')

@@ -156,9 +227,13 @@ class TimeTrainPass:

            self.init_pass_by_id(done_fileds[0], self._checkpoint_pass_id)

    def max_pass_num_day(self):
        """R
        """
        return 24 * 60 / self._interval_per_pass

    def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint):
        """R
        """
        if is_checkpoint:
            self._checkpoint_pass_id = pass_id
            self._checkpoint_model_path = model_path

@@ -168,6 +243,12 @@ class TimeTrainPass:

        pass

    def init_pass_by_id(self, date_str, pass_id):
        """
        init pass context with pass_id
        Args:
            date_str: example "20200110"
            pass_id(int): pass_id of date
        """
        date_time = make_datetime(date_str)
        if pass_id < 1:
            pass_id = 0

@@ -179,14 +260,23 @@ class TimeTrainPass:

Besides the new docstrings, current_pass() gains its missing self parameter:

        print(self._current_train_time)

    def init_pass_by_time(self, datetime_str):
        """
        init pass context with datetime
        Args:
            date_str: example "20200110000" -> "%Y%m%d%H%M"
        """
        self._current_train_time = make_datetime(datetime_str)
        minus = self._current_train_time.hour * 60 + self._current_train_time.minute;
        self._pass_id = minus / self._interval_per_pass + 1

    def current_pass(self):
        """R
        """
        return self._pass_id

    def next(self):
        """R
        """
        has_next = True
        old_pass_id = self._pass_id
        if self._pass_id < 1:
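init_pass_by_time maps a wall-clock time onto a pass number: minutes since midnight divided by the pass interval, plus one. A quick worked sketch with an invented interval value; note the code above relies on Python 2 integer division, which // reproduces here:

```python
# Hypothetical numbers: 60-minute passes, training time 07:30.
interval_per_pass = 60                        # minutes per pass (assumed config value)
hour, minute = 7, 30
minus = hour * 60 + minute                    # 450 minutes since midnight
pass_id = minus // interval_per_pass + 1      # // mirrors Python 2's integer `/`
print(pass_id)                                # 8 -> the 8th pass of the day

max_pass_num_day = 24 * 60 // interval_per_pass
print(max_pass_num_day)                       # 24 passes per day
```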
@@ -202,6 +292,8 @@ class TimeTrainPass:

        return has_next

    def is_checkpoint_pass(self, pass_id):
        """R
        """
        if pass_id < 1:
            return True
        if pass_id == self.max_pass_num_day():

@@ -211,10 +303,21 @@ class TimeTrainPass:

        return False

    def need_dump_inference(self, pass_id):
        """R
        """
        return self._inference_pass_id < pass_id and pass_id % self._dump_inference_interval == 0

    def date(self, delta_day=0):
        """
        get train date
        Args:
            delta_day(int): n day afer current_train_date
        Return:
            date(current_train_time + delta_day)
        """
        return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d")

    def timestamp(self, delta_day=0):
        """R
        """
        return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp()
kagle/trainer/abacus_trainer.py (+69 -40)

The intra-package imports switch from the absolute import kagle.kagle_* form to relative from .. import kagle_* imports, and docstrings are added throughout the file. The import block changes as follows (changed lines marked with - and +; unmarked lines are context):

"""
A paddle trainer Adapt to Abacus
"""
import abc
import sys
import copy
import yaml
import time
import json
import datetime
+import kagle_trainer
-import kagle.kagle_fs
+from .. import kagle_fs
-import kagle.kagle_util
+from .. import kagle_util
-import kagle.kagle_model
+from .. import kagle_model
-import kagle.kagle_metric
+from .. import kagle_metric
-import kagle.kagle_dataset
+from .. import kagle_dataset
-import kagle.trainer.kagle_trainer
import paddle.fluid as fluid
from abc import ABCMeta, abstractmethod
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
class AbacusPaddleTrainer(kagle_trainer.Trainer):
    """R
    """
    def __init__(self, config):
        """R
        """
        kagle_trainer.Trainer.__init__(self, config)
        config['output_path'] = kagle_util.get_absolute_path(
            config['output_path'], config['io']['afs'])

@@ -43,6 +50,8 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

        self.regist_context_processor('end_day', self.end_day)

    def init(self, context):
        """R
        """
        fleet.init(self._exe)
        data_var_list = []
        data_var_name_dict = {}
@@ -77,7 +86,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

The default thread count is now read through self.global_config instead of a bare global_config:

            if not executor['is_update_sparse']:
                program._fleet_opt["program_configs"][str(id(model.get_cost_op().block.program))]["push_sparse"] = []
            if 'train_thread_num' not in executor:
                executor['train_thread_num'] = self.global_config['train_thread_num']
            with fluid.scope_guard(scope):
                self._exe.run(model._build_param['model']['startup_program'])
            model.dump_model_program('./')
@@ -98,23 +107,29 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

        pass

    def print_log(self, log_str, params):
        """R
        """
        params['index'] = fleet.worker_index()
        return kagle_util.print_log(log_str, params)

    def print_global_metrics(self, scope, model, monitor_data, stdout_str):
        """R
        """
        metrics = model.get_metrics()
        metric_calculator = kagle_metric.PaddleAUCMetric(None)
        for metric in metrics:
            metric_param = {'label': metric, 'metric_dict': metrics[metric]}
            metric_calculator.calculate(scope, metric_param)
            metric_result = metric_calculator.get_result_to_string()
            self.print_log(metric_result, {'master': True, 'stdout': stdout_str})
            monitor_data += metric_result
            metric_calculator.clear(scope, metric_param)
    def save_model(self, day, pass_index, base_key):
        """R
        """
        cost_printer = kagle_util.CostPrinter(kagle_util.print_cost,
            {'master': True, 'log_format': 'save model cost %s sec'})
        model_path = self._path_generator.generate_path('batch_model', {'day': day, 'pass_id': pass_index})
        save_mode = 0  # just save all
        if pass_index < 1:  # batch_model

@@ -126,27 +141,30 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
        return model_path

    def save_xbox_model(self, day, pass_index, xbox_base_key, monitor_data):
        """R
        """
        stdout_str = ""
        xbox_patch_id = str(int(time.time()))
        kagle_util.rank0_print("begin save delta model")
        model_path = ""
        xbox_model_donefile = ""
        cost_printer = kagle_util.CostPrinter(kagle_util.print_cost, {'master': True, \
            'log_format': 'save xbox model cost %s sec', 'stdout': stdout_str})
        if pass_index < 1:
            save_mode = 2
            xbox_patch_id = xbox_base_key
            model_path = self._path_generator.generate_path('xbox_base', {'day': day})
            xbox_model_donefile = self._path_generator.generate_path('xbox_base_done', {'day': day})
        else:
            save_mode = 1
            model_path = self._path_generator.generate_path('xbox_delta', {'day': day, 'pass_id': pass_index})
            xbox_model_donefile = self._path_generator.generate_path('xbox_delta_done', {'day': day})
        total_save_num = fleet.save_persistables(None, model_path, mode=save_mode)
        cost_printer.done()
        cost_printer = kagle_util.CostPrinter(kagle_util.print_cost, {'master': True,
            'log_format': 'save cache model cost %s sec', 'stdout': stdout_str})
        model_file_handler = kagle_fs.FileHandler(self.global_config['io']['afs'])
        if self.global_config['save_cache_model']:
            cache_save_num = fleet.save_cache_model(None, model_path, mode=save_mode)
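save_xbox_model picks the output path and the mode passed to fleet.save_persistables from the pass index: pass 0 produces a base model (mode 2), later passes a delta (mode 1). A tiny sketch of that branch with hypothetical local path templates standing in for the PathGenerator output:

```python
def pick_xbox_save(pass_index, day):
    """Mirror of the branch above; the paths are invented placeholders."""
    if pass_index < 1:
        save_mode = 2                                    # base model
        model_path = "output/%s/base" % day
        donefile = "output/%s/xbox_base_done.txt" % day
    else:
        save_mode = 1                                    # delta model
        model_path = "output/%s/delta-%d" % (day, pass_index)
        donefile = "output/%s/xbox_patch_done.txt" % day
    return save_mode, model_path, donefile

print(pick_xbox_save(0, "20200110"))   # (2, 'output/20200110/base', ...)
print(pick_xbox_save(3, "20200110"))   # (1, 'output/20200110/delta-3', ...)
```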
@@ -161,7 +179,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

            'save_combine': True
        }
        cost_printer = kagle_util.CostPrinter(kagle_util.print_cost, {'master': True,
            'log_format': 'save dense model cost %s sec', 'stdout': stdout_str})
        for executor in self.global_config['executor']:
            if 'layer_for_inference' not in executor:
                continue

@@ -176,17 +194,17 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

        cost_printer.done()
        xbox_done_info = {
            "id": xbox_patch_id,
            "key": xbox_base_key,
            "ins_path": "",
            "ins_tag": "feasign",
            "partition_type": "2",
            "record_count": "111111",
            "monitor_data": monitor_data,
            "mpi_size": str(fleet.worker_num()),
            "input": model_path.rstrip("/") + "/000",
            "job_id": kagle_util.get_env_value("JOB_ID"),
            "job_name": kagle_util.get_env_value("JOB_NAME")
        }
        model_file_handler.write(json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
        if pass_index > 0:
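Each save appends one JSON line describing the produced model to the done file. A local sketch of that append, with plain open() standing in for kagle_fs.FileHandler and dummy field values:

```python
import json
import time

xbox_done_info = {
    "id": str(int(time.time())),
    "key": str(int(time.time())),
    "input": "output/20200110/delta-3/000",   # dummy model path
    "record_count": "111111",
    "monitor_data": "",
    "mpi_size": "4",
}
# FileHandler.write(..., 'a') equivalent for a local file: append one JSON line.
with open("xbox_patch_done.txt", "a") as donefile:
    donefile.write(json.dumps(xbox_done_info) + "\n")
```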
@@ -194,6 +212,8 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

        return stdout_str

    def run_executor(self, executor_config, dataset, stdout_str):
        """R
        """
        day = self._train_pass.date()
        pass_id = self._train_pass._pass_id
        xbox_base_key = self._train_pass._base_key

@@ -221,6 +241,8 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

            stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key, monitor_data)

    def startup(self, context):
        """R
        """
        if fleet.is_server():
            fleet.run_server()
            context['status'] = 'wait'
@@ -239,24 +261,28 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

The first base model is now saved for self._train_pass.date() instead of the local day:

            cost_printer.done()
            if self.global_config['save_first_base']:
                self.print_log("save_first_base=True", {'master': True})
                self.print_log("going to save xbox base model", {'master': True, 'stdout': stdout_str})
                self._train_pass._base_key = int(time.time())
                stdout_str += self.save_xbox_model(self._train_pass.date(), 0, self._train_pass._base_key, "")
        context['status'] = 'begin_day'
    def begin_day(self, context):
        """R
        """
        stdout_str = ""
        if not self._train_pass.next():
            context['is_exit'] = True
        day = self._train_pass.date()
        pass_id = self._train_pass._pass_id
        self.print_log("======== BEGIN DAY:%s ========" % day, {'master': True, 'stdout': stdout_str})
        if pass_id == self._train_pass.max_pass_num_day():
            context['status'] = 'end_day'
        else:
            context['status'] = 'train_pass'

    def end_day(self, context):
        """R
        """
        day = self._train_pass.date()
        pass_id = self._train_pass._pass_id
        xbox_base_key = int(time.time())
@@ -264,7 +290,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

        kagle_util.rank0_print("shrink table")
        cost_printer = kagle_util.CostPrinter(kagle_util.print_cost,
            {'master': True, 'log_format': 'shrink table done, cost %s sec'})
        fleet.shrink_sparse_table()
        for executor in self._exector_context:
            self._exector_context[executor]['model'].shrink({
@@ -281,27 +307,29 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

        self._train_pass._base_key = xbox_base_key

    def train_pass(self, context):
        """R
        """
        stdout_str = ""
        day = self._train_pass.date()
        pass_id = self._train_pass._pass_id
        base_key = self._train_pass._base_key
        pass_time = self._train_pass._current_train_time.strftime("%Y%m%d%H%M")
        self.print_log("    ==== begin delta:%s ========" % pass_id, {'master': True, 'stdout': stdout_str})
        train_begin_time = time.time()
        cost_printer = kagle_util.CostPrinter(kagle_util.print_cost, {'master': True, 'log_format': 'load into memory done, cost %s sec', 'stdout': stdout_str})
        current_dataset = {}
        for name in self._dataset:
            current_dataset[name] = self._dataset[name].load_dataset({
                'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
                'begin_time': pass_time, 'time_window_min': self._train_pass._interval_per_pass
            })
        cost_printer.done()
        kagle_util.rank0_print("going to global shuffle")
        cost_printer = kagle_util.CostPrinter(kagle_util.print_cost, {
            'master': True, 'stdout': stdout_str,
            'log_format': 'global shuffle done, cost %s sec'})
        for name in current_dataset:
            current_dataset[name].global_shuffle(fleet, self.global_config['dataset']['shuffle_thread'])
        cost_printer.done()

@@ -313,13 +341,14 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):

The release_memory cost printer call is wrapped with a line continuation:

        for name in self._dataset:
            self._dataset[name].preload_dataset({
                'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
                'begin_time': next_pass_time, 'time_window_min': self._train_pass._interval_per_pass
            })
        pure_train_begin = time.time()
        for executor in self.global_config['executor']:
            self.run_executor(executor, current_dataset[executor['dataset_name']], stdout_str)
        cost_printer = kagle_util.CostPrinter(kagle_util.print_cost, \
            {'master': True, 'log_format': 'release_memory cost %s sec'})
        for name in current_dataset:
            current_dataset[name].release_memory()
        pure_train_cost = time.time() - pure_train_begin
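Each training pass loads a time window of data that starts at the pass's begin time, while the next pass is preloaded one interval later; next_pass_time itself is computed in lines elided from this view. A worked sketch of the two window starts with assumed values:

```python
import datetime

# Hypothetical values mirroring train_pass(): 60-minute passes, training time 07:00.
interval_per_pass = 60                                     # minutes (assumed config value)
current_train_time = datetime.datetime(2020, 1, 10, 7, 0)
pass_time = current_train_time.strftime("%Y%m%d%H%M")      # "202001100700" -> begin_time for load_dataset
next_pass_time = (current_train_time +
                  datetime.timedelta(minutes=interval_per_pass)).strftime("%Y%m%d%H%M")
print(pass_time, next_pass_time)   # window starts passed as 'begin_time' with 'time_window_min'
```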
kagle/trainer/kagle_trainer.py (+1 -1)

@@ -8,7 +8,7 @@ import time

The metaclass declaration referenced self at class scope, where no self exists; it now uses the abc module directly:

class Trainer(object):
    """R
    """
-    __metaclass__ = self.ABCMeta
+    __metaclass__ = abc.ABCMeta

    def __init__(self, config):
        """R
        """
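For reference, a minimal sketch of what this metaclass line buys under Python 2: with abc.ABCMeta in place, instantiating a subclass that does not implement an @abc.abstractmethod raises TypeError (class names here are invented; Python 3 spells the same thing as class TrainerSketch(abc.ABC)):

```python
import abc

class TrainerSketch(object):
    __metaclass__ = abc.ABCMeta          # Python 2 spelling; inert under Python 3

    @abc.abstractmethod
    def train(self, context):
        """Subclasses must provide a training step."""

class GoodTrainer(TrainerSketch):
    def train(self, context):
        return "trained"

print(GoodTrainer().train({}))           # works: the abstract method is implemented
# TrainerSketch() would raise TypeError under Python 2 (or with abc.ABC under Python 3),
# because train() is still abstract there.
```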