Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
9042cb45
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9042cb45
编写于
3月 06, 2020
作者:
X
xiexionghang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
depend on paddle with bcloud
上级
61b1fd00
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
184 addition
and
50 deletion
+184
-50
kagle/kagle_model.py
kagle/kagle_model.py
+8
-6
kagle/kagle_util.py
kagle/kagle_util.py
+106
-3
kagle/trainer/abacus_trainer.py
kagle/trainer/abacus_trainer.py
+69
-40
kagle/trainer/kagle_trainer.py
kagle/trainer/kagle_trainer.py
+1
-1
未找到文件。
kagle/kagle_model.py
浏览文件 @
9042cb45
...
...
@@ -77,6 +77,7 @@ class Model(object):
"""R
"""
pass
@
abc
.
abstractmethod
def
dump_inference_program
(
self
,
inference_layer
,
path
):
"""R
...
...
@@ -101,7 +102,8 @@ class Model(object):
if
node
[
'name'
]
not
in
self
.
_inference_meta
[
'dependency'
][
layer
]:
continue
if
'inference_param'
in
self
.
_build_param
[
'layer_extend'
][
node
[
'name'
]]:
self
.
_inference_meta
[
'params'
][
layer
]
+=
self
.
_build_param
[
'layer_extend'
][
node
[
'name'
]][
'inference_param'
][
'params'
]
self
.
_inference_meta
[
'params'
][
layer
]
+=
\
self
.
_build_param
[
'layer_extend'
][
node
[
'name'
]][
'inference_param'
][
'params'
]
return
self
.
_inference_meta
[
'params'
][
layer
]
def
get_dependency
(
self
,
layer_graph
,
dest_layer
):
...
...
@@ -192,10 +194,10 @@ class FluidModel(Model):
metrics
=
params
[
'metrics'
]
for
name
in
metrics
:
model_metrics
=
metrics
[
name
]
stat_var_names
+=
[
model_metrics
[
metric
][
'var'
].
name
for
metric
in
model_metrics
]
stat_var_names
+=
[
model_metrics
[
metric
][
'var'
].
name
for
metric
in
model_metrics
]
strategy
[
'stat_var_names'
]
=
list
(
set
(
stat_var_names
))
optimizer_generator
=
'optimizer = fluid.optimizer.'
+
optimizer_conf
[
'class'
]
+
\
'(learning_rate='
+
str
(
optimizer_conf
[
'learning_rate'
])
+
')'
'(learning_rate='
+
str
(
optimizer_conf
[
'learning_rate'
])
+
')'
exec
(
optimizer_generator
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
return
optimizer
...
...
@@ -233,12 +235,12 @@ class FluidModel(Model):
fleet
.
_fleet_ptr
.
pull_dense
(
scope
,
table
[
'_meta'
].
_table_id
,
table
[
'params'
])
for
infernce_item
in
params
[
'inference_list'
]:
params_name_list
=
self
.
inference_params
(
infernce_item
[
'layer_name'
])
params_var_list
=
[
program
.
global_block
().
var
(
i
)
for
i
in
params_name_list
]
params_var_list
=
[
program
.
global_block
().
var
(
i
)
for
i
in
params_name_list
]
params_file_name
=
infernce_item
[
'save_file_name'
]
with
fluid
.
scope_guard
(
scope
):
if
params
[
'save_combine'
]:
fluid
.
io
.
save_vars
(
executor
,
"./"
,
program
,
vars
=
params_var_list
,
filename
=
params_file_name
)
fluid
.
io
.
save_vars
(
executor
,
"./"
,
\
program
,
vars
=
params_var_list
,
filename
=
params_file_name
)
else
:
fluid
.
io
.
save_vars
(
executor
,
params_file_name
,
program
,
vars
=
params_var_list
)
pass
kagle/kagle_util.py
浏览文件 @
9042cb45
"""
Util lib
"""
import
os
import
sys
import
time
import
datetime
import
kagle_fs
import
kagle
.kagle
_fs
import
numpy
as
np
from
paddle.fluid.incubate.fleet.parameter_server.pslib
import
fleet
def
get_env_value
(
env_name
):
"""
get os environment value
"""
return
os
.
popen
(
"echo -n ${"
+
env_name
+
"}"
).
read
().
strip
()
def
now_time_str
():
return
"
\n
"
+
time
.
strftime
(
"%Y-%m-%d %H:%M:%S"
,
time
.
localtime
())
+
"[0]:"
"""
get current format str_time
"""
return
"
\n
"
+
time
.
strftime
(
"%Y-%m-%d %H:%M:%S"
,
time
.
localtime
())
+
"[0]:"
def
get_absolute_path
(
path
,
params
):
"""R
"""
if
path
.
startswith
(
'afs:'
)
or
path
.
startswith
(
'hdfs:'
):
sub_path
=
path
.
split
(
'fs:'
)[
1
]
if
':'
in
sub_path
:
#such as afs://xxx:prot/xxxx
...
...
@@ -23,6 +34,14 @@ def get_absolute_path(path, params):
return
path
def
make_datetime
(
date_str
,
fmt
=
None
):
"""
create a datetime instance by date_string
Args:
date_str: such as 2020-01-14
date_str_format: "%Y-%m-%d"
Return:
datetime
"""
if
fmt
is
None
:
if
len
(
date_str
)
==
8
:
#%Y%m%d
return
datetime
.
datetime
.
strptime
(
date_str
,
'%Y%m%d'
)
...
...
@@ -32,28 +51,51 @@ def make_datetime(date_str, fmt = None):
def
wroker_numric_opt
(
value
,
opt
):
"""
numric count opt for workers
Args:
value: value for count
opt: count operator, SUM/MAX/MIN/AVG
Return:
count result
"""
local_value
=
np
.
array
([
value
])
global_value
=
np
.
copy
(
local_value
)
*
0
fleet
.
_role_maker
.
_node_type_comm
.
Allreduce
(
local_value
,
global_value
,
op
=
opt
)
return
global_value
[
0
]
def
worker_numric_sum
(
value
):
"""R
"""
from
mpi4py
import
MPI
return
wroker_numric_opt
(
value
,
MPI
.
SUM
)
def
worker_numric_avg
(
value
):
"""R
"""
return
worker_numric_sum
(
value
)
/
fleet
.
worker_num
()
def
worker_numric_min
(
value
):
"""R
"""
from
mpi4py
import
MPI
return
wroker_numric_opt
(
value
,
MPI
.
MIN
)
def
worker_numric_max
(
value
):
"""R
"""
from
mpi4py
import
MPI
return
wroker_numric_opt
(
value
,
MPI
.
MAX
)
def
rank0_print
(
log_str
):
"""R
"""
print_log
(
log_str
,
{
'master'
:
True
})
def
print_log
(
log_str
,
params
):
"""R
"""
if
params
[
'master'
]:
if
fleet
.
worker_index
()
==
0
:
print
(
log_str
)
...
...
@@ -64,22 +106,33 @@ def print_log(log_str, params):
params
[
'stdout'
]
+=
str
(
datetime
.
datetime
.
now
())
+
log_str
def
print_cost
(
cost
,
params
):
"""R
"""
log_str
=
params
[
'log_format'
]
%
cost
print_log
(
log_str
,
params
)
return
log_str
class
CostPrinter
:
"""
For count cost time && print cost log
"""
def
__init__
(
self
,
callback
,
callback_params
):
"""R
"""
self
.
reset
(
callback
,
callback_params
)
pass
def
__del__
(
self
):
"""R
"""
if
not
self
.
_done
:
self
.
done
()
pass
def
reset
(
self
,
callback
,
callback_params
):
"""R
"""
self
.
_done
=
False
self
.
_callback
=
callback
self
.
_callback_params
=
callback_params
...
...
@@ -87,24 +140,35 @@ class CostPrinter:
pass
def
done
(
self
):
"""R
"""
cost
=
time
.
time
()
-
self
.
_begin_time
log_str
=
self
.
_callback
(
cost
,
self
.
_callback_params
)
#cost(s)
self
.
_done
=
True
return
cost
,
log_str
class
PathGenerator
:
"""
generate path with template & runtime variables
"""
def
__init__
(
self
,
config
):
"""R
"""
self
.
_templates
=
{}
self
.
add_path_template
(
config
)
pass
def
add_path_template
(
self
,
config
):
"""R
"""
if
'templates'
in
config
:
for
template
in
config
[
'templates'
]:
self
.
_templates
[
template
[
'name'
]]
=
template
[
'template'
]
pass
def
generate_path
(
self
,
template_name
,
param
):
"""R
"""
if
template_name
in
self
.
_templates
:
if
'time_format'
in
param
:
str
=
param
[
'time_format'
].
strftime
(
self
.
_templates
[
template_name
])
...
...
@@ -113,8 +177,15 @@ class PathGenerator:
else
:
return
""
class
TimeTrainPass
:
"""
timely pass
define pass time_interval && start_time && end_time
"""
def
__init__
(
self
,
global_config
):
"""R
"""
self
.
_config
=
global_config
[
'epoch'
]
if
'+'
in
self
.
_config
[
'days'
]:
day_str
=
self
.
_config
[
'days'
].
replace
(
' '
,
''
)
...
...
@@ -156,9 +227,13 @@ class TimeTrainPass:
self
.
init_pass_by_id
(
done_fileds
[
0
],
self
.
_checkpoint_pass_id
)
def
max_pass_num_day
(
self
):
"""R
"""
return
24
*
60
/
self
.
_interval_per_pass
def
save_train_progress
(
self
,
day
,
pass_id
,
base_key
,
model_path
,
is_checkpoint
):
"""R
"""
if
is_checkpoint
:
self
.
_checkpoint_pass_id
=
pass_id
self
.
_checkpoint_model_path
=
model_path
...
...
@@ -168,6 +243,12 @@ class TimeTrainPass:
pass
def
init_pass_by_id
(
self
,
date_str
,
pass_id
):
"""
init pass context with pass_id
Args:
date_str: example "20200110"
pass_id(int): pass_id of date
"""
date_time
=
make_datetime
(
date_str
)
if
pass_id
<
1
:
pass_id
=
0
...
...
@@ -179,14 +260,23 @@ class TimeTrainPass:
print
(
self
.
_current_train_time
)
def
init_pass_by_time
(
self
,
datetime_str
):
"""
init pass context with datetime
Args:
date_str: example "20200110000" -> "%Y%m%d%H%M"
"""
self
.
_current_train_time
=
make_datetime
(
datetime_str
)
minus
=
self
.
_current_train_time
.
hour
*
60
+
self
.
_current_train_time
.
minute
;
self
.
_pass_id
=
minus
/
self
.
_interval_per_pass
+
1
def
current_pass
():
def
current_pass
(
self
):
"""R
"""
return
self
.
_pass_id
def
next
(
self
):
"""R
"""
has_next
=
True
old_pass_id
=
self
.
_pass_id
if
self
.
_pass_id
<
1
:
...
...
@@ -202,6 +292,8 @@ class TimeTrainPass:
return
has_next
def
is_checkpoint_pass
(
self
,
pass_id
):
"""R
"""
if
pass_id
<
1
:
return
True
if
pass_id
==
self
.
max_pass_num_day
():
...
...
@@ -211,10 +303,21 @@ class TimeTrainPass:
return
False
def
need_dump_inference
(
self
,
pass_id
):
"""R
"""
return
self
.
_inference_pass_id
<
pass_id
and
pass_id
%
self
.
_dump_inference_interval
==
0
def
date
(
self
,
delta_day
=
0
):
"""
get train date
Args:
delta_day(int): n day afer current_train_date
Return:
date(current_train_time + delta_day)
"""
return
(
self
.
_current_train_time
+
datetime
.
timedelta
(
days
=
delta_day
)).
strftime
(
"%Y%m%d"
)
def
timestamp
(
self
,
delta_day
=
0
):
"""R
"""
return
(
self
.
_current_train_time
+
datetime
.
timedelta
(
days
=
delta_day
)).
timestamp
()
kagle/trainer/abacus_trainer.py
浏览文件 @
9042cb45
"""
A paddle trainer Adapt to Abacus
"""
import
abc
import
sys
import
copy
import
yaml
import
time
import
json
import
datetime
import
kagle
_trainer
from
..
import
kagle_fs
from
..
import
kagle_uti
l
from
..
import
kagle_model
from
..
import
kagle_metric
from
..
import
kagle_dataset
import
kagle
.kagle_fs
import
kagle.kagle_util
import
kagle.kagle_mode
l
import
kagle.kagle_metric
import
kagle.kagle_dataset
import
kagle.trainer.kagle_trainer
import
paddle.fluid
as
fluid
from
abc
import
ABCMeta
,
abstractmethod
from
paddle.fluid.incubate.fleet.parameter_server.pslib
import
fleet
class
AbacusPaddleTrainer
(
kagle_trainer
.
Trainer
):
"""R
"""
def
__init__
(
self
,
config
):
"""R
"""
kagle_trainer
.
Trainer
.
__init__
(
self
,
config
)
config
[
'output_path'
]
=
kagle_util
.
get_absolute_path
(
config
[
'output_path'
],
config
[
'io'
][
'afs'
])
...
...
@@ -43,6 +50,8 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
self
.
regist_context_processor
(
'end_day'
,
self
.
end_day
)
def
init
(
self
,
context
):
"""R
"""
fleet
.
init
(
self
.
_exe
)
data_var_list
=
[]
data_var_name_dict
=
{}
...
...
@@ -77,7 +86,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
if
not
executor
[
'is_update_sparse'
]:
program
.
_fleet_opt
[
"program_configs"
][
str
(
id
(
model
.
get_cost_op
().
block
.
program
))][
"push_sparse"
]
=
[]
if
'train_thread_num'
not
in
executor
:
executor
[
'train_thread_num'
]
=
global_config
[
'train_thread_num'
]
executor
[
'train_thread_num'
]
=
self
.
global_config
[
'train_thread_num'
]
with
fluid
.
scope_guard
(
scope
):
self
.
_exe
.
run
(
model
.
_build_param
[
'model'
][
'startup_program'
])
model
.
dump_model_program
(
'./'
)
...
...
@@ -98,23 +107,29 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
pass
def
print_log
(
self
,
log_str
,
params
):
"""R
"""
params
[
'index'
]
=
fleet
.
worker_index
()
return
kagle_util
.
print_log
(
log_str
,
params
)
def
print_global_metrics
(
self
,
scope
,
model
,
monitor_data
,
stdout_str
):
"""R
"""
metrics
=
model
.
get_metrics
()
metric_calculator
=
kagle_metric
.
PaddleAUCMetric
(
None
)
for
metric
in
metrics
:
metric_param
=
{
'label'
:
metric
,
'metric_dict'
:
metrics
[
metric
]}
metric_param
=
{
'label'
:
metric
,
'metric_dict'
:
metrics
[
metric
]}
metric_calculator
.
calculate
(
scope
,
metric_param
)
metric_result
=
metric_calculator
.
get_result_to_string
()
self
.
print_log
(
metric_result
,
{
'master'
:
True
,
'stdout'
:
stdout_str
})
self
.
print_log
(
metric_result
,
{
'master'
:
True
,
'stdout'
:
stdout_str
})
monitor_data
+=
metric_result
metric_calculator
.
clear
(
scope
,
metric_param
)
def
save_model
(
self
,
day
,
pass_index
,
base_key
):
"""R
"""
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'log_format'
:
'save model cost %s sec'
})
{
'master'
:
True
,
'log_format'
:
'save model cost %s sec'
})
model_path
=
self
.
_path_generator
.
generate_path
(
'batch_model'
,
{
'day'
:
day
,
'pass_id'
:
pass_index
})
save_mode
=
0
# just save all
if
pass_index
<
1
:
#batch_model
...
...
@@ -126,27 +141,30 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
return
model_path
def
save_xbox_model
(
self
,
day
,
pass_index
,
xbox_base_key
,
monitor_data
):
"""R
"""
stdout_str
=
""
xbox_patch_id
=
str
(
int
(
time
.
time
()))
kagle_util
.
rank0_print
(
"begin save delta model"
)
model_path
=
""
xbox_model_donefile
=
""
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'log_format'
:
'save xbox model cost %s sec'
,
'stdout'
:
stdout_str
})
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
\
'log_format'
:
'save xbox model cost %s sec'
,
'stdout'
:
stdout_str
})
if
pass_index
<
1
:
save_mode
=
2
xbox_patch_id
=
xbox_base_key
model_path
=
self
.
_path_generator
.
generate_path
(
'xbox_base'
,
{
'day'
:
day
})
xbox_model_donefile
=
self
.
_path_generator
.
generate_path
(
'xbox_base_done'
,
{
'day'
:
day
})
model_path
=
self
.
_path_generator
.
generate_path
(
'xbox_base'
,
{
'day'
:
day
})
xbox_model_donefile
=
self
.
_path_generator
.
generate_path
(
'xbox_base_done'
,
{
'day'
:
day
})
else
:
save_mode
=
1
model_path
=
self
.
_path_generator
.
generate_path
(
'xbox_delta'
,
{
'day'
:
day
,
'pass_id'
:
pass_index
})
xbox_model_donefile
=
self
.
_path_generator
.
generate_path
(
'xbox_delta_done'
,
{
'day'
:
day
})
model_path
=
self
.
_path_generator
.
generate_path
(
'xbox_delta'
,
{
'day'
:
day
,
'pass_id'
:
pass_index
})
xbox_model_donefile
=
self
.
_path_generator
.
generate_path
(
'xbox_delta_done'
,
{
'day'
:
day
})
total_save_num
=
fleet
.
save_persistables
(
None
,
model_path
,
mode
=
save_mode
)
cost_printer
.
done
()
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'log_format'
:
'save cache model cost %s sec'
,
'stdout'
:
stdout_str
})
'log_format'
:
'save cache model cost %s sec'
,
'stdout'
:
stdout_str
})
model_file_handler
=
kagle_fs
.
FileHandler
(
self
.
global_config
[
'io'
][
'afs'
])
if
self
.
global_config
[
'save_cache_model'
]:
cache_save_num
=
fleet
.
save_cache_model
(
None
,
model_path
,
mode
=
save_mode
)
...
...
@@ -161,7 +179,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
'save_combine'
:
True
}
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'log_format'
:
'save dense model cost %s sec'
,
'stdout'
:
stdout_str
})
'log_format'
:
'save dense model cost %s sec'
,
'stdout'
:
stdout_str
})
for
executor
in
self
.
global_config
[
'executor'
]:
if
'layer_for_inference'
not
in
executor
:
continue
...
...
@@ -176,17 +194,17 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
cost_printer
.
done
()
xbox_done_info
=
{
"id"
:
xbox_patch_id
,
"key"
:
xbox_base_key
,
"ins_path"
:
""
,
"ins_tag"
:
"feasign"
,
"partition_type"
:
"2"
,
"record_count"
:
"111111"
,
"monitor_data"
:
monitor_data
,
"mpi_size"
:
str
(
fleet
.
worker_num
()),
"input"
:
model_path
.
rstrip
(
"/"
)
+
"/000"
,
"job_id"
:
kagle_util
.
get_env_value
(
"JOB_ID"
),
"job_name"
:
kagle_util
.
get_env_value
(
"JOB_NAME"
)
"id"
:
xbox_patch_id
,
"key"
:
xbox_base_key
,
"ins_path"
:
""
,
"ins_tag"
:
"feasign"
,
"partition_type"
:
"2"
,
"record_count"
:
"111111"
,
"monitor_data"
:
monitor_data
,
"mpi_size"
:
str
(
fleet
.
worker_num
()),
"input"
:
model_path
.
rstrip
(
"/"
)
+
"/000"
,
"job_id"
:
kagle_util
.
get_env_value
(
"JOB_ID"
),
"job_name"
:
kagle_util
.
get_env_value
(
"JOB_NAME"
)
}
model_file_handler
.
write
(
json
.
dumps
(
xbox_done_info
)
+
"
\n
"
,
xbox_model_donefile
,
'a'
)
if
pass_index
>
0
:
...
...
@@ -194,6 +212,8 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
return
stdout_str
def
run_executor
(
self
,
executor_config
,
dataset
,
stdout_str
):
"""R
"""
day
=
self
.
_train_pass
.
date
()
pass_id
=
self
.
_train_pass
.
_pass_id
xbox_base_key
=
self
.
_train_pass
.
_base_key
...
...
@@ -221,6 +241,8 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
stdout_str
+=
self
.
save_xbox_model
(
day
,
pass_id
,
xbox_base_key
,
monitor_data
)
def
startup
(
self
,
context
):
"""R
"""
if
fleet
.
is_server
():
fleet
.
run_server
()
context
[
'status'
]
=
'wait'
...
...
@@ -239,24 +261,28 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
cost_printer
.
done
()
if
self
.
global_config
[
'save_first_base'
]:
self
.
print_log
(
"save_first_base=True"
,
{
'master'
:
True
})
self
.
print_log
(
"going to save xbox base model"
,
{
'master'
:
True
,
'stdout'
:
stdout_str
})
self
.
print_log
(
"going to save xbox base model"
,
{
'master'
:
True
,
'stdout'
:
stdout_str
})
self
.
_train_pass
.
_base_key
=
int
(
time
.
time
())
stdout_str
+=
self
.
save_xbox_model
(
day
,
0
,
self
.
_train_pass
.
_base_key
,
""
)
stdout_str
+=
self
.
save_xbox_model
(
self
.
_train_pass
.
date
()
,
0
,
self
.
_train_pass
.
_base_key
,
""
)
context
[
'status'
]
=
'begin_day'
def
begin_day
(
self
,
context
):
"""R
"""
stdout_str
=
""
if
not
self
.
_train_pass
.
next
():
context
[
'is_exit'
]
=
True
day
=
self
.
_train_pass
.
date
()
pass_id
=
self
.
_train_pass
.
_pass_id
self
.
print_log
(
"======== BEGIN DAY:%s ========"
%
day
,
{
'master'
:
True
,
'stdout'
:
stdout_str
})
self
.
print_log
(
"======== BEGIN DAY:%s ========"
%
day
,
{
'master'
:
True
,
'stdout'
:
stdout_str
})
if
pass_id
==
self
.
_train_pass
.
max_pass_num_day
():
context
[
'status'
]
=
'end_day'
else
:
context
[
'status'
]
=
'train_pass'
def
end_day
(
self
,
context
):
"""R
"""
day
=
self
.
_train_pass
.
date
()
pass_id
=
self
.
_train_pass
.
_pass_id
xbox_base_key
=
int
(
time
.
time
())
...
...
@@ -264,7 +290,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
kagle_util
.
rank0_print
(
"shrink table"
)
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'log_format'
:
'shrink table done, cost %s sec'
})
{
'master'
:
True
,
'log_format'
:
'shrink table done, cost %s sec'
})
fleet
.
shrink_sparse_table
()
for
executor
in
self
.
_exector_context
:
self
.
_exector_context
[
executor
][
'model'
].
shrink
({
...
...
@@ -281,27 +307,29 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
self
.
_train_pass
.
_base_key
=
xbox_base_key
def
train_pass
(
self
,
context
):
"""R
"""
stdout_str
=
""
day
=
self
.
_train_pass
.
date
()
pass_id
=
self
.
_train_pass
.
_pass_id
base_key
=
self
.
_train_pass
.
_base_key
pass_time
=
self
.
_train_pass
.
_current_train_time
.
strftime
(
"%Y%m%d%H%M"
)
self
.
print_log
(
" ==== begin delta:%s ========"
%
pass_id
,
{
'master'
:
True
,
'stdout'
:
stdout_str
})
self
.
print_log
(
" ==== begin delta:%s ========"
%
pass_id
,
{
'master'
:
True
,
'stdout'
:
stdout_str
})
train_begin_time
=
time
.
time
()
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'log_format'
:
'load into memory done, cost %s sec'
,
'stdout'
:
stdout_str
})
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'log_format'
:
'load into memory done, cost %s sec'
,
'stdout'
:
stdout_str
})
current_dataset
=
{}
for
name
in
self
.
_dataset
:
current_dataset
[
name
]
=
self
.
_dataset
[
name
].
load_dataset
({
'node_num'
:
fleet
.
worker_num
(),
'node_idx'
:
fleet
.
worker_index
(),
'begin_time'
:
pass_time
,
'time_window_min'
:
self
.
_train_pass
.
_interval_per_pass
'begin_time'
:
pass_time
,
'time_window_min'
:
self
.
_train_pass
.
_interval_per_pass
})
cost_printer
.
done
()
kagle_util
.
rank0_print
(
"going to global shuffle"
)
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'stdout'
:
stdout_str
,
'log_format'
:
'global shuffle done, cost %s sec'
})
'master'
:
True
,
'stdout'
:
stdout_str
,
'log_format'
:
'global shuffle done, cost %s sec'
})
for
name
in
current_dataset
:
current_dataset
[
name
].
global_shuffle
(
fleet
,
self
.
global_config
[
'dataset'
][
'shuffle_thread'
])
cost_printer
.
done
()
...
...
@@ -313,13 +341,14 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
for
name
in
self
.
_dataset
:
self
.
_dataset
[
name
].
preload_dataset
({
'node_num'
:
fleet
.
worker_num
(),
'node_idx'
:
fleet
.
worker_index
(),
'begin_time'
:
next_pass_time
,
'time_window_min'
:
self
.
_train_pass
.
_interval_per_pass
'begin_time'
:
next_pass_time
,
'time_window_min'
:
self
.
_train_pass
.
_interval_per_pass
})
pure_train_begin
=
time
.
time
()
for
executor
in
self
.
global_config
[
'executor'
]:
self
.
run_executor
(
executor
,
current_dataset
[
executor
[
'dataset_name'
]],
stdout_str
)
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
{
'master'
:
True
,
'log_format'
:
'release_memory cost %s sec'
})
cost_printer
=
kagle_util
.
CostPrinter
(
kagle_util
.
print_cost
,
\
{
'master'
:
True
,
'log_format'
:
'release_memory cost %s sec'
})
for
name
in
current_dataset
:
current_dataset
[
name
].
release_memory
()
pure_train_cost
=
time
.
time
()
-
pure_train_begin
...
...
kagle/trainer/kagle_trainer.py
浏览文件 @
9042cb45
...
...
@@ -8,7 +8,7 @@ import time
class
Trainer
(
object
):
"""R
"""
__metaclass__
=
self
.
ABCMeta
__metaclass__
=
abc
.
ABCMeta
def
__init__
(
self
,
config
):
"""R
"""
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录