PaddlePaddle / PaddleRec
Commit f9ef376e
Authored May 20, 2020 by tangwei

fix code style

Parent: bfce2242

Showing 13 changed files with 383 additions and 227 deletions (+383 −227)
core/modules/modul/build.py                  +54  −22
core/modules/modul/layers.py                 +87  −26
core/trainer.py                              +4   −2
core/trainers/cluster_trainer.py             +22  −19
core/trainers/online_learning_trainer.py     +23  −19
core/trainers/single_trainer.py              +15  −15
core/trainers/tdm_cluster_trainer.py         +21  −19
core/trainers/tdm_single_trainer.py          +30  −25
core/trainers/transpiler_trainer.py          +39  −34
core/utils/dataset_holder.py                 +45  −20
core/utils/dataset_instance.py               +3   −2
core/utils/envs.py                           +6   −7
core/utils/util.py                           +34  −17
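All 13 files below are touched only by mechanical line-wrapping and literal re-layout. As a hedged illustration of how such a pass is typically produced (the exact tool and style configuration used for this commit are an assumption, not stated in the diff), yapf's Python API reproduces the wrapping pattern visible in the hunks:

```python
# Hypothetical reproduction of the reformatting step; 'pep8' style with the
# default 79-column limit is an assumption. In recent yapf releases
# FormatCode returns a (formatted_code, changed) tuple.
from yapf.yapflib.yapf_api import FormatCode

source = (
    "self._build_param = {'layer': {}, 'inner_layer': {}, "
    "'layer_extend': {}, 'model': {}}\n"
)
formatted, changed = FormatCode(source, style_config='pep8')
print(changed)    # True: the over-long dict literal was rewritten
print(formatted)  # expanded one key per line, as in build.py below
```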
core/modules/modul/build.py (+54 −22)

```diff
@@ -31,6 +31,7 @@ def create(config):
         Model Instance
     """
     model = None
     if config['mode'] == 'fluid':
         model = YamlModel(config)
         model.train_net()
@@ -50,7 +51,12 @@ class YamlModel(Model):
         f = open(config['layer_file'], 'r')
         self._build_nodes = yaml.safe_load(f.read())
         self._build_phase = ['input', 'param', 'summary', 'layer']
-        self._build_param = {'layer': {}, 'inner_layer': {}, 'layer_extend': {}, 'model': {}}
+        self._build_param = {
+            'layer': {},
+            'inner_layer': {},
+            'layer_extend': {},
+            'model': {}
+        }
         self._inference_meta = {'dependency': {}, 'params': {}}

     def train_net(self):
@@ -76,10 +82,12 @@ class YamlModel(Model):
             if self._build_nodes[phase] is None:
                 continue
             for node in self._build_nodes[phase]:
                 exec("""layer=layer.{}(node)""".format(node['class']))
-                layer_output, extend_output = layer.generate(self._config['mode'], self._build_param)
+                layer_output, extend_output = layer.generate(
+                    self._config['mode'], self._build_param)
                 self._build_param['layer'][node['name']] = layer_output
                 self._build_param['layer_extend'][node['name']] = extend_output
                 if extend_output is None:
                     continue
                 if 'loss' in extend_output:
@@ -89,17 +97,24 @@ class YamlModel(Model):
                     self._cost += extend_output['loss']
                 if 'data_var' in extend_output:
                     self._data_var += extend_output['data_var']
-                if 'metric_label' in extend_output and extend_output['metric_label'] is not None:
-                    self._metrics[extend_output['metric_label']] = extend_output['metric_dict']
+                if 'metric_label' in extend_output and extend_output[
+                        'metric_label'] is not None:
+                    self._metrics[extend_output[
+                        'metric_label']] = extend_output['metric_dict']
                 if 'inference_param' in extend_output:
                     inference_param = extend_output['inference_param']
                     param_name = inference_param['name']
                     if param_name not in self._build_param['table']:
-                        self._build_param['table'][param_name] = {'params': []}
-                        table_meta = table.TableMeta.alloc_new_table(inference_param['table_id'])
-                        self._build_param['table'][param_name]['_meta'] = table_meta
-                    self._build_param['table'][param_name]['params'] += inference_param['params']
+                        self._build_param['table'][param_name] = {
+                            'params': []
+                        }
+                        table_meta = table.TableMeta.alloc_new_table(
+                            inference_param['table_id'])
+                        self._build_param['table'][param_name][
+                            '_meta'] = table_meta
+                    self._build_param['table'][param_name][
+                        'params'] += inference_param['params']
                 pass

     @classmethod
@@ -114,20 +129,25 @@ class YamlModel(Model):
         metrics = params['metrics']
         for name in metrics:
             model_metrics = metrics[name]
-            stat_var_names += [model_metrics[metric]['var'].name for metric in model_metrics]
+            stat_var_names += [
+                model_metrics[metric]['var'].name for metric in model_metrics
+            ]
         strategy['stat_var_names'] = list(set(stat_var_names))
         optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \
             '(learning_rate=' + str(optimizer_conf['learning_rate']) + ')'
         exec(optimizer_generator)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         return optimizer

     def dump_model_program(self, path):
         """R
         """
-        with open(path + '/' + self._name + '_main_program.pbtxt', "w") as fout:
+        with open(path + '/' + self._name + '_main_program.pbtxt',
+                  "w") as fout:
             print >> fout, self._build_param['model']['train_program']
-        with open(path + '/' + self._name + '_startup_program.pbtxt', "w") as fout:
+        with open(path + '/' + self._name + '_startup_program.pbtxt',
+                  "w") as fout:
             print >> fout, self._build_param['model']['startup_program']
         pass
@@ -137,7 +157,8 @@ class YamlModel(Model):
         scope = params['scope']
         decay = params['decay']
         for param_table in self._build_param['table']:
-            table_id = self._build_param['table'][param_table]['_meta']._table_id
+            table_id = self._build_param['table'][param_table][
+                '_meta']._table_id
             fleet.shrink_dense_table(decay, scope=scope, table_id=table_id)

     def dump_inference_program(self, inference_layer, path):
@@ -152,17 +173,25 @@ class YamlModel(Model):
         executor = params['executor']
         program = self._build_param['model']['train_program']
         for table_name, table in self._build_param['table'].items():
-            fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, table['params'])
+            fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id,
+                                        table['params'])
         for infernce_item in params['inference_list']:
-            params_name_list = self.inference_params(infernce_item['layer_name'])
-            params_var_list = [program.global_block().var(i) for i in params_name_list]
+            params_name_list = self.inference_params(infernce_item[
+                'layer_name'])
+            params_var_list = [
+                program.global_block().var(i) for i in params_name_list
+            ]
             params_file_name = infernce_item['save_file_name']
             with fluid.scope_guard(scope):
                 if params['save_combine']:
                     fluid.io.save_vars(executor, "./", \
                         program, vars=params_var_list, filename=params_file_name)
                 else:
-                    fluid.io.save_vars(executor, params_file_name, program, vars=params_var_list)
+                    fluid.io.save_vars(executor, params_file_name, program,
+                                       vars=params_var_list)

     def inference_params(self, inference_layer):
         """
@@ -177,11 +206,13 @@ class YamlModel(Model):
             return self._inference_meta['params'][layer]

         self._inference_meta['params'][layer] = []
-        self._inference_meta['dependency'][layer] = self.get_dependency(self._build_param['inner_layer'], layer)
+        self._inference_meta['dependency'][layer] = self.get_dependency(
+            self._build_param['inner_layer'], layer)
         for node in self._build_nodes['layer']:
             if node['name'] not in self._inference_meta['dependency'][layer]:
                 continue
-            if 'inference_param' in self._build_param['layer_extend'][node['name']]:
+            if 'inference_param' in self._build_param['layer_extend'][node[
+                    'name']]:
                 self._inference_meta['params'][layer] += \
                     self._build_param['layer_extend'][node['name']]['inference_param']['params']
         return self._inference_meta['params'][layer]
@@ -199,5 +230,6 @@ class YamlModel(Model):
         dependencys = copy.deepcopy(layer_graph[dest_layer]['input'])
         dependency_list = copy.deepcopy(dependencys)
         for dependency in dependencys:
-            dependency_list = dependency_list + self.get_dependency(layer_graph, dependency)
+            dependency_list = dependency_list + self.get_dependency(
+                layer_graph, dependency)
         return list(set(dependency_list))
```
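The hunks above keep the repo's `exec("""layer=layer.{}(node)""".format(node['class']))` dispatch untouched. For readers unfamiliar with the pattern, the same dynamic instantiation can be sketched without `exec()` via `getattr`; this is an illustration only, not code from the commit, and `module_name`/`class_name` are placeholders:

```python
# Sketch of exec-free dynamic layer dispatch. importlib resolves the module,
# getattr fetches the class named in the YAML node, and the node dict is
# passed to its constructor, mirroring what the exec() line achieves.
import importlib

def build_layer(module_name, class_name, node):
    module = importlib.import_module(module_name)      # e.g. the layers module
    layer_cls = getattr(module, class_name)            # e.g. EmbeddingFuseLayer
    return layer_cls(node)
```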
core/modules/modul/layers.py (+87 −26)

```diff
@@ -18,7 +18,7 @@ from paddlerec.core.layer import Layer

 class EmbeddingFuseLayer(Layer):
-    """R
+    """embedding + sequence + concat
     """

     def __init__(self, config):
@@ -40,7 +40,8 @@ class EmbeddingFuseLayer(Layer):
         show_clk.stop_gradient = True
         data_var = []
         for slot in self._slots:
-            l = fluid.layers.data(name=slot, shape=[1], dtype="int64", lod_level=1)
+            l = fluid.layers.data(
+                name=slot, shape=[1], dtype="int64", lod_level=1)
             data_var.append(l)
             emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \
                 is_sparse=True, is_distributed=True,
@@ -48,7 +49,8 @@ class EmbeddingFuseLayer(Layer):
             emb = fluid.layers.sequence_pool(input=emb, pool_type='sum')
             emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm)
             self._emb_layers.append(emb)
-        output = fluid.layers.concat(input=self._emb_layers, axis=1, name=self._name)
+        output = fluid.layers.concat(
+            input=self._emb_layers, axis=1, name=self._name)
         return output, {'data_var': data_var}
@@ -111,7 +113,13 @@ class ParamLayer(Layer):
     def generate(self, param):
         """R
         """
-        return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}}
+        return self._config, {
+            'inference_param': {
+                'name': 'param',
+                'params': [],
+                'table_id': self._table_id
+            }
+        }

 class SummaryLayer(Layer):
@@ -129,7 +137,13 @@ class SummaryLayer(Layer):
     def generate(self, param):
         """R
         """
-        return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}}
+        return self._config, {
+            'inference_param': {
+                'name': 'summary',
+                'params': [],
+                'table_id': self._table_id
+            }
+        }

 class NormalizationLayer(Layer):
@@ -152,9 +166,19 @@ class NormalizationLayer(Layer):
         if len(self._input) > 0:
             input_list = [param['layer'][i] for i in self._input]
             input_layer = fluid.layers.concat(input=input_list, axis=1)
-        bn = fluid.layers.data_norm(input=input_layer, name=self._name, epsilon=1e-4,
-            param_attr={"batch_size": 1e4, "batch_sum_default": 0.0, "batch_square": 1e4})
-        inference_param = [self._name + '.batch_size', self._name + '.batch_sum', self._name + '.batch_square_sum']
+        bn = fluid.layers.data_norm(
+            input=input_layer,
+            name=self._name,
+            epsilon=1e-4,
+            param_attr={
+                "batch_size": 1e4,
+                "batch_sum_default": 0.0,
+                "batch_square": 1e4
+            })
+        inference_param = [
+            self._name + '.batch_size', self._name + '.batch_sum',
+            self._name + '.batch_square_sum'
+        ]
         return bn, {'inference_param': {'name': 'summary', \
             'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}}
@@ -181,11 +205,13 @@ class FCLayer(Layer):
         input_list = [param['layer'][i] for i in self._input]
         input_layer = fluid.layers.concat(input=input_list, axis=1)
         input_coln = input_layer.shape[1]
-        scale = param_layer['init_range'] / (input_coln ** 0.5)
+        scale = param_layer['init_range'] / (input_coln**0.5)
         bias = None
         if self._bias:
-            bias = fluid.ParamAttr(learning_rate=1.0, initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale))
+            bias = fluid.ParamAttr(
+                learning_rate=1.0,
+                initializer=fluid.initializer.NormalInitializer(
+                    loc=0.0, scale=scale))
         fc = fluid.layers.fc(
             name=self._name,
             input=input_layer,
@@ -216,18 +242,46 @@ class LogLossLayer(Layer):
         self._extend_output = {
             'metric_label': self._metric_label,
             'metric_dict': {
-                'auc': {'var': None},
-                'batch_auc': {'var': None},
-                'stat_pos': {'var': None, 'data_type': 'int64'},
-                'stat_neg': {'var': None, 'data_type': 'int64'},
-                'batch_stat_pos': {'var': None, 'data_type': 'int64'},
-                'batch_stat_neg': {'var': None, 'data_type': 'int64'},
-                'pos_ins_num': {'var': None},
-                'abserr': {'var': None},
-                'sqrerr': {'var': None},
-                'prob': {'var': None},
-                'total_ins_num': {'var': None},
-                'q': {'var': None}
+                'auc': {
+                    'var': None
+                },
+                'batch_auc': {
+                    'var': None
+                },
+                'stat_pos': {
+                    'var': None,
+                    'data_type': 'int64'
+                },
+                'stat_neg': {
+                    'var': None,
+                    'data_type': 'int64'
+                },
+                'batch_stat_pos': {
+                    'var': None,
+                    'data_type': 'int64'
+                },
+                'batch_stat_neg': {
+                    'var': None,
+                    'data_type': 'int64'
+                },
+                'pos_ins_num': {
+                    'var': None
+                },
+                'abserr': {
+                    'var': None
+                },
+                'sqrerr': {
+                    'var': None
+                },
+                'prob': {
+                    'var': None
+                },
+                'total_ins_num': {
+                    'var': None
+                },
+                'q': {
+                    'var': None
+                }
             }
         }
@@ -236,9 +290,12 @@ class LogLossLayer(Layer):
         """
         input_layer = param['layer'][self._input[0]]
         label_layer = param['layer'][self._label]
-        output = fluid.layers.clip(input_layer, self._bound[0], self._bound[1], name=self._name)
+        output = fluid.layers.clip(
+            input_layer, self._bound[0], self._bound[1], name=self._name)
         norm = fluid.layers.sigmoid(output, name=self._name)
-        output = fluid.layers.log_loss(norm, fluid.layers.cast(x=label_layer, dtype='float32'))
+        output = fluid.layers.log_loss(
+            norm, fluid.layers.cast(
+                x=label_layer, dtype='float32'))
         if self._weight:
             weight_layer = param['layer'][self._weight]
             output = fluid.layers.elementwise_mul(output, weight_layer)
@@ -248,7 +305,11 @@ class LogLossLayer(Layer):
         # For AUC Metric
         metric = self._extend_output['metric_dict']
         binary_predict = fluid.layers.concat(
-            input=[fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), norm], axis=1)
+            input=[
+                fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm),
+                norm
+            ],
+            axis=1)
         metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \
             metric['batch_stat_neg']['var'], metric['stat_pos']['var'], metric['stat_neg']['var']] = \
```
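Every layer in this file follows the contract that `YamlModel.train_net` (in build.py above) consumes: `generate()` returns the layer output plus an `extend_output` dict whose optional keys (`loss`, `data_var`, `metric_label`/`metric_dict`, `inference_param`) drive cost accumulation, feed-var collection, metrics, and table bookkeeping. A minimal pure-Python sketch of that contract, with a simplified base class and a toy layer not present in the repo:

```python
# Reduced sketch of the Layer.generate contract used throughout layers.py.
class Layer(object):
    def __init__(self, config):
        self._config = config

    def generate(self, mode, param):
        raise NotImplementedError

class ConstantLayer(Layer):
    """Toy layer: emits a constant and declares no trainable parameters."""
    def generate(self, mode, param):
        extend_output = {
            'inference_param': {'name': 'param', 'params': [], 'table_id': -1}
        }
        return self._config.get('value', 0), extend_output

out, extra = ConstantLayer({'value': 1}).generate('fluid', {})
print(out, extra['inference_param']['table_id'])  # 1 -1
```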
core/trainer.py (+4 −2)

```diff
@@ -30,8 +30,10 @@ class Trainer(object):
     def __init__(self, config=None):
         self._status_processor = {}
         self._place = fluid.CPUPlace()
         self._exe = fluid.Executor(self._place)
         self._exector_context = {}
         self._context = {'status': 'uninit', 'is_exit': False}
         self._config_yaml = config
@@ -95,6 +97,6 @@ def user_define_engine(engine_yaml):
     train_dirname = os.path.dirname(train_location)
     base_name = os.path.splitext(os.path.basename(train_location))[0]
     sys.path.append(train_dirname)
-    trainer_class = envs.lazy_instance_by_fliename(base_name, "UserDefineTraining")
+    trainer_class = envs.lazy_instance_by_fliename(base_name,
+                                                   "UserDefineTraining")
     return trainer_class
```
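`Trainer._status_processor` and `_context` back the state machine that all trainers below drive via `regist_context_processor('<status>', handler)`: the run loop repeatedly dispatches on `_context['status']` until a handler sets `is_exit`. A reduced, runnable sketch of that loop (reconstructed from usage in this diff, not copied from core/trainer.py):

```python
# Minimal context-processor state machine mirroring the trainers' usage.
class MiniTrainer(object):
    def __init__(self):
        self._status_processor = {}
        self._context = {'status': 'uninit', 'is_exit': False}

    def regist_context_processor(self, status, processor):
        self._status_processor[status] = processor

    def run(self):
        while not self._context['is_exit']:
            # Each handler mutates context['status'] to move to the next pass.
            self._status_processor[self._context['status']](self._context)

def instance(context):
    context['status'] = 'terminal_pass'

def terminal(context):
    context['is_exit'] = True

t = MiniTrainer()
t.regist_context_processor('uninit', instance)
t.regist_context_processor('terminal_pass', terminal)
t.run()
```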
core/trainers/cluster_trainer.py (+22 −19)

```diff
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 Training use fluid with one node only.
 """
@@ -43,11 +42,14 @@ class ClusterTrainer(TranspileTrainer):
         self.regist_context_processor('uninit', self.instance)
         self.regist_context_processor('init_pass', self.init)
         self.regist_context_processor('startup_pass', self.startup)
-        if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
+        if envs.get_platform() == "LINUX" and envs.get_global_env(
+                "dataset_class", None, "train.reader") != "DataLoader":
             self.regist_context_processor('train_pass', self.dataset_train)
         else:
-            self.regist_context_processor('train_pass', self.dataloader_train)
+            self.regist_context_processor('train_pass',
+                                          self.dataloader_train)
         self.regist_context_processor('infer_pass', self.infer)
         self.regist_context_processor('terminal_pass', self.terminal)
@@ -75,8 +77,8 @@ class ClusterTrainer(TranspileTrainer):
     def init(self, context):
         self.model.train_net()
         optimizer = self.model.optimizer()
-        optimizer_name = envs.get_global_env("hyper_parameters.optimizer", None, "train.model")
+        optimizer_name = envs.get_global_env("hyper_parameters.optimizer",
+                                             None, "train.model")
         if optimizer_name not in ["", "sgd", "SGD", "Sgd"]:
             os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0'
@@ -114,9 +116,9 @@ class ClusterTrainer(TranspileTrainer):
         program = fluid.compiler.CompiledProgram(
             fleet.main_program).with_data_parallel(
                 loss_name=self.model.get_avg_cost().name,
                 build_strategy=self.strategy.get_build_strategy(),
                 exec_strategy=self.strategy.get_execute_strategy())

         metrics_varnames = []
         metrics_format = []
@@ -135,9 +137,8 @@ class ClusterTrainer(TranspileTrainer):
         batch_id = 0
         try:
             while True:
-                metrics_rets = self._exe.run(
-                    program=program,
-                    fetch_list=metrics_varnames)
+                metrics_rets = self._exe.run(program=program,
+                                             fetch_list=metrics_varnames)
                 metrics = [epoch, batch_id]
                 metrics.extend(metrics_rets)
@@ -162,14 +163,16 @@ class ClusterTrainer(TranspileTrainer):
         for i in range(epochs):
             begin_time = time.time()
-            self._exe.train_from_dataset(program=fluid.default_main_program(),
-                                         dataset=dataset,
-                                         fetch_list=self.fetch_vars,
-                                         fetch_info=self.fetch_alias,
-                                         print_period=self.fetch_period)
+            self._exe.train_from_dataset(
+                program=fluid.default_main_program(),
+                dataset=dataset,
+                fetch_list=self.fetch_vars,
+                fetch_info=self.fetch_alias,
+                print_period=self.fetch_period)
             end_time = time.time()
             times = end_time - begin_time
-            print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
+            print("epoch {} using time {}, speed {:.2f} lines/s".format(
+                i, times, ins / times))
             self.save(i, "train", is_fleet=True)

         fleet.stop_worker()
```
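The rewrapped condition in `processor_register` encodes one decision that recurs in every trainer here: the pipe-based Dataset path is only usable on Linux, otherwise (or when the config explicitly asks for `DataLoader`) the DataLoader path is taken. Factored into a standalone helper for clarity; a sketch, not code from the repo:

```python
# The LINUX-and-not-DataLoader branch as a predicate. platform.system()
# stands in for envs.get_platform(), and reader_cfg for the
# "train.reader" namespace.
import platform

def use_dataset_path(reader_cfg):
    is_linux = platform.system() == "Linux"
    return is_linux and reader_cfg.get("dataset_class") != "DataLoader"

print(use_dataset_path({"dataset_class": "QueueDataset"}))  # True on Linux
print(use_dataset_path({"dataset_class": "DataLoader"}))    # False
```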
core/trainers/online_learning_trainer.py (+23 −19)

```diff
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 Training use fluid with one node only.
 """
@@ -44,11 +43,14 @@ class OnlineLearningTrainer(TranspileTrainer):
         self.regist_context_processor('uninit', self.instance)
         self.regist_context_processor('init_pass', self.init)
         self.regist_context_processor('startup_pass', self.startup)
-        if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
+        if envs.get_platform() == "LINUX" and envs.get_global_env(
+                "dataset_class", None, "train.reader") != "DataLoader":
             self.regist_context_processor('train_pass', self.dataset_train)
         else:
-            self.regist_context_processor('train_pass', self.dataloader_train)
+            self.regist_context_processor('train_pass',
+                                          self.dataloader_train)
         self.regist_context_processor('infer_pass', self.infer)
         self.regist_context_processor('terminal_pass', self.terminal)
@@ -110,27 +112,27 @@ class OnlineLearningTrainer(TranspileTrainer):
         if state == "TRAIN":
             inputs = self.model.get_inputs()
             namespace = "train.reader"
-            train_data_path = envs.get_global_env("train_data_path", None, namespace)
+            train_data_path = envs.get_global_env("train_data_path", None,
+                                                  namespace)
         else:
             inputs = self.model.get_infer_inputs()
             namespace = "evaluate.reader"
-            train_data_path = envs.get_global_env("test_data_path", None, namespace)
+            train_data_path = envs.get_global_env("test_data_path", None,
+                                                  namespace)

         threads = int(envs.get_runtime_environ("train.trainer.threads"))
         batch_size = envs.get_global_env("batch_size", None, namespace)
         reader_class = envs.get_global_env("class", None, namespace)
         abs_dir = os.path.dirname(os.path.abspath(__file__))
         reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
-        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, self._config_yaml)
+        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
+                                               self._config_yaml)

         if train_data_path.startswith("paddlerec::"):
             package_base = envs.get_runtime_environ("PACKAGE_BASE")
             assert package_base is not None
-            train_data_path = os.path.join(package_base, train_data_path.split("::")[1])
+            train_data_path = os.path.join(package_base,
+                                           train_data_path.split("::")[1])

         dataset = fluid.DatasetFactory().create_dataset()
         dataset.set_use_var(inputs)
@@ -166,14 +168,16 @@ class OnlineLearningTrainer(TranspileTrainer):
             ins = self._get_dataset_ins()

             begin_time = time.time()
-            self._exe.train_from_dataset(program=fluid.default_main_program(),
-                                         dataset=dataset,
-                                         fetch_list=self.fetch_vars,
-                                         fetch_info=self.fetch_alias,
-                                         print_period=self.fetch_period)
+            self._exe.train_from_dataset(
+                program=fluid.default_main_program(),
+                dataset=dataset,
+                fetch_list=self.fetch_vars,
+                fetch_info=self.fetch_alias,
+                print_period=self.fetch_period)
             end_time = time.time()
             times = end_time - begin_time
-            print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
+            print("epoch {} using time {}, speed {:.2f} lines/s".format(
+                i, times, ins / times))
             self.save(i, "train", is_fleet=True)

         fleet.stop_worker()
```
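The `paddlerec::` prefix handling in `_get_dataset` resolves package-relative data paths against the `PACKAGE_BASE` runtime environ. Restated as a standalone helper (a sketch; the real code reads `envs.get_runtime_environ("PACKAGE_BASE")`, and the example path is hypothetical):

```python
import os

def resolve_data_path(train_data_path, package_base):
    """Expand 'paddlerec::<rel_path>' against the installed package root."""
    if train_data_path.startswith("paddlerec::"):
        assert package_base is not None
        return os.path.join(package_base, train_data_path.split("::")[1])
    return train_data_path

print(resolve_data_path("paddlerec::models/rank/dnn/data", "/opt/paddlerec"))
# -> /opt/paddlerec/models/rank/dnn/data
```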
core/trainers/single_trainer.py (+15 −15)

```diff
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 Training use fluid with one node only.
 """
@@ -36,8 +35,9 @@ class SingleTrainer(TranspileTrainer):
         self.regist_context_processor('uninit', self.instance)
         self.regist_context_processor('init_pass', self.init)
         self.regist_context_processor('startup_pass', self.startup)
-        if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
+        if envs.get_platform() == "LINUX" and envs.get_global_env(
+                "dataset_class", None, "train.reader") != "DataLoader":
             self.regist_context_processor('train_pass', self.dataset_train)
         else:
             self.regist_context_processor('train_pass', self.dataloader_train)
@@ -73,9 +73,8 @@ class SingleTrainer(TranspileTrainer):
         reader = self._get_dataloader("TRAIN")
         epochs = envs.get_global_env("train.epochs")

-        program = fluid.compiler.CompiledProgram(
-            fluid.default_main_program()).with_data_parallel(
-                loss_name=self.model.get_avg_cost().name)
+        program = fluid.compiler.CompiledProgram(fluid.default_main_program(
+        )).with_data_parallel(loss_name=self.model.get_avg_cost().name)

         metrics_varnames = []
         metrics_format = []
@@ -94,9 +93,8 @@ class SingleTrainer(TranspileTrainer):
         batch_id = 0
         try:
             while True:
-                metrics_rets = self._exe.run(
-                    program=program,
-                    fetch_list=metrics_varnames)
+                metrics_rets = self._exe.run(program=program,
+                                             fetch_list=metrics_varnames)
                 metrics = [epoch, batch_id]
                 metrics.extend(metrics_rets)
@@ -117,14 +115,16 @@ class SingleTrainer(TranspileTrainer):
         epochs = envs.get_global_env("train.epochs")
         for i in range(epochs):
             begin_time = time.time()
-            self._exe.train_from_dataset(program=fluid.default_main_program(),
-                                         dataset=dataset,
-                                         fetch_list=self.fetch_vars,
-                                         fetch_info=self.fetch_alias,
-                                         print_period=self.fetch_period)
+            self._exe.train_from_dataset(
+                program=fluid.default_main_program(),
+                dataset=dataset,
+                fetch_list=self.fetch_vars,
+                fetch_info=self.fetch_alias,
+                print_period=self.fetch_period)
             end_time = time.time()
             times = end_time - begin_time
-            print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
+            print("epoch {} using time {}, speed {:.2f} lines/s".format(
+                i, times, ins / times))
             self.save(i, "train", is_fleet=False)

         context['status'] = 'infer_pass'
```
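The rewrapped epoch log line computes throughput as total examples over wall-clock seconds: `ins` is the instance count of the dataset, so `ins / times` is lines per second. The arithmetic, isolated (sketch only; `train_fn` stands in for `train_from_dataset`):

```python
import time

def timed_epoch(epoch, train_fn, ins):
    begin_time = time.time()
    train_fn()
    times = time.time() - begin_time
    print("epoch {} using time {}, speed {:.2f} lines/s".format(
        epoch, times, ins / times))

timed_epoch(0, lambda: time.sleep(0.1), ins=1000)  # ~10000 lines/s
```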
core/trainers/tdm_cluster_trainer.py (+21 −19)

```diff
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 Training use fluid with one node only.
 """
@@ -36,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"]
 class TDMClusterTrainer(ClusterTrainer):
     def server(self, context):
         namespace = "train.startup"
-        init_model_path = envs.get_global_env("cluster.init_model_path", "", namespace)
+        init_model_path = envs.get_global_env("cluster.init_model_path", "",
+                                              namespace)
         assert init_model_path != "", "Cluster train must has init_model for TDM"
         fleet.init_server(init_model_path)
         logger.info("TDM: load model from {}".format(init_model_path))
@@ -48,24 +47,27 @@ class TDMClusterTrainer(ClusterTrainer):
         self._exe.run(fleet.startup_program)

         namespace = "train.startup"
-        load_tree = envs.get_global_env("tree.load_tree", True, namespace)
-        self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "", namespace)
-        self.tree_travel_path = envs.get_global_env("tree.tree_travel_path", "", namespace)
-        self.tree_info_path = envs.get_global_env("tree.tree_info_path", "", namespace)
-        save_init_model = envs.get_global_env("cluster.save_init_model", False, namespace)
-        init_model_path = envs.get_global_env("cluster.init_model_path", "", namespace)
+        load_tree = envs.get_global_env("tree.load_tree", True, namespace)
+        self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
+                                                   namespace)
+        self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
+                                                    "", namespace)
+        self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
+                                                  namespace)
+        save_init_model = envs.get_global_env("cluster.save_init_model", False,
+                                              namespace)
+        init_model_path = envs.get_global_env("cluster.init_model_path", "",
+                                              namespace)

         if load_tree:
             # covert tree to tensor, set it into Fluid's variable.
             for param_name in special_param:
-                param_t = fluid.global_scope().find_var(param_name).get_tensor()
+                param_t = fluid.global_scope().find_var(param_name).get_tensor(
+                )
                 param_array = self._tdm_prepare(param_name)
                 param_t.set(param_array.astype('int32'), self._place)
@@ -93,8 +95,8 @@ class TDMClusterTrainer(ClusterTrainer):
     def _tdm_travel_prepare(self):
         """load tdm tree param from npy/list file"""
         travel_array = np.load(self.tree_travel_path)
-        logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[0]))
+        logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
+            0]))
         return travel_array

     def _tdm_layer_prepare(self):
```
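The tree parameters that `_tdm_prepare` feeds into `param_t.set(...)` are plain `.npy` arrays cast to `int32` before being written into the Fluid tensor. A numpy-only sketch of the loading side, with a hypothetical file path:

```python
# Minimal stand-in for _tdm_travel_prepare: the travel table is stored as a
# .npy file whose first dimension is the leaf-node count, and the trainer
# casts it to int32 before param_t.set(...).
import numpy as np

tree_travel_path = "tree_travel.npy"                 # hypothetical path
np.save(tree_travel_path, np.arange(12).reshape(4, 3))

travel_array = np.load(tree_travel_path)
print("TDM Tree leaf node nums: {}".format(travel_array.shape[0]))  # 4
param_ready = travel_array.astype('int32')           # dtype the tensor expects
```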
core/trainers/tdm_single_trainer.py (+30 −25)

```diff
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 Training use fluid with one node only.
 """
@@ -27,33 +26,38 @@ from paddlerec.core.utils import envs
 logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)
-special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb"]
+special_param = [
+    "TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb"
+]

 class TDMSingleTrainer(SingleTrainer):
     def startup(self, context):
         namespace = "train.startup"
-        load_persistables = envs.get_global_env("single.load_persistables", False, namespace)
+        load_persistables = envs.get_global_env("single.load_persistables",
+                                                False, namespace)
         persistables_model_path = envs.get_global_env(
             "single.persistables_model_path", "", namespace)

-        load_tree = envs.get_global_env("tree.load_tree", False, namespace)
-        self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "", namespace)
-        self.tree_travel_path = envs.get_global_env("tree.tree_travel_path", "", namespace)
-        self.tree_info_path = envs.get_global_env("tree.tree_info_path", "", namespace)
-        self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "", namespace)
-        save_init_model = envs.get_global_env("single.save_init_model", False, namespace)
-        init_model_path = envs.get_global_env("single.init_model_path", "", namespace)
+        load_tree = envs.get_global_env("tree.load_tree", False, namespace)
+        self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
+                                                   namespace)
+        self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
+                                                    "", namespace)
+        self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
+                                                  namespace)
+        self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "",
+                                                 namespace)
+        save_init_model = envs.get_global_env("single.save_init_model", False,
+                                              namespace)
+        init_model_path = envs.get_global_env("single.init_model_path", "",
+                                              namespace)

         self._exe.run(fluid.default_startup_program())
         if load_persistables:
@@ -68,7 +72,8 @@ class TDMSingleTrainer(SingleTrainer):
         if load_tree:
             # covert tree to tensor, set it into Fluid's variable.
             for param_name in special_param:
-                param_t = fluid.global_scope().find_var(param_name).get_tensor()
+                param_t = fluid.global_scope().find_var(param_name).get_tensor(
+                )
                 param_array = self._tdm_prepare(param_name)
                 if param_name == 'TDM_Tree_Emb':
                     param_t.set(param_array.astype('float32'), self._place)
@@ -102,15 +107,15 @@ class TDMSingleTrainer(SingleTrainer):
     def _tdm_travel_prepare(self):
         """load tdm tree param from npy/list file"""
         travel_array = np.load(self.tree_travel_path)
-        logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[0]))
+        logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
+            0]))
         return travel_array

     def _tdm_emb_prepare(self):
         """load tdm tree param from npy/list file"""
         emb_array = np.load(self.tree_emb_path)
-        logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[0]))
+        logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[
+            0]))
         return emb_array

     def _tdm_layer_prepare(self):
```
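The startup hunk makes the dtype rule explicit: of the four `special_param` tensors, only the embedding table is floating point; the travel/layer/info tables are integer indices. The rule, isolated as a tiny helper (sketch only):

```python
# dtype selection as applied in TDMSingleTrainer.startup: TDM_Tree_Emb is
# float32, every other tree parameter is cast to int32.
def tdm_param_dtype(param_name):
    return 'float32' if param_name == 'TDM_Tree_Emb' else 'int32'

for name in ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info",
             "TDM_Tree_Emb"]:
    print(name, tdm_param_dtype(name))
```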
core/trainers/transpiler_trainer.py (+39 −34)

```diff
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 Training use fluid with DistributeTranspiler
 """
@@ -39,9 +38,12 @@ class TranspileTrainer(Trainer):
         self.increment_models = []

     def processor_register(self):
-        print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first")
+        print(
+            "Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first"
+        )

     def _get_dataloader(self, state="TRAIN"):
         if state == "TRAIN":
             dataloader = self.model._data_loader
             namespace = "train.reader"
@@ -59,12 +61,14 @@ class TranspileTrainer(Trainer):
         if sparse_slots is None and dense_slots is None:
             reader_class = envs.get_global_env("class", None, namespace)
-            reader = dataloader_instance.dataloader(reader_class, state, self._config_yaml)
-            reader_class = envs.lazy_instance_by_fliename(reader_class, class_name)
+            reader = dataloader_instance.dataloader(reader_class, state,
+                                                    self._config_yaml)
+            reader_class = envs.lazy_instance_by_fliename(reader_class,
+                                                          class_name)
             reader_ins = reader_class(self._config_yaml)
         else:
-            reader = dataloader_instance.slotdataloader("", state, self._config_yaml)
+            reader = dataloader_instance.slotdataloader("", state,
+                                                        self._config_yaml)
             reader_ins = SlotReader(self._config_yaml)

         if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
@@ -94,13 +98,13 @@ class TranspileTrainer(Trainer):
         if state == "TRAIN":
             inputs = self.model.get_inputs()
             namespace = "train.reader"
-            train_data_path = envs.get_global_env("train_data_path", None, namespace)
+            train_data_path = envs.get_global_env("train_data_path", None,
+                                                  namespace)
         else:
             inputs = self.model.get_infer_inputs()
             namespace = "evaluate.reader"
-            train_data_path = envs.get_global_env("test_data_path", None, namespace)
+            train_data_path = envs.get_global_env("test_data_path", None,
+                                                  namespace)

         sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
         dense_slots = envs.get_global_env("dense_slots", None, namespace)
@@ -112,8 +116,8 @@ class TranspileTrainer(Trainer):
         reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
         if sparse_slots is None and dense_slots is None:
-            pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, self._config_yaml)
+            pipe_cmd = "python {} {} {} {}".format(reader, reader_class,
+                                                   state, self._config_yaml)
         else:
             padding = envs.get_global_env("padding", 0, namespace)
             pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
@@ -123,8 +127,8 @@ class TranspileTrainer(Trainer):
         if train_data_path.startswith("paddlerec::"):
             package_base = envs.get_runtime_environ("PACKAGE_BASE")
             assert package_base is not None
-            train_data_path = os.path.join(package_base, train_data_path.split("::")[1])
+            train_data_path = os.path.join(package_base,
+                                           train_data_path.split("::")[1])

         dataset = fluid.DatasetFactory().create_dataset()
         dataset.set_use_var(inputs)
@@ -140,11 +144,11 @@ class TranspileTrainer(Trainer):
         debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
         if debug_mode:
-            print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(file_list[0]))
+            print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".
+                  format(file_list[0]))
             os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
-            print("--- Dataset Debug Mode End , show pre 10 data of {}---".format(file_list[0]))
+            print("--- Dataset Debug Mode End , show pre 10 data of {}---".
+                  format(file_list[0]))
             exit(0)
         return dataset
@@ -166,27 +170,29 @@ class TranspileTrainer(Trainer):
         if not need_save(epoch_id, save_interval, False):
             return

-        feed_varnames = envs.get_global_env("save.inference.feed_varnames", None, namespace)
+        feed_varnames = envs.get_global_env("save.inference.feed_varnames",
+                                            None, namespace)
         fetch_varnames = envs.get_global_env(
             "save.inference.fetch_varnames", None, namespace)
         if feed_varnames is None or fetch_varnames is None:
             return

-        fetch_vars = [fluid.default_main_program().global_block().vars[varname] for varname in fetch_varnames]
-        dirname = envs.get_global_env("save.inference.dirname", None, namespace)
+        fetch_vars = [
+            fluid.default_main_program().global_block().vars[varname]
+            for varname in fetch_varnames
+        ]
+        dirname = envs.get_global_env("save.inference.dirname", None,
+                                      namespace)

         assert dirname is not None
         dirname = os.path.join(dirname, str(epoch_id))

         if is_fleet:
-            fleet.save_inference_model(self._exe, dirname, feed_varnames, fetch_vars)
+            fleet.save_inference_model(self._exe, dirname, feed_varnames,
+                                       fetch_vars)
         else:
-            fluid.io.save_inference_model(dirname, feed_varnames, fetch_vars, self._exe)
+            fluid.io.save_inference_model(dirname, feed_varnames, fetch_vars,
+                                          self._exe)
         self.inference_models.append((epoch_id, dirname))

     def save_persistables():
@@ -196,8 +202,8 @@ class TranspileTrainer(Trainer):
         if not need_save(epoch_id, save_interval, False):
             return

-        dirname = envs.get_global_env("save.increment.dirname", None, namespace)
+        dirname = envs.get_global_env("save.increment.dirname", None,
+                                      namespace)
         assert dirname is not None
         dirname = os.path.join(dirname, str(epoch_id))
@@ -275,10 +281,9 @@ class TranspileTrainer(Trainer):
         batch_id = 0
         try:
             while True:
-                metrics_rets = self._exe.run(
-                    program=program,
-                    fetch_list=metrics_varnames,
-                    return_numpy=is_return_numpy)
+                metrics_rets = self._exe.run(program=program,
+                                             fetch_list=metrics_varnames,
+                                             return_numpy=is_return_numpy)
                 metrics = [epoch, batch_id]
                 metrics.extend(metrics_rets)
```
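The `pipe_cmd` built in `_get_dataset` is how the Dataset path converts raw shards: each data file is streamed through `dataset_instance.py`, which applies the configured reader. The debug branch above is equivalent to running the pipe by hand on the first shard; a sketch with hypothetical shard and reader names:

```python
import os

file_list = ["part-00000"]  # hypothetical shard name
pipe_cmd = "python dataset_instance.py MyReader TRAIN config.yaml"
cmd = "cat {} | {} | head -10".format(file_list[0], pipe_cmd)
print(cmd)  # the debug branch passes exactly this string to os.system(...)
```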
core/utils/dataset_holder.py (+45 −20)

```diff
@@ -24,7 +24,7 @@ from paddlerec.core.utils import util as util
 class DatasetHolder(object):
     """
-    Dataset Base
+    Dataset Holder
     """
     __metaclass__ = abc.ABCMeta
@@ -74,11 +74,17 @@ class TimeSplitDatasetHolder(DatasetHolder):
         Dataset.__init__(self, config)
         if 'data_donefile' not in config or config['data_donefile'] is None:
             config['data_donefile'] = config['data_path'] + "/to.hadoop.done"
-        self._path_generator = util.PathGenerator({'templates': [
-            {'name': 'data_path', 'template': config['data_path']},
-            {'name': 'donefile_path', 'template': config['data_donefile']}]})
-        self._split_interval = config['split_interval']  # data split N mins per dir
+        self._path_generator = util.PathGenerator({
+            'templates': [{
+                'name': 'data_path',
+                'template': config['data_path']
+            }, {
+                'name': 'donefile_path',
+                'template': config['data_donefile']
+            }]
+        })
+        self._split_interval = config[
+            'split_interval']  # data split N mins per dir
         self._data_file_handler = fs.FileHandler(config)

     def _format_data_time(self, daytime_str, time_window_mins):
@@ -91,7 +97,8 @@ class TimeSplitDatasetHolder(DatasetHolder):
             return None, 0
         if mins_of_day % self._split_interval != 0:
-            skip_mins = self._split_interval - (mins_of_day % self._split_interval)
+            skip_mins = self._split_interval - (mins_of_day %
+                                                self._split_interval)
             data_time = data_time + datetime.timedelta(minutes=skip_mins)
             time_window_mins = time_window_mins - skip_mins
         return data_time, time_window_mins
@@ -106,17 +113,24 @@ class TimeSplitDatasetHolder(DatasetHolder):
             True/False
         """
         is_ready = True
-        data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
+        data_time, windows_mins = self._format_data_time(daytime_str,
+                                                         time_window_mins)
         while time_window_mins > 0:
-            file_path = self._path_generator.generate_path('donefile_path', {'time_format': data_time})
+            file_path = self._path_generator.generate_path(
+                'donefile_path', {'time_format': data_time})
            if not self._data_file_handler.is_exist(file_path):
                 is_ready = False
                 break
             time_window_mins = time_window_mins - self._split_interval
-            data_time = data_time + datetime.timedelta(minutes=self._split_interval)
+            data_time = data_time + datetime.timedelta(
+                minutes=self._split_interval)
         return is_ready

-    def get_file_list(self, daytime_str, time_window_mins, node_num=1, node_idx=0):
+    def get_file_list(self,
+                      daytime_str,
+                      time_window_mins,
+                      node_num=1,
+                      node_idx=0):
         """
         data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx]
         Args:
@@ -128,26 +142,32 @@ class TimeSplitDatasetHolder(DatasetHolder):
             list, data_shard[node_idx]
         """
         data_file_list = []
-        data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
+        data_time, windows_mins = self._format_data_time(daytime_str,
+                                                         time_window_mins)
         while time_window_mins > 0:
-            file_path = self._path_generator.generate_path('data_path', {'time_format': data_time})
+            file_path = self._path_generator.generate_path(
+                'data_path', {'time_format': data_time})
             sub_file_list = self._data_file_handler.ls(file_path)
             for sub_file in sub_file_list:
                 sub_file_name = self._data_file_handler.get_file_name(sub_file)
-                if not sub_file_name.startswith(self._config['filename_prefix']):
+                if not sub_file_name.startswith(self._config[
+                        'filename_prefix']):
                     continue
                 if hash(sub_file_name) % node_num == node_idx:
                     data_file_list.append(sub_file)
             time_window_mins = time_window_mins - self._split_interval
-            data_time = data_time + datetime.timedelta(minutes=self._split_interval)
+            data_time = data_time + datetime.timedelta(
+                minutes=self._split_interval)
         return data_file_list

     def _alloc_dataset(self, file_list):
         """ """
-        dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type'])
+        dataset = fluid.DatasetFactory().create_dataset(self._config[
+            'dataset_type'])
         dataset.set_batch_size(self._config['batch_size'])
         dataset.set_thread(self._config['load_thread'])
-        dataset.set_hdfs_config(self._config['fs_name'], self._config['fs_ugi'])
+        dataset.set_hdfs_config(self._config['fs_name'],
+                                self._config['fs_ugi'])
         dataset.set_pipe_command(self._config['data_converter'])
         dataset.set_filelist(file_list)
         dataset.set_use_var(self._config['data_vars'])
@@ -163,7 +183,9 @@ class TimeSplitDatasetHolder(DatasetHolder):
             while self.check_ready(begin_time, windown_min) == False:
                 print("dataset not ready, time:" + begin_time)
                 time.sleep(30)
-            file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
+            file_list = self.get_file_list(begin_time, windown_min,
+                                           params['node_num'],
+                                           params['node_idx'])
             self._datasets[begin_time] = self._alloc_dataset(file_list)
             self._datasets[begin_time].load_into_memory()
         else:
@@ -176,9 +198,12 @@ class TimeSplitDatasetHolder(DatasetHolder):
         windown_min = params['time_window_min']
         if begin_time not in self._datasets:
             if self.check_ready(begin_time, windown_min):
-                file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
+                file_list = self.get_file_list(begin_time, windown_min,
+                                               params['node_num'],
+                                               params['node_idx'])
                 self._datasets[begin_time] = self._alloc_dataset(file_list)
-                self._datasets[begin_time].preload_into_memory(self._config['preload_thread'])
+                self._datasets[begin_time].preload_into_memory(self._config[
+                    'preload_thread'])
                 return True
         return False
```
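Two small pieces of logic in this file are worth restating standalone. `_format_data_time` aligns a requested start time to the N-minute split boundary and shrinks the window by the minutes skipped; `get_file_list` then shards the resulting files across nodes via `hash(name) % node_num == node_idx`. A sketch of the alignment step, assuming `mins_of_day` is derived as hour*60+minute from the parsed time:

```python
import datetime

def align_to_split(data_time, time_window_mins, split_interval):
    """Skip forward to the next split boundary; shrink the window to match."""
    mins_of_day = data_time.hour * 60 + data_time.minute
    if mins_of_day % split_interval != 0:
        skip_mins = split_interval - (mins_of_day % split_interval)
        data_time = data_time + datetime.timedelta(minutes=skip_mins)
        time_window_mins = time_window_mins - skip_mins
    return data_time, time_window_mins

t = datetime.datetime(2020, 5, 20, 10, 7)
print(align_to_split(t, 60, 5))  # -> (2020-05-20 10:10, 57)
```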
core/utils/dataset_instance.py (+3 −2)

```diff
@@ -17,10 +17,11 @@ import sys
 from paddlerec.core.utils.envs import lazy_instance_by_fliename
 from paddlerec.core.reader import SlotReader
 from paddlerec.core.utils import envs

 if len(sys.argv) < 4:
-    raise ValueError("reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path")
+    raise ValueError(
+        "reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path"
+    )

 reader_package = sys.argv[1]
```
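The guard above defines the script's CLI contract: three positional arguments after the script name (`reader_class`, `train/evaluate/slotreader`, `yaml_abs_path`), matching the `pipe_cmd` strings built in the trainers. From the caller's side (a sketch; the reader class and config path are hypothetical):

```python
# python dataset_instance.py <reader_class> <TRAIN|EVALUATE|slotreader> <yaml_abs_path>
import subprocess

subprocess.run([
    "python", "core/utils/dataset_instance.py",
    "my_reader.Reader", "TRAIN", "/abs/path/config.yaml"
], check=False)
```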
core/utils/envs.py (+6 −7)

```diff
@@ -95,7 +95,7 @@ def path_adapter(path):
         l_p = path.split("paddlerec.")[1].replace(".", "/")
         return os.path.join(package, l_p)
     else:
         return path
@@ -159,8 +159,8 @@ def pretty_print_envs(envs, header=None):

 def lazy_instance_by_package(package, class_name):
     models = get_global_env("train.model.models")
-    model_package = __import__(package, globals(), locals(), package.split("."))
+    model_package = __import__(package,
+                               globals(), locals(), package.split("."))
     instance = getattr(model_package, class_name)
     return instance
@@ -170,8 +170,8 @@ def lazy_instance_by_fliename(abs, class_name):
     sys.path.append(dirname)
     package = os.path.splitext(os.path.basename(abs))[0]
-    model_package = __import__(package, globals(), locals(), package.split("."))
+    model_package = __import__(package,
+                               globals(), locals(), package.split("."))
     instance = getattr(model_package, class_name)
     return instance
@@ -189,8 +189,7 @@ def get_platform():

 def find_free_port():
     def __free_port():
-        with closing(socket.socket(socket.AF_INET,
-                                   socket.SOCK_STREAM)) as s:
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
             s.bind(('', 0))
             return s.getsockname()[1]
```
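`lazy_instance_by_fliename` (the misspelling is the repo's own identifier) is the import-by-file-path mechanism that `user_define_engine` and `_get_dataloader` rely on. Its core reduces to a few lines; a standalone sketch with the spelling corrected for the illustration:

```python
import os
import sys

def lazy_instance_by_filename(abs_path, class_name):
    """Import a module by absolute file path and return a class from it."""
    dirname = os.path.dirname(abs_path)
    sys.path.append(dirname)
    package = os.path.splitext(os.path.basename(abs_path))[0]
    module = __import__(package, globals(), locals(), package.split("."))
    return getattr(module, class_name)

# e.g. TrainerClass = lazy_instance_by_filename("/path/to/my_trainer.py",
#                                               "UserDefineTraining")
```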
core/utils/util.py (+34 −17)

```diff
@@ -22,6 +22,7 @@ from paddlerec.core.utils import fs as fs

 def save_program_proto(path, program=None):
     if program is None:
         _program = fluid.default_main_program()
     else:
@@ -175,7 +176,8 @@ class PathGenerator(object):
         """
         if template_name in self._templates:
             if 'time_format' in param:
-                str = param['time_format'].strftime(self._templates[template_name])
+                str = param['time_format'].strftime(self._templates[
+                    template_name])
                 return str.format(**param)
             return self._templates[template_name].format(**param)
         else:
@@ -198,31 +200,39 @@ class TimeTrainPass(object):
             self._begin_day = make_datetime(day_fields[0].strip())
             if len(day_fields) == 1 or len(day_fields[1]) == 0:
                 # 100 years, meaning to continuous running
-                self._end_day = self._begin_day + datetime.timedelta(days=36500)
+                self._end_day = self._begin_day + datetime.timedelta(
+                    days=36500)
             else:
                 # example: 2020212+10
                 run_day = int(day_fields[1].strip())
-                self._end_day = self._begin_day + datetime.timedelta(days=run_day)
+                self._end_day = self._begin_day + datetime.timedelta(
+                    days=run_day)
         else:
             # example: {20191001..20191031}
-            days = os.popen("echo -n " + self._config['days']).read().split(" ")
+            days = os.popen("echo -n " + self._config['days']).read().split(
+                " ")
             self._begin_day = make_datetime(days[0])
             self._end_day = make_datetime(days[len(days) - 1])
         self._checkpoint_interval = self._config['checkpoint_interval']
         self._dump_inference_interval = self._config['dump_inference_interval']
-        self._interval_per_pass = self._config['train_time_interval']  # train N min data per pass
+        self._interval_per_pass = self._config[
+            'train_time_interval']  # train N min data per pass

         self._pass_id = 0
         self._inference_pass_id = 0
         self._pass_donefile_handler = None
         if 'pass_donefile_name' in self._config:
-            self._train_pass_donefile = global_config['output_path'] + '/' + self._config['pass_donefile_name']
+            self._train_pass_donefile = global_config[
+                'output_path'] + '/' + self._config['pass_donefile_name']
             if fs.is_afs_path(self._train_pass_donefile):
-                self._pass_donefile_handler = fs.FileHandler(global_config['io']['afs'])
+                self._pass_donefile_handler = fs.FileHandler(global_config[
+                    'io']['afs'])
             else:
-                self._pass_donefile_handler = fs.FileHandler(global_config['io']['local_fs'])
+                self._pass_donefile_handler = fs.FileHandler(global_config[
+                    'io']['local_fs'])

-            last_done = self._pass_donefile_handler.cat(self._train_pass_donefile).strip().split('\n')[-1]
+            last_done = self._pass_donefile_handler.cat(
+                self._train_pass_donefile).strip().split('\n')[-1]
             done_fileds = last_done.split('\t')
             if len(done_fileds) > 4:
                 self._base_key = done_fileds[1]
@@ -236,15 +246,18 @@ class TimeTrainPass(object):
         """
         return 24 * 60 / self._interval_per_pass

-    def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint):
+    def save_train_progress(self, day, pass_id, base_key, model_path,
+                            is_checkpoint):
         """R
         """
         if is_checkpoint:
             self._checkpoint_pass_id = pass_id
             self._checkpoint_model_path = model_path
-        done_content = "%s\t%s\t%s\t%s\t%d\n" % (day, base_key, self._checkpoint_model_path, self._checkpoint_pass_id, pass_id)
-        self._pass_donefile_handler.write(done_content, self._train_pass_donefile, 'a')
+        done_content = "%s\t%s\t%s\t%s\t%d\n" % (
+            day, base_key, self._checkpoint_model_path,
+            self._checkpoint_pass_id, pass_id)
+        self._pass_donefile_handler.write(done_content,
+                                          self._train_pass_donefile, 'a')
         pass

     def init_pass_by_id(self, date_str, pass_id):
@@ -286,12 +299,14 @@ class TimeTrainPass(object):
         if self._pass_id < 1:
             self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M"))
         else:
-            next_time = self._current_train_time + datetime.timedelta(minutes=self._interval_per_pass)
+            next_time = self._current_train_time + datetime.timedelta(
+                minutes=self._interval_per_pass)
             if (next_time - self._end_day).total_seconds() > 0:
                 has_next = False
             else:
                 self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M"))
-        if has_next and (self._inference_pass_id < self._pass_id or self._pass_id < old_pass_id):
+        if has_next and (self._inference_pass_id < self._pass_id or
+                         self._pass_id < old_pass_id):
             self._inference_pass_id = self._pass_id - 1
         return has_next
@@ -319,9 +334,11 @@ class TimeTrainPass(object):
         Return:
             date(current_train_time + delta_day)
         """
-        return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d")
+        return (self._current_train_time + datetime.timedelta(
+            days=delta_day)).strftime("%Y%m%d")

     def timestamp(self, delta_day=0):
         """R
         """
-        return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp()
+        return (self._current_train_time + datetime.timedelta(
+            days=delta_day)).timestamp()
```
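The `PathGenerator.generate_path` hunk shows a two-stage template expansion that `dataset_holder.py` depends on: when a `time_format` datetime is supplied, the template is first run through `strftime`, then the remaining `{placeholders}` are filled with `str.format`. A standalone sketch with a hypothetical template:

```python
import datetime

templates = {'data_path': '/data/%Y%m%d/%H%M/{node}'}  # hypothetical template

def generate_path(template_name, param):
    path = templates[template_name]
    if 'time_format' in param:
        path = param['time_format'].strftime(path)  # expand %Y%m%d/%H%M first
    return path.format(**param)                     # then the {node} slot

print(generate_path('data_path',
                    {'time_format': datetime.datetime(2020, 5, 20, 10, 30),
                     'node': '0'}))
# -> /data/20200520/1030/0
```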