Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
f8d55c9f
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f8d55c9f
编写于
4月 14, 2020
作者:
T
tangwei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
code clean
上级
357f0da7
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
45 addition
and
252 deletion
+45
-252
fleetrec/models/base.py
fleetrec/models/base.py
+5
-5
fleetrec/models/ctr_dnn/dataloader.py
fleetrec/models/ctr_dnn/dataloader.py
+0
-83
fleetrec/models/ctr_dnn/dataset.py
fleetrec/models/ctr_dnn/dataset.py
+0
-69
fleetrec/models/ctr_dnn/model.py
fleetrec/models/ctr_dnn/model.py
+10
-25
fleetrec/trainer/cluster_trainer.py
fleetrec/trainer/cluster_trainer.py
+2
-6
fleetrec/trainer/factory.py
fleetrec/trainer/factory.py
+6
-45
fleetrec/trainer/single_trainer.py
fleetrec/trainer/single_trainer.py
+3
-10
fleetrec/trainer/trainer.py
fleetrec/trainer/trainer.py
+0
-3
fleetrec/trainer/transpiler_trainer.py
fleetrec/trainer/transpiler_trainer.py
+7
-5
fleetrec/utils/util.py
fleetrec/utils/util.py
+12
-1
未找到文件。
fleetrec/models/base.py
浏览文件 @
f8d55c9f
...
...
@@ -63,7 +63,7 @@ def create(config):
model
=
None
if
config
[
'mode'
]
==
'fluid'
:
model
=
YamlModel
(
config
)
model
.
build_model
()
model
.
net
()
return
model
...
...
@@ -94,13 +94,13 @@ class Model(object):
return
self
.
_fetch_interval
@
abc
.
abstractmethod
def
shrink
(
self
,
params
):
def
net
(
self
):
"""R
"""
pass
@
abc
.
abstractmethod
def
build_model
(
self
):
def
shrink
(
self
,
params
):
"""R
"""
pass
...
...
@@ -140,7 +140,7 @@ class YamlModel(Model):
self
.
_build_param
=
{
'layer'
:
{},
'inner_layer'
:
{},
'layer_extend'
:
{},
'model'
:
{}}
self
.
_inference_meta
=
{
'dependency'
:
{},
'params'
:
{}}
def
build_model
(
self
):
def
net
(
self
):
"""R
build a fluid model with config
Return:
...
...
@@ -287,4 +287,4 @@ class YamlModel(Model):
dependency_list
=
copy
.
deepcopy
(
dependencys
)
for
dependency
in
dependencys
:
dependency_list
=
dependency_list
+
self
.
get_dependency
(
layer_graph
,
dependency
)
return
list
(
set
(
dependency_list
))
\ No newline at end of file
return
list
(
set
(
dependency_list
))
fleetrec/models/ctr_dnn/dataloader.py
已删除
100644 → 0
浏览文件 @
357f0da7
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
...utils
import
envs
# There are 13 integer features and 26 categorical features
continous_features
=
range
(
1
,
14
)
categorial_features
=
range
(
14
,
40
)
continous_clip
=
[
20
,
600
,
100
,
50
,
64000
,
500
,
100
,
50
,
500
,
10
,
10
,
10
,
50
]
class
CriteoDataset
(
object
):
def
__init__
(
self
,
sparse_feature_dim
):
self
.
cont_min_
=
[
0
,
-
3
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
]
self
.
cont_max_
=
[
20
,
600
,
100
,
50
,
64000
,
500
,
100
,
50
,
500
,
10
,
10
,
10
,
50
]
self
.
cont_diff_
=
[
20
,
603
,
100
,
50
,
64000
,
500
,
100
,
50
,
500
,
10
,
10
,
10
,
50
]
self
.
hash_dim_
=
sparse_feature_dim
# here, training data are lines with line_index < train_idx_
self
.
train_idx_
=
41256555
self
.
continuous_range_
=
range
(
1
,
14
)
self
.
categorical_range_
=
range
(
14
,
40
)
def
_reader_creator
(
self
,
file_list
,
is_train
,
trainer_num
,
trainer_id
):
def
reader
():
for
file
in
file_list
:
with
open
(
file
,
'r'
)
as
f
:
line_idx
=
0
for
line
in
f
:
line_idx
+=
1
features
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
dense_feature
=
[]
sparse_feature
=
[]
for
idx
in
self
.
continuous_range_
:
if
features
[
idx
]
==
''
:
dense_feature
.
append
(
0.0
)
else
:
dense_feature
.
append
(
(
float
(
features
[
idx
])
-
self
.
cont_min_
[
idx
-
1
])
/
self
.
cont_diff_
[
idx
-
1
])
for
idx
in
self
.
categorical_range_
:
sparse_feature
.
append
([
hash
(
str
(
idx
)
+
features
[
idx
])
%
self
.
hash_dim_
])
label
=
[
int
(
features
[
0
])]
yield
[
dense_feature
]
+
sparse_feature
+
[
label
]
return
reader
def
train
(
self
,
file_list
,
trainer_num
,
trainer_id
):
return
self
.
_reader_creator
(
file_list
,
True
,
trainer_num
,
trainer_id
)
def
test
(
self
,
file_list
):
return
self
.
_reader_creator
(
file_list
,
False
,
1
,
0
)
def
Train
():
sparse_feature_number
=
envs
.
get_global_env
(
"sparse_feature_number"
)
train_generator
=
CriteoDataset
(
sparse_feature_number
)
return
train_generator
.
train
def
Evaluate
():
sparse_feature_number
=
envs
.
get_global_env
(
"sparse_feature_number"
)
train_generator
=
CriteoDataset
(
sparse_feature_number
)
return
train_generator
.
test
fleetrec/models/ctr_dnn/dataset.py
已删除
100644 → 0
浏览文件 @
357f0da7
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
sys
import
paddle.fluid.incubate.data_generator
as
dg
cont_min_
=
[
0
,
-
3
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
]
cont_max_
=
[
20
,
600
,
100
,
50
,
64000
,
500
,
100
,
50
,
500
,
10
,
10
,
10
,
50
]
cont_diff_
=
[
20
,
603
,
100
,
50
,
64000
,
500
,
100
,
50
,
500
,
10
,
10
,
10
,
50
]
hash_dim_
=
1000001
continuous_range_
=
range
(
1
,
14
)
categorical_range_
=
range
(
14
,
40
)
class
CriteoDataset
(
dg
.
MultiSlotDataGenerator
):
"""
DacDataset: inheritance MultiSlotDataGeneratior, Implement data reading
Help document: http://wiki.baidu.com/pages/viewpage.action?pageId=728820675
"""
def
generate_sample
(
self
,
line
):
"""
Read the data line by line and process it as a dictionary
"""
def
reader
():
"""
This function needs to be implemented by the user, based on data format
"""
features
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
dense_feature
=
[]
sparse_feature
=
[]
for
idx
in
continuous_range_
:
if
features
[
idx
]
==
""
:
dense_feature
.
append
(
0.0
)
else
:
dense_feature
.
append
(
(
float
(
features
[
idx
])
-
cont_min_
[
idx
-
1
])
/
cont_diff_
[
idx
-
1
])
for
idx
in
categorical_range_
:
sparse_feature
.
append
(
[
hash
(
str
(
idx
)
+
features
[
idx
])
%
hash_dim_
])
label
=
[
int
(
features
[
0
])]
process_line
=
dense_feature
,
sparse_feature
,
label
feature_name
=
[
"dense_input"
]
for
idx
in
categorical_range_
:
feature_name
.
append
(
"C"
+
str
(
idx
-
13
))
feature_name
.
append
(
"label"
)
yield
zip
(
feature_name
,
[
dense_feature
]
+
sparse_feature
+
[
label
])
return
reader
d
=
CriteoDataset
()
d
.
run_from_stdin
()
fleetrec/models/ctr_dnn/model.py
浏览文件 @
f8d55c9f
...
...
@@ -19,7 +19,7 @@ from fleetrec.utils import envs
from
fleetrec.models.base
import
Model
class
Train
(
Model
):
class
Train
Model
(
Model
):
def
__init__
(
self
,
config
):
Model
.
__init__
(
self
,
config
)
self
.
namespace
=
"train.model"
...
...
@@ -34,7 +34,7 @@ class Train(Model):
lod_level
=
1
,
dtype
=
"int64"
)
for
i
in
range
(
1
,
ids
)
]
return
sparse_input_ids
,
[
var
.
name
for
var
in
sparse_input_ids
]
return
sparse_input_ids
def
dense_input
():
dim
=
envs
.
get_global_env
(
"hyper_parameters.dense_input_dim"
,
None
,
self
.
namespace
)
...
...
@@ -42,23 +42,20 @@ class Train(Model):
dense_input_var
=
fluid
.
layers
.
data
(
name
=
"dense_input"
,
shape
=
[
dim
],
dtype
=
"float32"
)
return
dense_input_var
,
dense_input_var
.
name
return
dense_input_var
def
label_input
():
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
return
label
,
label
.
name
return
label
self
.
sparse_inputs
,
self
.
sparse_input_varnames
=
sparse_inputs
()
self
.
dense_input
,
self
.
dense_input_varname
=
dense_input
()
self
.
label_input
,
self
.
label_input_varname
=
label_input
()
self
.
sparse_inputs
=
sparse_inputs
()
self
.
dense_input
=
dense_input
()
self
.
label_input
=
label_input
()
def
input
_var
s
(
self
):
def
inputs
(
self
):
return
[
self
.
dense_input
]
+
self
.
sparse_inputs
+
[
self
.
label_input
]
def
input_varnames
(
self
):
return
[
input
.
name
for
input
in
self
.
input_vars
()]
def
build_model
(
self
):
def
net
(
self
):
def
embedding_layer
(
input
):
sparse_feature_number
=
envs
.
get_global_env
(
"hyper_parameters.sparse_feature_number"
,
None
,
self
.
namespace
)
sparse_feature_dim
=
envs
.
get_global_env
(
"hyper_parameters.sparse_feature_dim"
,
None
,
self
.
namespace
)
...
...
@@ -120,20 +117,8 @@ class Train(Model):
optimizer
=
fluid
.
optimizer
.
Adam
(
learning_rate
,
lazy_mode
=
True
)
return
optimizer
def
dump_model_program
(
self
,
path
):
pass
def
dump_inference_param
(
self
,
params
):
pass
def
dump_inference_program
(
self
,
inference_layer
,
path
):
pass
def
shrink
(
self
,
params
):
pass
class
Evaluate
(
object
):
class
Evaluate
Model
(
object
):
def
input
(
self
):
pass
...
...
fleetrec/trainer/cluster_trainer.py
浏览文件 @
f8d55c9f
...
...
@@ -32,11 +32,7 @@ logger = logging.getLogger("fluid")
logger
.
setLevel
(
logging
.
INFO
)
class
ClusterTrainerWithDataloader
(
TranspileTrainer
):
pass
class
ClusterTrainerWithDataset
(
TranspileTrainer
):
class
ClusterTrainer
(
TranspileTrainer
):
def
processor_register
(
self
):
role
=
PaddleCloudRoleMaker
()
fleet
.
init
(
role
)
...
...
@@ -71,7 +67,7 @@ class ClusterTrainerWithDataset(TranspileTrainer):
def
init
(
self
,
context
):
self
.
model
.
input
()
self
.
model
.
build_model
()
self
.
model
.
net
()
self
.
model
.
metrics
()
self
.
model
.
avg_loss
()
optimizer
=
self
.
model
.
optimizer
()
...
...
fleetrec/trainer/factory.py
浏览文件 @
f8d55c9f
...
...
@@ -10,46 +10,19 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
import
yaml
from
fleetrec.trainer.single_trainer
import
SingleTrainerWithDataloader
from
fleetrec.trainer.single_trainer
import
SingleTrainerWithDataset
from
fleetrec.trainer.cluster_trainer
import
ClusterTrainerWithDataloader
from
fleetrec.trainer.cluster_trainer
import
ClusterTrainerWithDataset
from
fleetrec.trainer.local_engine
import
Launch
from
fleetrec.trainer.single_trainer
import
SingleTrainer
from
fleetrec.trainer.cluster_trainer
import
ClusterTrainer
from
fleetrec.trainer.ctr_trainer
import
CtrPaddleTrainer
from
fleetrec.utils
import
envs
def
str2bool
(
v
):
if
isinstance
(
v
,
bool
):
return
v
if
v
.
lower
()
in
(
'yes'
,
'true'
,
't'
,
'y'
,
'1'
):
return
True
elif
v
.
lower
()
in
(
'no'
,
'false'
,
'f'
,
'n'
,
'0'
):
return
False
else
:
raise
ValueError
(
'Boolean value expected.'
)
from
fleetrec.utils
import
util
class
TrainerFactory
(
object
):
...
...
@@ -61,21 +34,10 @@ class TrainerFactory(object):
print
(
envs
.
pretty_print_envs
(
envs
.
get_global_envs
()))
train_mode
=
envs
.
get_global_env
(
"train.trainer"
)
reader_mode
=
envs
.
get_global_env
(
"train.reader.mode"
)
if
train_mode
==
"SingleTraining"
:
if
reader_mode
==
"dataset"
:
trainer
=
SingleTrainerWithDataset
()
elif
reader_mode
==
"dataloader"
:
trainer
=
SingleTrainerWithDataloader
()
else
:
raise
ValueError
(
"reader only support dataset/dataloader"
)
trainer
=
SingleTrainer
()
elif
train_mode
==
"ClusterTraining"
:
if
reader_mode
==
"dataset"
:
trainer
=
ClusterTrainerWithDataset
()
elif
reader_mode
==
"dataloader"
:
trainer
=
ClusterTrainerWithDataloader
()
else
:
raise
ValueError
(
"reader only support dataset/dataloader"
)
trainer
=
ClusterTrainer
()
elif
train_mode
==
"CtrTrainer"
:
trainer
=
CtrPaddleTrainer
(
config
)
else
:
...
...
@@ -108,7 +70,7 @@ class TrainerFactory(object):
envs
.
set_global_envs
(
_config
)
mode
=
envs
.
get_global_env
(
"train.trainer"
)
container
=
envs
.
get_global_env
(
"train.container"
)
instance
=
str2bool
(
os
.
getenv
(
"CLUSTER_INSTANCE"
,
"0"
))
instance
=
util
.
str2bool
(
os
.
getenv
(
"CLUSTER_INSTANCE"
,
"0"
))
if
mode
==
"ClusterTraining"
and
container
==
"local"
and
not
instance
:
trainer
=
TrainerFactory
.
_build_engine
(
config
)
...
...
@@ -124,4 +86,3 @@ if __name__ == "__main__":
raise
ValueError
(
"need a yaml file path argv"
)
trainer
=
TrainerFactory
.
create
(
sys
.
argv
[
1
])
trainer
.
run
()
fleetrec/trainer/single_trainer.py
浏览文件 @
f8d55c9f
...
...
@@ -17,25 +17,18 @@ Training use fluid with one node only.
"""
from
__future__
import
print_function
import
os
import
time
import
numpy
as
np
import
logging
import
paddle.fluid
as
fluid
from
.transpiler_trainer
import
TranspileTrainer
from
.
.utils
import
envs
from
fleetrec.trainer
.transpiler_trainer
import
TranspileTrainer
from
fleetrec
.utils
import
envs
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(message)s"
)
logger
=
logging
.
getLogger
(
"fluid"
)
logger
.
setLevel
(
logging
.
INFO
)
class
SingleTrainerWithDataloader
(
TranspileTrainer
):
pass
class
SingleTrainerWithDataset
(
TranspileTrainer
):
class
SingleTrainer
(
TranspileTrainer
):
def
processor_register
(
self
):
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'init_pass'
,
self
.
init
)
...
...
fleetrec/trainer/trainer.py
浏览文件 @
f8d55c9f
...
...
@@ -14,11 +14,8 @@
import
abc
import
time
import
yaml
from
paddle
import
fluid
from
..utils
import
envs
class
Trainer
(
object
):
"""R
...
...
fleetrec/trainer/transpiler_trainer.py
浏览文件 @
f8d55c9f
...
...
@@ -18,10 +18,9 @@ Training use fluid with DistributeTranspiler
import
os
import
paddle.fluid
as
fluid
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
fleet
from
fleetrec.trainer
import
Trainer
from
fleetrec.trainer
.trainer
import
Trainer
from
fleetrec.utils
import
envs
...
...
@@ -39,15 +38,18 @@ class TranspileTrainer(Trainer):
def
_get_dataset
(
self
):
namespace
=
"train.reader"
inputs
=
self
.
model
.
input
_var
s
()
inputs
=
self
.
model
.
inputs
()
threads
=
envs
.
get_global_env
(
"train.threads"
,
None
)
batch_size
=
envs
.
get_global_env
(
"batch_size"
,
None
,
namespace
)
pipe_command
=
envs
.
get_global_env
(
"pipe_command"
,
None
,
namespace
)
reader_class
=
envs
.
get_global_env
(
"class"
,
None
,
namespace
)
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
reader
=
os
.
path
.
join
(
abs_dir
,
'..'
,
'reader_implement.py'
)
pipe_cmd
=
"python {} {} {}"
.
format
(
reader
,
reader_class
,
"TRAIN"
)
train_data_path
=
envs
.
get_global_env
(
"train_data_path"
,
None
,
namespace
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
()
dataset
.
set_use_var
(
inputs
)
dataset
.
set_pipe_command
(
pipe_c
omman
d
)
dataset
.
set_pipe_command
(
pipe_c
m
d
)
dataset
.
set_batch_size
(
batch_size
)
dataset
.
set_thread
(
threads
)
file_list
=
[
...
...
fleetrec/utils/util.py
浏览文件 @
f8d55c9f
...
...
@@ -15,7 +15,18 @@
import
os
import
time
import
datetime
from
..
utils
import
fs
as
fs
from
..utils
import
fs
as
fs
def
str2bool
(
v
):
if
isinstance
(
v
,
bool
):
return
v
if
v
.
lower
()
in
(
'yes'
,
'true'
,
't'
,
'y'
,
'1'
):
return
True
elif
v
.
lower
()
in
(
'no'
,
'false'
,
'f'
,
'n'
,
'0'
):
return
False
else
:
raise
ValueError
(
'Boolean value expected.'
)
def
get_env_value
(
env_name
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录