Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
752f075e
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
752f075e
编写于
8月 21, 2020
作者:
1
123malin
提交者:
GitHub
8月 21, 2020
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'master' into readme
上级
922776b6
af7c0e7d
变更
73
隐藏空白更改
内联
并排
Showing
73 changed file
with
640 addition
and
477 deletion
+640
-477
.travis.yml
.travis.yml
+8
-3
README.md
README.md
+5
-1
README_EN.md
README_EN.md
+4
-1
core/engine/cluster/cloud/k8s_config.ini.template
core/engine/cluster/cloud/k8s_config.ini.template
+1
-0
core/engine/cluster/cloud/mpi_config.ini.template
core/engine/cluster/cloud/mpi_config.ini.template
+1
-0
core/factory.py
core/factory.py
+13
-0
core/trainer.py
core/trainer.py
+10
-0
core/trainers/finetuning_trainer.py
core/trainers/finetuning_trainer.py
+140
-0
core/trainers/framework/dataset.py
core/trainers/framework/dataset.py
+21
-5
core/trainers/framework/network.py
core/trainers/framework/network.py
+83
-1
core/trainers/framework/runner.py
core/trainers/framework/runner.py
+36
-7
core/trainers/framework/startup.py
core/trainers/framework/startup.py
+121
-1
core/utils/dataloader_instance.py
core/utils/dataloader_instance.py
+55
-7
core/utils/util.py
core/utils/util.py
+22
-0
doc/custom_reader.md
doc/custom_reader.md
+0
-362
doc/distributed_train.md
doc/distributed_train.md
+1
-1
doc/model_develop.md
doc/model_develop.md
+1
-1
doc/pre_train_model.md
doc/pre_train_model.md
+15
-0
doc/train.md
doc/train.md
+1
-1
models/contentunderstanding/classification/config.yaml
models/contentunderstanding/classification/config.yaml
+1
-1
models/contentunderstanding/readme.md
models/contentunderstanding/readme.md
+5
-2
models/contentunderstanding/tagspace/config.yaml
models/contentunderstanding/tagspace/config.yaml
+1
-1
models/demo/movie_recommand/rank/config.yaml
models/demo/movie_recommand/rank/config.yaml
+1
-1
models/demo/movie_recommand/recall/config.yaml
models/demo/movie_recommand/recall/config.yaml
+1
-1
models/match/dssm/config.yaml
models/match/dssm/config.yaml
+1
-1
models/match/match-pyramid/config.yaml
models/match/match-pyramid/config.yaml
+1
-1
models/match/multiview-simnet/config.yaml
models/match/multiview-simnet/config.yaml
+1
-1
models/match/readme.md
models/match/readme.md
+5
-2
models/multitask/esmm/config.yaml
models/multitask/esmm/config.yaml
+1
-1
models/multitask/mmoe/config.yaml
models/multitask/mmoe/config.yaml
+1
-1
models/multitask/readme.md
models/multitask/readme.md
+6
-3
models/multitask/share-bottom/config.yaml
models/multitask/share-bottom/config.yaml
+1
-1
models/rank/AutoInt/config.yaml
models/rank/AutoInt/config.yaml
+1
-1
models/rank/BST/config.yaml
models/rank/BST/config.yaml
+1
-1
models/rank/afm/config.yaml
models/rank/afm/config.yaml
+1
-1
models/rank/dcn/config.yaml
models/rank/dcn/config.yaml
+1
-1
models/rank/deep_crossing/config.yaml
models/rank/deep_crossing/config.yaml
+1
-1
models/rank/deepfm/config.yaml
models/rank/deepfm/config.yaml
+1
-1
models/rank/dien/config.yaml
models/rank/dien/config.yaml
+1
-1
models/rank/din/config.yaml
models/rank/din/config.yaml
+1
-1
models/rank/dnn/config.yaml
models/rank/dnn/config.yaml
+1
-2
models/rank/ffm/config.yaml
models/rank/ffm/config.yaml
+1
-1
models/rank/fgcnn/config.yaml
models/rank/fgcnn/config.yaml
+1
-1
models/rank/fibinet/README.md
models/rank/fibinet/README.md
+1
-1
models/rank/fibinet/config.yaml
models/rank/fibinet/config.yaml
+1
-1
models/rank/flen/README.md
models/rank/flen/README.md
+1
-1
models/rank/flen/config.yaml
models/rank/flen/config.yaml
+1
-1
models/rank/fm/config.yaml
models/rank/fm/config.yaml
+1
-1
models/rank/fnn/config.yaml
models/rank/fnn/config.yaml
+1
-1
models/rank/logistic_regression/config.yaml
models/rank/logistic_regression/config.yaml
+1
-1
models/rank/nfm/config.yaml
models/rank/nfm/config.yaml
+1
-1
models/rank/pnn/config.yaml
models/rank/pnn/config.yaml
+1
-1
models/rank/readme.md
models/rank/readme.md
+1
-1
models/rank/wide_deep/config.yaml
models/rank/wide_deep/config.yaml
+1
-1
models/rank/xdeepfm/config.yaml
models/rank/xdeepfm/config.yaml
+1
-1
models/recall/fasttext/config.yaml
models/recall/fasttext/config.yaml
+1
-1
models/recall/gnn/config.yaml
models/recall/gnn/config.yaml
+1
-1
models/recall/gnn/readme.md
models/recall/gnn/readme.md
+1
-1
models/recall/gru4rec/config.yaml
models/recall/gru4rec/config.yaml
+1
-1
models/recall/look-alike_recall/README.md
models/recall/look-alike_recall/README.md
+1
-1
models/recall/look-alike_recall/config.yaml
models/recall/look-alike_recall/config.yaml
+1
-1
models/recall/ncf/config.yaml
models/recall/ncf/config.yaml
+1
-1
models/recall/readme.md
models/recall/readme.md
+12
-6
models/recall/ssr/config.yaml
models/recall/ssr/config.yaml
+1
-1
models/recall/word2vec/config.yaml
models/recall/word2vec/config.yaml
+1
-1
models/recall/youtube_dnn/config.yaml
models/recall/youtube_dnn/config.yaml
+1
-1
models/rerank/listwise/config.yaml
models/rerank/listwise/config.yaml
+1
-1
models/rerank/readme.md
models/rerank/readme.md
+4
-1
models/treebased/tdm/README.md
models/treebased/tdm/README.md
+4
-1
models/treebased/tdm/config.yaml
models/treebased/tdm/config.yaml
+1
-1
run.py
run.py
+15
-1
setup.cfg
setup.cfg
+0
-2
setup.py
setup.py
+6
-22
未找到文件。
.travis.yml
浏览文件 @
752f075e
...
...
@@ -16,15 +16,20 @@ before_install:
# For pylint dockstring checker
-
sudo apt-get update
-
sudo apt-get install -y python-pip libpython-dev
-
sudo apt-get remove python-urllib3
-
sudo apt-get purge python-urllib3
-
sudo rm /usr/lib/python2.7/dist-packages/chardet-*
-
sudo pip install -U pip
-
sudo pip install --upgrade setuptools
-
sudo pip install six --upgrade --ignore-installed six
-
sudo pip install pillow
-
sudo pip install PyYAML
-
sudo pip install pylint pytest astroid isort pre-commit
-
sudo pip install kiwisolver
-
sudo pip install paddlepaddle==1.7.2 --ignore-installed urllib3
-
sudo pip uninstall -y rarfile
-
sudo pip install scikit-build
-
sudo pip install Pillow==5.3.0
-
sudo pip install opencv-python==3.4.3.18
-
sudo pip install rarfile==3.0
-
sudo pip install paddlepaddle==1.7.2
-
sudo python setup.py install
-
|
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
...
...
README.md
浏览文件 @
752f075e
...
...
@@ -124,7 +124,10 @@
```
bash
# 使用CPU进行单机训练
python
-m
paddlerec.run
-m
paddlerec.models.rank.dnn
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd
paddle-rec
python
-m
paddlerec.run
-m
models/rank/dnn/config.yaml
```
...
...
@@ -144,6 +147,7 @@ python -m paddlerec.run -m paddlerec.models.rank.dnn
*
[
启动分布式训练
](
doc/distributed_train.md
)
*
[
启动预测
](
doc/predict.md
)
*
[
快速部署
](
doc/serving.md
)
*
[
预训练模型
](
doc/pre_train_model.md
)
### 进阶教程
...
...
README_EN.md
浏览文件 @
752f075e
...
...
@@ -119,7 +119,10 @@ We take the `dnn` algorithm as an example to get start of `PaddleRec`, and we ta
```
bash
# Training with cpu
python
-m
paddlerec.run
-m
paddlerec.models.rank.dnn
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd
paddle-rec
python
-m
paddlerec.run
-m
models/rank/dnn/config.yaml
```
...
...
core/engine/cluster/cloud/k8s_config.ini.template
浏览文件 @
752f075e
...
...
@@ -19,6 +19,7 @@ afs_local_mount_point="/root/paddlejob/workspace/env_run/afs/"
# 新k8s afs挂载帮助文档: http://wiki.baidu.com/pages/viewpage.action?pageId=906443193
PADDLE_PADDLEREC_ROLE=WORKER
PADDLEREC_CLUSTER_TYPE=K8S
use_python3=<$ USE_PYTHON3 $>
CPU_NUM=<$ CPU_NUM $>
GLOG_v=0
...
...
core/engine/cluster/cloud/mpi_config.ini.template
浏览文件 @
752f075e
...
...
@@ -17,6 +17,7 @@ output_path=<$ OUTPUT_PATH $>
thirdparty_path=<$ THIRDPARTY_PATH $>
PADDLE_PADDLEREC_ROLE=WORKER
PADDLEREC_CLUSTER_TYPE=MPI
use_python3=<$ USE_PYTHON3 $>
CPU_NUM=<$ CPU_NUM $>
GLOG_v=0
...
...
core/factory.py
浏览文件 @
752f075e
...
...
@@ -22,6 +22,19 @@ trainers = {}
def
trainer_registry
():
trainers
[
"SingleTrainer"
]
=
os
.
path
.
join
(
trainer_abs
,
"single_trainer.py"
)
trainers
[
"ClusterTrainer"
]
=
os
.
path
.
join
(
trainer_abs
,
"cluster_trainer.py"
)
trainers
[
"CtrCodingTrainer"
]
=
os
.
path
.
join
(
trainer_abs
,
"ctr_coding_trainer.py"
)
trainers
[
"CtrModulTrainer"
]
=
os
.
path
.
join
(
trainer_abs
,
"ctr_modul_trainer.py"
)
trainers
[
"TDMSingleTrainer"
]
=
os
.
path
.
join
(
trainer_abs
,
"tdm_single_trainer.py"
)
trainers
[
"TDMClusterTrainer"
]
=
os
.
path
.
join
(
trainer_abs
,
"tdm_cluster_trainer.py"
)
trainers
[
"OnlineLearningTrainer"
]
=
os
.
path
.
join
(
trainer_abs
,
"online_learning_trainer.py"
)
# Definition of procedure execution process
trainers
[
"CtrCodingTrainer"
]
=
os
.
path
.
join
(
trainer_abs
,
"ctr_coding_trainer.py"
)
...
...
core/trainer.py
浏览文件 @
752f075e
...
...
@@ -107,6 +107,7 @@ class Trainer(object):
self
.
device
=
Device
.
GPU
gpu_id
=
int
(
os
.
environ
.
get
(
'FLAGS_selected_gpus'
,
0
))
self
.
_place
=
fluid
.
CUDAPlace
(
gpu_id
)
print
(
"PaddleRec run on device GPU: {}"
.
format
(
gpu_id
))
self
.
_exe
=
fluid
.
Executor
(
self
.
_place
)
elif
device
==
"CPU"
:
self
.
device
=
Device
.
CPU
...
...
@@ -146,6 +147,7 @@ class Trainer(object):
elif
engine
.
upper
()
==
"CLUSTER"
:
self
.
engine
=
EngineMode
.
CLUSTER
self
.
is_fleet
=
True
self
.
which_cluster_type
()
else
:
raise
ValueError
(
"Not Support Engine {}"
.
format
(
engine
))
self
.
_context
[
"is_fleet"
]
=
self
.
is_fleet
...
...
@@ -165,6 +167,14 @@ class Trainer(object):
self
.
_context
[
"is_pslib"
]
=
(
fleet_mode
.
upper
()
==
"PSLIB"
)
self
.
_context
[
"fleet_mode"
]
=
fleet_mode
def
which_cluster_type
(
self
):
cluster_type
=
os
.
getenv
(
"PADDLEREC_CLUSTER_TYPE"
,
"MPI"
)
print
(
"PADDLEREC_CLUSTER_TYPE: {}"
.
format
(
cluster_type
))
if
cluster_type
and
cluster_type
.
upper
()
==
"K8S"
:
self
.
_context
[
"cluster_type"
]
=
"K8S"
else
:
self
.
_context
[
"cluster_type"
]
=
"MPI"
def
which_executor_mode
(
self
):
executor_mode
=
envs
.
get_runtime_environ
(
"train.trainer.executor_mode"
)
if
executor_mode
.
upper
()
not
in
[
"TRAIN"
,
"INFER"
]:
...
...
core/trainers/finetuning_trainer.py
0 → 100644
浏览文件 @
752f075e
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
General Trainer, applicable to many situations: Single/Cluster/Local_Cluster + PS/COLLECTIVE
"""
from
__future__
import
print_function
import
os
from
paddlerec.core.utils
import
envs
from
paddlerec.core.trainer
import
Trainer
,
EngineMode
,
FleetMode
class
FineTuningTrainer
(
Trainer
):
"""
Trainer for various situations
"""
def
__init__
(
self
,
config
=
None
):
Trainer
.
__init__
(
self
,
config
)
self
.
processor_register
()
self
.
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
self
.
runner_env_name
=
"runner."
+
self
.
_context
[
"runner_name"
]
def
processor_register
(
self
):
print
(
"processor_register begin"
)
self
.
regist_context_processor
(
'uninit'
,
self
.
instance
)
self
.
regist_context_processor
(
'network_pass'
,
self
.
network
)
self
.
regist_context_processor
(
'startup_pass'
,
self
.
startup
)
self
.
regist_context_processor
(
'train_pass'
,
self
.
runner
)
self
.
regist_context_processor
(
'terminal_pass'
,
self
.
terminal
)
def
instance
(
self
,
context
):
instance_class_path
=
envs
.
get_global_env
(
self
.
runner_env_name
+
".instance_class_path"
,
default_value
=
None
)
if
instance_class_path
:
instance_class
=
envs
.
lazy_instance_by_fliename
(
instance_class_path
,
"Instance"
)(
context
)
else
:
if
self
.
engine
==
EngineMode
.
SINGLE
:
instance_class_name
=
"SingleInstance"
else
:
raise
ValueError
(
"FineTuningTrainer can only support SingleTraining."
)
instance_path
=
os
.
path
.
join
(
self
.
abs_dir
,
"framework"
,
"instance.py"
)
instance_class
=
envs
.
lazy_instance_by_fliename
(
instance_path
,
instance_class_name
)(
context
)
instance_class
.
instance
(
context
)
def
network
(
self
,
context
):
network_class_path
=
envs
.
get_global_env
(
self
.
runner_env_name
+
".network_class_path"
,
default_value
=
None
)
if
network_class_path
:
network_class
=
envs
.
lazy_instance_by_fliename
(
network_class_path
,
"Network"
)(
context
)
else
:
if
self
.
engine
==
EngineMode
.
SINGLE
:
network_class_name
=
"FineTuningNetwork"
else
:
raise
ValueError
(
"FineTuningTrainer can only support SingleTraining."
)
network_path
=
os
.
path
.
join
(
self
.
abs_dir
,
"framework"
,
"network.py"
)
network_class
=
envs
.
lazy_instance_by_fliename
(
network_path
,
network_class_name
)(
context
)
network_class
.
build_network
(
context
)
def
startup
(
self
,
context
):
startup_class_path
=
envs
.
get_global_env
(
self
.
runner_env_name
+
".startup_class_path"
,
default_value
=
None
)
if
startup_class_path
:
startup_class
=
envs
.
lazy_instance_by_fliename
(
startup_class_path
,
"Startup"
)(
context
)
else
:
if
self
.
engine
==
EngineMode
.
SINGLE
and
not
context
[
"is_infer"
]:
startup_class_name
=
"FineTuningStartup"
else
:
raise
ValueError
(
"FineTuningTrainer can only support SingleTraining."
)
startup_path
=
os
.
path
.
join
(
self
.
abs_dir
,
"framework"
,
"startup.py"
)
startup_class
=
envs
.
lazy_instance_by_fliename
(
startup_path
,
startup_class_name
)(
context
)
startup_class
.
startup
(
context
)
def
runner
(
self
,
context
):
runner_class_path
=
envs
.
get_global_env
(
self
.
runner_env_name
+
".runner_class_path"
,
default_value
=
None
)
if
runner_class_path
:
runner_class
=
envs
.
lazy_instance_by_fliename
(
runner_class_path
,
"Runner"
)(
context
)
else
:
if
self
.
engine
==
EngineMode
.
SINGLE
and
not
context
[
"is_infer"
]:
runner_class_name
=
"SingleRunner"
else
:
raise
ValueError
(
"FineTuningTrainer can only support SingleTraining."
)
runner_path
=
os
.
path
.
join
(
self
.
abs_dir
,
"framework"
,
"runner.py"
)
runner_class
=
envs
.
lazy_instance_by_fliename
(
runner_path
,
runner_class_name
)(
context
)
runner_class
.
run
(
context
)
def
terminal
(
self
,
context
):
terminal_class_path
=
envs
.
get_global_env
(
self
.
runner_env_name
+
".terminal_class_path"
,
default_value
=
None
)
if
terminal_class_path
:
terminal_class
=
envs
.
lazy_instance_by_fliename
(
terminal_class_path
,
"Terminal"
)(
context
)
terminal_class
.
terminal
(
context
)
else
:
terminal_class_name
=
"TerminalBase"
if
self
.
engine
!=
EngineMode
.
SINGLE
and
self
.
fleet_mode
!=
FleetMode
.
COLLECTIVE
:
terminal_class_name
=
"PSTerminal"
terminal_path
=
os
.
path
.
join
(
self
.
abs_dir
,
"framework"
,
"terminal.py"
)
terminal_class
=
envs
.
lazy_instance_by_fliename
(
terminal_path
,
terminal_class_name
)(
context
)
terminal_class
.
terminal
(
context
)
context
[
'is_exit'
]
=
True
core/trainers/framework/dataset.py
浏览文件 @
752f075e
...
...
@@ -21,7 +21,7 @@ from paddlerec.core.utils import envs
from
paddlerec.core.utils
import
dataloader_instance
from
paddlerec.core.reader
import
SlotReader
from
paddlerec.core.trainer
import
EngineMode
from
paddlerec.core.utils.util
import
split_files
from
paddlerec.core.utils.util
import
split_files
,
check_filelist
__all__
=
[
"DatasetBase"
,
"DataLoader"
,
"QueueDataset"
]
...
...
@@ -121,14 +121,30 @@ class QueueDataset(DatasetBase):
dataset
.
set_pipe_command
(
pipe_cmd
)
train_data_path
=
envs
.
get_global_env
(
name
+
"data_path"
)
file_list
=
[
os
.
path
.
join
(
train_data_path
,
x
)
for
x
in
os
.
listdir
(
train_data_path
)
]
hidden_file_list
,
file_list
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
train_data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
file_list
.
sort
()
need_split_files
=
False
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
# for local cluster: split files for multi process
need_split_files
=
True
elif
context
[
"engine"
]
==
EngineMode
.
CLUSTER
and
context
[
"cluster_type"
]
==
"K8S"
:
# for k8s mount afs, split files for every node
need_split_files
=
True
if
need_split_files
:
file_list
=
split_files
(
file_list
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"File_list: {}"
.
format
(
file_list
))
dataset
.
set_filelist
(
file_list
)
for
model_dict
in
context
[
"phases"
]:
if
model_dict
[
"dataset_name"
]
==
dataset_name
:
...
...
core/trainers/framework/network.py
浏览文件 @
752f075e
...
...
@@ -23,7 +23,7 @@ from paddlerec.core.trainers.framework.dataset import DataLoader, QueueDataset
__all__
=
[
"NetworkBase"
,
"SingleNetwork"
,
"PSNetwork"
,
"PslibNetwork"
,
"CollectiveNetwork"
"CollectiveNetwork"
,
"FineTuningNetwork"
]
...
...
@@ -109,6 +109,88 @@ class SingleNetwork(NetworkBase):
context
[
"status"
]
=
"startup_pass"
class
FineTuningNetwork
(
NetworkBase
):
"""R
"""
def
__init__
(
self
,
context
):
print
(
"Running FineTuningNetwork."
)
def
build_network
(
self
,
context
):
context
[
"model"
]
=
{}
for
model_dict
in
context
[
"phases"
]:
context
[
"model"
][
model_dict
[
"name"
]]
=
{}
train_program
=
fluid
.
Program
()
startup_program
=
fluid
.
Program
()
scope
=
fluid
.
Scope
()
dataset_name
=
model_dict
[
"dataset_name"
]
with
fluid
.
program_guard
(
train_program
,
startup_program
):
with
fluid
.
unique_name
.
guard
():
with
fluid
.
scope_guard
(
scope
):
model_path
=
envs
.
os_path_adapter
(
envs
.
workspace_adapter
(
model_dict
[
"model"
]))
model
=
envs
.
lazy_instance_by_fliename
(
model_path
,
"Model"
)(
context
[
"env"
])
model
.
_data_var
=
model
.
input_data
(
dataset_name
=
model_dict
[
"dataset_name"
])
if
envs
.
get_global_env
(
"dataset."
+
dataset_name
+
".type"
)
==
"DataLoader"
:
model
.
_init_dataloader
(
is_infer
=
context
[
"is_infer"
])
data_loader
=
DataLoader
(
context
)
data_loader
.
get_dataloader
(
context
,
dataset_name
,
model
.
_data_loader
)
model
.
net
(
model
.
_data_var
,
context
[
"is_infer"
])
finetuning_varnames
=
envs
.
get_global_env
(
"runner."
+
context
[
"runner_name"
]
+
".finetuning_aspect_varnames"
,
default_value
=
[])
if
len
(
finetuning_varnames
)
==
0
:
raise
ValueError
(
"nothing need to be fine tuning, you may use other traning mode"
)
if
len
(
finetuning_varnames
)
!=
1
:
raise
ValueError
(
"fine tuning mode can only accept one varname now"
)
varname
=
finetuning_varnames
[
0
]
finetuning_vars
=
train_program
.
global_block
().
vars
[
varname
]
finetuning_vars
.
stop_gradient
=
True
optimizer
=
model
.
optimizer
()
optimizer
.
minimize
(
model
.
_cost
)
context
[
"model"
][
model_dict
[
"name"
]][
"main_program"
]
=
train_program
context
[
"model"
][
model_dict
[
"name"
]][
"startup_program"
]
=
startup_program
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]
=
scope
context
[
"model"
][
model_dict
[
"name"
]][
"model"
]
=
model
context
[
"model"
][
model_dict
[
"name"
]][
"default_main_program"
]
=
train_program
.
clone
()
context
[
"model"
][
model_dict
[
"name"
]][
"compiled_program"
]
=
None
context
[
"dataset"
]
=
{}
for
dataset
in
context
[
"env"
][
"dataset"
]:
type
=
envs
.
get_global_env
(
"dataset."
+
dataset
[
"name"
]
+
".type"
)
if
type
==
"QueueDataset"
:
dataset_class
=
QueueDataset
(
context
)
context
[
"dataset"
][
dataset
[
"name"
]]
=
dataset_class
.
create_dataset
(
dataset
[
"name"
],
context
)
context
[
"status"
]
=
"startup_pass"
class
PSNetwork
(
NetworkBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSNetwork."
)
...
...
core/trainers/framework/runner.py
浏览文件 @
752f075e
...
...
@@ -16,6 +16,7 @@ from __future__ import print_function
import
os
import
time
import
warnings
import
numpy
as
np
import
paddle.fluid
as
fluid
...
...
@@ -284,6 +285,7 @@ class RunnerBase(object):
return
(
epoch_id
+
1
)
%
epoch_interval
==
0
def
save_inference_model
():
# get global env
name
=
"runner."
+
context
[
"runner_name"
]
+
"."
save_interval
=
int
(
envs
.
get_global_env
(
name
+
"save_inference_interval"
,
-
1
))
...
...
@@ -296,18 +298,44 @@ class RunnerBase(object):
if
feed_varnames
is
None
or
fetch_varnames
is
None
or
feed_varnames
==
""
or
fetch_varnames
==
""
or
\
len
(
feed_varnames
)
==
0
or
len
(
fetch_varnames
)
==
0
:
return
fetch_vars
=
[
fluid
.
default_main_program
().
global_block
().
vars
[
varname
]
for
varname
in
fetch_varnames
]
# check feed var exist
for
var_name
in
feed_varnames
:
if
var_name
not
in
fluid
.
default_main_program
().
global_block
(
).
vars
:
raise
ValueError
(
"Feed variable: {} not in default_main_program, global block has follow vars: {}"
.
format
(
var_name
,
fluid
.
default_main_program
().
global_block
()
.
vars
.
keys
()))
# check fetch var exist
fetch_vars
=
[]
for
var_name
in
fetch_varnames
:
if
var_name
not
in
fluid
.
default_main_program
().
global_block
(
).
vars
:
raise
ValueError
(
"Fetch variable: {} not in default_main_program, global block has follow vars: {}"
.
format
(
var_name
,
fluid
.
default_main_program
().
global_block
()
.
vars
.
keys
()))
else
:
fetch_vars
.
append
(
fluid
.
default_main_program
()
.
global_block
().
vars
[
var_name
])
dirname
=
envs
.
get_global_env
(
name
+
"save_inference_path"
,
None
)
assert
dirname
is
not
None
dirname
=
os
.
path
.
join
(
dirname
,
str
(
epoch_id
))
if
is_fleet
:
context
[
"fleet"
].
save_inference_model
(
context
[
"exe"
],
dirname
,
feed_varnames
,
fetch_vars
)
warnings
.
warn
(
"Save inference model in cluster training is not recommended! Using save checkpoint instead."
,
category
=
UserWarning
,
stacklevel
=
2
)
if
context
[
"fleet"
].
worker_index
()
==
0
:
context
[
"fleet"
].
save_inference_model
(
context
[
"exe"
],
dirname
,
feed_varnames
,
fetch_vars
)
else
:
fluid
.
io
.
save_inference_model
(
dirname
,
feed_varnames
,
fetch_vars
,
context
[
"exe"
])
...
...
@@ -323,7 +351,8 @@ class RunnerBase(object):
return
dirname
=
os
.
path
.
join
(
dirname
,
str
(
epoch_id
))
if
is_fleet
:
context
[
"fleet"
].
save_persistables
(
context
[
"exe"
],
dirname
)
if
context
[
"fleet"
].
worker_index
()
==
0
:
context
[
"fleet"
].
save_persistables
(
context
[
"exe"
],
dirname
)
else
:
fluid
.
io
.
save_persistables
(
context
[
"exe"
],
dirname
)
...
...
core/trainers/framework/startup.py
浏览文件 @
752f075e
...
...
@@ -17,9 +17,13 @@ from __future__ import print_function
import
warnings
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddlerec.core.utils
import
envs
__all__
=
[
"StartupBase"
,
"SingleStartup"
,
"PSStartup"
,
"CollectiveStartup"
]
__all__
=
[
"StartupBase"
,
"SingleStartup"
,
"PSStartup"
,
"CollectiveStartup"
,
"FineTuningStartup"
]
class
StartupBase
(
object
):
...
...
@@ -65,6 +69,122 @@ class SingleStartup(StartupBase):
context
[
"status"
]
=
"train_pass"
class
FineTuningStartup
(
StartupBase
):
"""R
"""
def
__init__
(
self
,
context
):
self
.
op_name_scope
=
"op_namescope"
self
.
clip_op_name_scope
=
"@CLIP"
self
.
self
.
op_role_var_attr_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
(
)
print
(
"Running SingleStartup."
)
def
_is_opt_role_op
(
self
,
op
):
# NOTE: depend on oprole to find out whether this op is for
# optimize
op_maker
=
core
.
op_proto_and_checker_maker
optimize_role
=
core
.
op_proto_and_checker_maker
.
OpRole
.
Optimize
if
op_maker
.
kOpRoleAttrName
()
in
op
.
attr_names
and
\
int
(
op
.
all_attrs
()[
op_maker
.
kOpRoleAttrName
()])
==
int
(
optimize_role
):
return
True
return
False
def
_get_params_grads
(
self
,
program
):
"""
Get optimizer operators, parameters and gradients from origin_program
Returns:
opt_ops (list): optimize operators.
params_grads (dict): parameter->gradient.
"""
block
=
program
.
global_block
()
params_grads
=
[]
# tmp set to dedup
optimize_params
=
set
()
origin_var_dict
=
program
.
global_block
().
vars
for
op
in
block
.
ops
:
if
self
.
_is_opt_role_op
(
op
):
# Todo(chengmo): Whether clip related op belongs to Optimize guard should be discussed
# delete clip op from opt_ops when run in Parameter Server mode
if
self
.
op_name_scope
in
op
.
all_attrs
(
)
and
self
.
clip_op_name_scope
in
op
.
attr
(
self
.
op_name_scope
):
op
.
_set_attr
(
"op_role"
,
int
(
core
.
op_proto_and_checker_maker
.
OpRole
.
Backward
))
continue
if
op
.
attr
(
self
.
op_role_var_attr_name
):
param_name
=
op
.
attr
(
self
.
op_role_var_attr_name
)[
0
]
grad_name
=
op
.
attr
(
self
.
op_role_var_attr_name
)[
1
]
if
not
param_name
in
optimize_params
:
optimize_params
.
add
(
param_name
)
params_grads
.
append
([
origin_var_dict
[
param_name
],
origin_var_dict
[
grad_name
]
])
return
params_grads
@
staticmethod
def
is_persistable
(
var
):
"""
Check whether the given variable is persistable.
Args:
var(Variable): The variable to be checked.
Returns:
bool: True if the given `var` is persistable
False if not.
Examples:
.. code-block:: python
import paddle.fluid as fluid
param = fluid.default_main_program().global_block().var('fc.b')
res = fluid.io.is_persistable(param)
"""
if
var
.
desc
.
type
()
==
core
.
VarDesc
.
VarType
.
FEED_MINIBATCH
or
\
var
.
desc
.
type
()
==
core
.
VarDesc
.
VarType
.
FETCH_LIST
or
\
var
.
desc
.
type
()
==
core
.
VarDesc
.
VarType
.
READER
:
return
False
return
var
.
persistable
def
load
(
self
,
context
,
is_fleet
=
False
,
main_program
=
None
):
dirname
=
envs
.
get_global_env
(
"runner."
+
context
[
"runner_name"
]
+
".init_model_path"
,
None
)
if
dirname
is
None
or
dirname
==
""
:
return
print
(
"going to load "
,
dirname
)
params_grads
=
self
.
_get_params_grads
(
main_program
)
update_params
=
[
p
for
p
,
_
in
params_grads
]
need_load_vars
=
[]
parameters
=
list
(
filter
(
FineTuningStartup
.
is_persistable
,
main_program
.
list_vars
()))
for
param
in
parameters
:
if
param
not
in
update_params
:
need_load_vars
.
append
(
param
)
fluid
.
io
.
load_vars
(
context
[
"exe"
],
dirname
,
main_program
,
need_load_vars
)
print
(
"load from {} success"
.
format
(
dirname
))
def
startup
(
self
,
context
):
for
model_dict
in
context
[
"phases"
]:
with
fluid
.
scope_guard
(
context
[
"model"
][
model_dict
[
"name"
]][
"scope"
]):
train_prog
=
context
[
"model"
][
model_dict
[
"name"
]][
"main_program"
]
startup_prog
=
context
[
"model"
][
model_dict
[
"name"
]][
"startup_program"
]
with
fluid
.
program_guard
(
train_prog
,
startup_prog
):
context
[
"exe"
].
run
(
startup_prog
)
self
.
load
(
context
,
main_program
=
train_prog
)
context
[
"status"
]
=
"train_pass"
class
PSStartup
(
StartupBase
):
def
__init__
(
self
,
context
):
print
(
"Running PSStartup."
)
...
...
core/utils/dataloader_instance.py
浏览文件 @
752f075e
...
...
@@ -19,7 +19,7 @@ from paddlerec.core.utils.envs import get_global_env
from
paddlerec.core.utils.envs
import
get_runtime_environ
from
paddlerec.core.reader
import
SlotReader
from
paddlerec.core.trainer
import
EngineMode
from
paddlerec.core.utils.util
import
split_files
from
paddlerec.core.utils.util
import
split_files
,
check_filelist
def
dataloader_by_name
(
readerclass
,
...
...
@@ -38,11 +38,27 @@ def dataloader_by_name(readerclass,
assert
package_base
is
not
None
data_path
=
os
.
path
.
join
(
package_base
,
data_path
.
split
(
"::"
)[
1
])
files
=
[
str
(
data_path
)
+
"/%s"
%
x
for
x
in
os
.
listdir
(
data_path
)]
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
files
.
sort
()
need_split_files
=
False
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
# for local cluster: split files for multi process
need_split_files
=
True
elif
context
[
"engine"
]
==
EngineMode
.
CLUSTER
and
context
[
"cluster_type"
]
==
"K8S"
:
# for k8s mount mode, split files for every node
need_split_files
=
True
print
(
"need_split_files: {}"
.
format
(
need_split_files
))
if
need_split_files
:
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"file_list : {}"
.
format
(
files
))
reader
=
reader_class
(
yaml_file
)
reader
.
init
()
...
...
@@ -84,11 +100,27 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
assert
package_base
is
not
None
data_path
=
os
.
path
.
join
(
package_base
,
data_path
.
split
(
"::"
)[
1
])
files
=
[
str
(
data_path
)
+
"/%s"
%
x
for
x
in
os
.
listdir
(
data_path
)]
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
files
.
sort
()
need_split_files
=
False
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
# for local cluster: split files for multi process
need_split_files
=
True
elif
context
[
"engine"
]
==
EngineMode
.
CLUSTER
and
context
[
"cluster_type"
]
==
"K8S"
:
# for k8s mount mode, split files for every node
need_split_files
=
True
if
need_split_files
:
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"file_list: {}"
.
format
(
files
))
sparse
=
get_global_env
(
name
+
"sparse_slots"
,
"#"
)
if
sparse
==
""
:
...
...
@@ -138,11 +170,27 @@ def slotdataloader(readerclass, train, yaml_file, context):
assert
package_base
is
not
None
data_path
=
os
.
path
.
join
(
package_base
,
data_path
.
split
(
"::"
)[
1
])
files
=
[
str
(
data_path
)
+
"/%s"
%
x
for
x
in
os
.
listdir
(
data_path
)]
hidden_file_list
,
files
=
check_filelist
(
hidden_file_list
=
[],
data_file_list
=
[],
train_data_path
=
data_path
)
if
(
hidden_file_list
is
not
None
):
print
(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
.
format
(
hidden_file_list
))
files
.
sort
()
need_split_files
=
False
if
context
[
"engine"
]
==
EngineMode
.
LOCAL_CLUSTER
:
# for local cluster: split files for multi process
need_split_files
=
True
elif
context
[
"engine"
]
==
EngineMode
.
CLUSTER
and
context
[
"cluster_type"
]
==
"K8S"
:
# for k8s mount mode, split files for every node
need_split_files
=
True
if
need_split_files
:
files
=
split_files
(
files
,
context
[
"fleet"
].
worker_index
(),
context
[
"fleet"
].
worker_num
())
print
(
"file_list: {}"
.
format
(
files
))
sparse
=
get_global_env
(
"sparse_slots"
,
"#"
,
namespace
)
if
sparse
==
""
:
...
...
core/utils/util.py
浏览文件 @
752f075e
...
...
@@ -201,6 +201,28 @@ def split_files(files, trainer_id, trainers):
return
trainer_files
[
trainer_id
]
def
check_filelist
(
hidden_file_list
,
data_file_list
,
train_data_path
):
for
root
,
dirs
,
files
in
os
.
walk
(
train_data_path
):
if
(
files
==
None
and
dirs
==
None
):
return
None
,
None
else
:
# use files and dirs
for
file_name
in
files
:
file_path
=
os
.
path
.
join
(
train_data_path
,
file_name
)
if
file_name
[
0
]
==
'.'
:
hidden_file_list
.
append
(
file_path
)
else
:
data_file_list
.
append
(
file_path
)
for
dirs_name
in
dirs
:
dirs_path
=
os
.
path
.
join
(
train_data_path
,
dirs_name
)
if
dirs_name
[
0
]
==
'.'
:
hidden_file_list
.
append
(
dirs_path
)
else
:
#train_data_path = os.path.join(train_data_path, dirs_name)
check_filelist
(
hidden_file_list
,
data_file_list
,
dirs_path
)
return
hidden_file_list
,
data_file_list
class
CostPrinter
(
object
):
"""
For count cost time && print cost log
...
...
doc/custom_reader.md
已删除
100644 → 0
浏览文件 @
922776b6
# PaddleRec 自定义数据集及Reader
用户自定义数据集及配置异步Reader,需要关注以下几个步骤:
*
[
数据集整理
](
#数据集整理
)
*
[
在模型组网中加入输入占位符
](
#在模型组网中加入输入占位符
)
*
[
Reader实现
](
#Reader的实现
)
*
[
在yaml文件中配置Reader
](
#在yaml文件中配置reader
)
我们以CTR-DNN模型为例,给出了从数据整理,变量定义,Reader写法,调试的完整历程。
*
[
数据及Reader示例-DNN
](
#数据及Reader示例-DNN
)
## 数据集整理
PaddleRec支持模型自定义数据集。
关于数据的tips:
1.
数据量:
PaddleRec面向大规模数据设计,可以轻松支持亿级的数据读取,工业级的数据读写api:`dataset`在搜索、推荐、信息流等业务得到了充分打磨。
2.
文件类型:
支持任意直接可读的文本数据,`dataset`同时支持`.gz`格式的文本压缩数据,无需额外代码,可直接读取。数据样本应以`\n`为标志,按行组织。
3.
文件存放位置:
文件通常存放在训练节点本地,但同时,`dataset`支持使用`hadoop`远程读取数据,数据无需下载到本地,为dataset配置hadoop相关账户及地址即可。
4.
数据类型
Reader处理的是以行为单位的`string`数据,喂入网络的数据需要转为`int`,`float`的数值数据,不支持`string`喂入网络,不建议明文保存及处理训练数据。
5.
Tips
Dataset模式下,训练线程与数据读取线程的关系强相关,为了多线程充分利用,`强烈建议将文件合理的拆为多个小文件`,尤其是在分布式训练场景下,可以均衡各个节点的数据量,同时加快数据的下载速度。
## 在模型组网中加入输入占位符
Reader读取文件后,产出的数据喂入网络,需要有占位符进行接收。占位符在Paddle中使用
`fluid.data`
或
`fluid.layers.data`
进行定义。
`data`
的定义可以参考
[
fluid.data
](
https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/data_cn.html#data
)
以及
[
fluid.layers.data
](
https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/layers_cn/data_cn.html#data
)
。
假如您希望输入三个数据,分别是维度32的数据A,维度变长的稀疏数据B,以及一个一维的标签数据C,并希望梯度可以经过该变量向前传递,则示例如下:
数据A的定义:
```
python
var_a
=
fluid
.
data
(
name
=
'A'
,
shape
=
[
-
1
,
32
],
dtype
=
'float32'
)
```
数据B的定义,变长数据的使用可以参考
[
LoDTensor
](
https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#cn-user-guide-lod-tensor
)
:
```
python
var_b
=
fluid
.
data
(
name
=
'B'
,
shape
=
[
-
1
,
1
],
lod_level
=
1
,
dtype
=
'int64'
)
```
数据C的定义:
```
python
var_c
=
fluid
.
data
(
name
=
'C'
,
shape
=
[
-
1
,
1
],
dtype
=
'int32'
)
var_c
.
stop_gradient
=
False
```
当我们完成以上三个数据的定义后,在PaddleRec的模型定义中,还需将其加入model基类成员变量
`self._data_var`
```
python
self
.
_data_var
.
append
(
var_a
)
self
.
_data_var
.
append
(
var_b
)
self
.
_data_var
.
append
(
var_c
)
```
至此,我们完成了在组网中定义输入数据的工作。
## Reader的实现
### Reader的实现范式
Reader的逻辑需要一个单独的python文件进行描述。我们试写一个
`test_reader.py`
,实现的具体流程如下:
1.
首先我们需要引入Reader基类
```python
from paddlerec.core.reader import ReaderBase
```
2.
创建一个子类,继承Reader的基类,训练所需Reader命名为
`TrainerReader`
```
python
class
TrainerReader
(
ReaderBase
):
def
init
(
self
):
pass
def
generate_sample
(
self
,
line
):
pass
```
3.
在
`init(self)`
函数中声明一些在数据读取中会用到的变量,必要时可以在
`config.yaml`
文件中配置变量,利用
`env.get_global_env()`
拿到。
比如,我们希望从yaml文件中读取一个数据预处理变量
`avg=10`
,目的是将数据A的数据缩小10倍,可以这样实现:
首先更改yaml文件,在某个space下加入该变量
```yaml
...
train:
reader:
avg: 10
...
```
再更改Reader的init函数
```python
from paddlerec.core.utils import envs
class TrainerReader(Reader):
def init(self):
self.avg = envs.get_global_env("avg", None, "train.reader")
    def generate_sample(self, line):
pass
```
4.
继承并实现基类中的
`generate_sample(self, line)`
函数,逐行读取数据。
-
该函数应返回一个可以迭代的reader方法(带有yield的函数不再是一个普通的函数,而是一个生成器generator,成为了可以迭代的对象,等价于一个数组、链表、文件、字符串etc.)
-
在这个可以迭代的函数中,如示例代码中的
`def reader()`
,我们定义数据读取的逻辑。以行为单位的数据进行截取,转换及预处理。
-
最后,我们需要将数据整理为特定的格式,才能够被PaddleRec的Reader正确读取,并灌入的训练的网络中。简单来说,数据的输出顺序与我们在网络中创建的
`inputs`
必须是严格一一对应的,并转换为类似字典的形式。
示例: 假设数据ABC在文本数据中,每行以这样的形式存储:
```
shell
0.1,0.2,0.3...3.0,3.1,3.2
\t
99999,99998,99997
\t
1
\n
```
则示例代码如下:
```python
from paddlerec.core.utils import envs
class TrainerReader(Reader):
def init(self):
self.avg = envs.get_global_env("avg", None, "train.reader")
    def generate_sample(self, line):
def reader(self, line):
# 先分割 '\n', 再以 '\t'为标志分割为list
variables = (line.strip('\n')).split('\t')
# A是第一个元素,并且每个数据之间使用','分割
var_a = variables[0].split(',') # list
var_a = [float(i) / self.avg for i in var_a] # 将str数据转换为float
# B是第二个元素,同样以 ',' 分割
var_b = variables[1].split(',') # list
var_b = [int(i) for i in var_b] # 将str数据转换为int
# C是第三个元素, 只有一个元素,没有分割符
var_c = variables[2]
var_c = int(var_c) # 将str数据转换为int
var_c = [var_c] # 将单独的数据元素置入list中
# 将数据与数据名结合,组织为dict的形式
# 如下,output形式为{ A: var_a, B: var_b, C: var_c}
variable_name = ['A', 'B', 'C']
output = zip(variable_name, [var_a] + [var_b] + [var_c])
# 将数据输出,使用yield方法,将该函数变为了一个可迭代的对象
yield output
```
至此,我们完成了Reader的实现。
### 在yaml文件中配置Reader
在模型的yaml配置文件中,主要的修改是三个,如下
```
yaml
reader
:
batch_size
:
2
class
:
"
{workspace}/reader.py"
train_data_path
:
"
{workspace}/data/train_data"
reader_debug_mode
:
False
```
batch_size: 顾名思义,是小批量训练时的样本大小
class: 运行改模型所需reader的路径
train_data_path: 训练数据所在文件夹
reader_debug_mode: 测试reader语法,及输出是否符合预期的debug模式的开关
## 数据及Reader示例-DNN
Reader代码来源于
[
criteo_reader.py
](
../models/rank/criteo_reader.py
)
, 组网代码来源于
[
model.py
](
../models/rank/dnn/model.py
)
### Criteo数据集格式
CTR-DNN训练及测试数据集选用
[
Display Advertising Challenge
](
https://www.kaggle.com/c/criteo-display-ad-challenge/
)
所用的Criteo数据集。该数据集包括两部分:训练集和测试集。训练集包含一段时间内Criteo的部分流量,测试集则对应训练数据后一天的广告点击流量。
每一行数据格式如下所示:
```
bash
<label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
```
其中
```<label>```
表示广告是否被点击,点击用1表示,未点击用0表示。
```<integer feature>```
代表数值特征(连续特征),共有13个连续特征。
```<categorical feature>```
代表分类特征(离散特征),共有26个离散特征。相邻两个特征用
```\t```
分隔,缺失特征用空格表示。测试集中
```<label>```
特征已被移除。
### Criteo数据集的预处理
数据预处理共包括两步:
-
将原始训练集按9:1划分为训练集和验证集
-
数值特征(连续特征)需进行归一化处理,但需要注意的是,对每一个特征
```<integer feature i>```
,归一化时用到的最大值并不是用全局最大值,而是取排序后95%位置处的特征值作为最大值,同时保留极值。
### CTR网络输入的定义
正如前所述,Criteo数据集中,分为连续数据与离散(稀疏)数据,所以整体而言,CTR-DNN模型的数据输入层包括三个,分别是:
`dense_input`
用于输入连续数据,维度由超参数
`dense_feature_dim`
指定,数据类型是归一化后的浮点型数据。
`sparse_input_ids`
用于记录离散数据,在Criteo数据集中,共有26个slot,所以我们创建了名为
`C1~C26`
的26个稀疏参数输入,并设置
`lod_level=1`
,代表其为变长数据,数据类型为整数;最后是每条样本的
`label`
,代表了是否被点击,数据类型是整数,0代表负样例,1代表正样例。
在Paddle中数据输入的声明使用
`paddle.fluid.layers.data()`
,会创建指定类型的占位符,数据IO会依据此定义进行数据的输入。
稀疏参数输入的定义:
```
python
def
sparse_inputs
():
ids
=
envs
.
get_global_env
(
"hyper_parameters.sparse_inputs_slots"
,
None
)
sparse_input_ids
=
[
fluid
.
layers
.
data
(
name
=
"S"
+
str
(
i
),
shape
=
[
1
],
lod_level
=
1
,
dtype
=
"int64"
)
for
i
in
range
(
1
,
ids
)
]
return
sparse_input_ids
```
稠密参数输入的定义:
```
python
def
dense_input
():
dim
=
envs
.
get_global_env
(
"hyper_parameters.dense_input_dim"
,
None
)
dense_input_var
=
fluid
.
layers
.
data
(
name
=
"D"
,
shape
=
[
dim
],
dtype
=
"float32"
)
return
dense_input_var
```
标签的定义:
```
python
def
label_input
():
label
=
fluid
.
layers
.
data
(
name
=
"click"
,
shape
=
[
1
],
dtype
=
"int64"
)
return
label
```
组合起来,正确的声明他们:
```
python
self
.
sparse_inputs
=
sparse_inputs
()
self
.
dense_input
=
dense_input
()
self
.
label_input
=
label_input
()
self
.
_data_var
.
append
(
self
.
dense_input
)
for
input
in
self
.
sparse_inputs
:
self
.
_data_var
.
append
(
input
)
self
.
_data_var
.
append
(
self
.
label_input
)
```
### Criteo Reader写法
```
python
# 引入PaddleRec的Reader基类
from
paddlerec.core.reader
import
ReaderBase
# 引入PaddleRec的读取yaml配置文件的方法
from
paddlerec.core.utils
import
envs
# 定义TrainReader,需要继承 paddlerec.core.reader.Reader
class
Reader
(
ReaderBase
):
# 数据预处理逻辑,继承自基类
# 如果无需处理, 使用pass跳过该函数的执行
def
init
(
self
):
self
.
cont_min_
=
[
0
,
-
3
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
]
self
.
cont_max_
=
[
20
,
600
,
100
,
50
,
64000
,
500
,
100
,
50
,
500
,
10
,
10
,
10
,
50
]
self
.
cont_diff_
=
[
20
,
603
,
100
,
50
,
64000
,
500
,
100
,
50
,
500
,
10
,
10
,
10
,
50
]
self
.
hash_dim_
=
envs
.
get_global_env
(
"hyper_parameters.sparse_feature_number"
,
None
,
"train.model"
)
self
.
continuous_range_
=
range
(
1
,
14
)
self
.
categorical_range_
=
range
(
14
,
40
)
# 读取数据方法,继承自基类
# 实现可以迭代的reader函数,逐行处理数据
def
generate_sample
(
self
,
line
):
"""
Read the data line by line and process it as a dictionary
"""
def
reader
():
"""
This function needs to be implemented by the user, based on data format
"""
features
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
dense_feature
=
[]
sparse_feature
=
[]
for
idx
in
self
.
continuous_range_
:
if
features
[
idx
]
==
""
:
dense_feature
.
append
(
0.0
)
else
:
dense_feature
.
append
(
(
float
(
features
[
idx
])
-
self
.
cont_min_
[
idx
-
1
])
/
self
.
cont_diff_
[
idx
-
1
])
for
idx
in
self
.
categorical_range_
:
sparse_feature
.
append
(
[
hash
(
str
(
idx
)
+
features
[
idx
])
%
self
.
hash_dim_
])
label
=
[
int
(
features
[
0
])]
feature_name
=
[
"D"
]
for
idx
in
self
.
categorical_range_
:
feature_name
.
append
(
"S"
+
str
(
idx
-
13
))
feature_name
.
append
(
"label"
)
yield
zip
(
feature_name
,
[
dense_feature
]
+
sparse_feature
+
[
label
])
return
reader
```
### 调试Reader
在Linux下运行时,默认启动
`Dataset`
模式,在Win/Mac下运行时,默认启动
`Dataloader`
模式。
通过在
`config.yaml`
中添加或修改
`reader_debug_mode=True`
打开debug模式,只会结合组网运行reader的部分,读取10条样本,并print,方便您观察格式是否符合预期或隐藏bug。
```
yaml
reader
:
batch_size
:
2
class
:
"
{workspace}/../criteo_reader.py"
train_data_path
:
"
{workspace}/data/train"
reader_debug_mode
:
True
```
修改后,使用paddlerec.run执行该修改后的yaml文件,可以观察输出。
```
bash
python
-m
paddlerec.run
-m
./models/rank/dnn/config.yaml
-e
single
```
### Dataset调试
dataset输出的数据格式如下:
` dense_input:size ; dense_input:value ; sparse_input:size ; sparse_input:value ; ... ; sparse_input:size ; sparse_input:value ; label:size ; label:value `
基本规律是对于每个变量,会先输出其维度大小,再输出其具体值。
直接debug
`criteo_reader`
理想的输出为(截取了一个片段):
```
bash
...
13 0.0 0.00497512437811 0.05 0.08 0.207421875 0.028 0.35 0.08 0.082 0.0 0.4 0.0 0.08 1 737395 1 210498 1 903564 1 286224 1 286835 1 906818 1 90
6116 1 67180 1 27346 1 51086 1 142177 1 95024 1 157883 1 873363 1 600281 1 812592 1 228085 1 35900 1 880474 1 984402 1 100885 1 26235 1 410878 1 798162 1 499868 1 306163 1 0
...
```
可以看到首先输出的是13维的dense参数,随后是分立的sparse参数,最后一个是1维的label,数值为0,输出符合预期。
>使用Dataset的一些注意事项
> - Dataset的基本原理:将数据print到缓存,再由C++端的代码实现读取,因此,我们不能在dataset的读取代码中,加入与数据读取无关的print信息,会导致C++端拿到错误的数据信息。
> - dataset目前只支持在`Ubuntu`及`CentOS`等标准Linux环境下使用,在`Windows`及`Mac`下使用时,会产生预料之外的错误,请知悉。
### DataLoader调试
dataloader的输出格式为
`list: [ list[var_1], list[var_2], ... , list[var_3]]`
,每条样本的数据会被放在一个
**list[list]**
中,list[0]为第一个variable。
直接debug
`criteo_reader`
理想的输出为(截取了一个片段):
```
bash
...
[[
0.0, 0.004975124378109453, 0.05, 0.08, 0.207421875, 0.028, 0.35, 0.08, 0.082, 0.0, 0.4, 0.0, 0.08],
[
560746],
[
902436],
[
262029],
[
182633],
[
368411],
[
735166],
[
321120],
[
39572],
[
185732],
[
140298],
[
926671],
[
81559],
[
461249],
[
728372],
[
915018],
[
907965],
[
818961],
[
850958],
[
311492],
[
980340],
[
254960],
[
175041],
[
524857],
[
764893],
[
526288],
[
220126],
[
0]]
...
```
可以看到首先输出的是13维的dense参数的list,随后是分立的sparse参数,各自在一个list中,最后一个是1维的label的list,数值为0,输出符合预期。
doc/distributed_train.md
浏览文件 @
752f075e
...
...
@@ -48,7 +48,7 @@
```
yaml
# workspace
workspace
:
"
paddlerec.models.rank.
dnn"
workspace
:
"
models/rank/
dnn"
mode
:
[
single_cpu_train
]
runner
:
...
...
doc/model_develop.md
浏览文件 @
752f075e
...
...
@@ -92,7 +92,7 @@ def input_data(self, is_infer=False, **kwargs):
return
train_inputs
```
更多数据读取教程,请参考
[
自定义数据集及Reader
](
custom_
dataset_
reader.md
)
更多数据读取教程,请参考
[
自定义数据集及Reader
](
custom_reader.md
)
### 组网的定义
...
...
doc/pre_train_model.md
0 → 100644
浏览文件 @
752f075e
# PaddleRec 预训练模型
PaddleRec基于业务实践,使用真实数据,产出了推荐领域算法的若干预训练模型,方便开发者进行算法调研。
## 文本分类预训练模型
### 获取地址
```
bash
wget xxx.tar.gz
```
### 使用方法
解压后,得到的是一个paddle的模型文件夹,使用
`PaddleRec/models/contentunderstanding/classification_finetue`
模型进行加载
doc/train.md
浏览文件 @
752f075e
...
...
@@ -20,7 +20,7 @@ python -m paddlerec.run -m paddlerec.models.xxx.yyy
例如启动
`recall`
下的
`word2vec`
模型的默认配置;
```
shell
python
-m
paddlerec.run
-m
paddlerec.models.recall.
word2vec
python
-m
paddlerec.run
-m
models/recall/
word2vec
```
### 2. 启动内置模型的个性化配置训练
...
...
models/contentunderstanding/classification/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.contentunderstanding.
classification"
workspace
:
"
models/contentunderstanding/
classification"
dataset
:
-
name
:
data1
...
...
models/contentunderstanding/readme.md
浏览文件 @
752f075e
...
...
@@ -39,8 +39,11 @@
##使用教程(快速开始)
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.tagspace
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
python -m paddlerec.run -m models/contentunderstanding/tagspace/config.yaml
python -m paddlerec.run -m models/contentunderstanding/classification/config.yaml
```
## 使用教程(复现论文)
...
...
models/contentunderstanding/tagspace/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.contentunderstanding.
tagspace"
workspace
:
"
models/contentunderstanding/
tagspace"
dataset
:
-
name
:
sample_1
...
...
models/demo/movie_recommand/rank/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.demo.
movie_recommand"
workspace
:
"
models/demo/
movie_recommand"
# list of dataset
dataset
:
...
...
models/demo/movie_recommand/recall/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.demo.
movie_recommand"
workspace
:
"
models/demo/
movie_recommand"
# list of dataset
dataset
:
...
...
models/match/dssm/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
workspace
:
"
paddlerec.models.match.
dssm"
workspace
:
"
models/match/
dssm"
dataset
:
-
name
:
dataset_train
...
...
models/match/match-pyramid/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
workspace
:
"
paddlerec.models.match.
match-pyramid"
workspace
:
"
models/match/
match-pyramid"
dataset
:
-
name
:
dataset_train
...
...
models/match/multiview-simnet/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace
:
"
paddlerec.models.match.
multiview-simnet"
workspace
:
"
models/match/
multiview-simnet"
# list of dataset
dataset
:
...
...
models/match/readme.md
浏览文件 @
752f075e
...
...
@@ -34,8 +34,11 @@
## 使用教程(快速开始)
### 训练
```
shell
python
-m
paddlerec.run
-m
paddlerec.models.match.dssm
# dssm
python
-m
paddlerec.run
-m
paddlerec.models.match.multiview-simnet
# multiview-simnet
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd
paddle-rec
python
-m
paddlerec.run
-m
models/match/dssm/config.yaml
# dssm
python
-m
paddlerec.run
-m
models/match/multiview-simnet/config.yaml
# multiview-simnet
```
### 预测
...
...
models/multitask/esmm/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
workspace
:
"
paddlerec.models.multitask.
esmm"
workspace
:
"
models/multitask/
esmm"
dataset
:
-
name
:
dataset_train
...
...
models/multitask/mmoe/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.multitask.
mmoe"
workspace
:
"
models/multitask/
mmoe"
dataset
:
-
name
:
dataset_train
...
...
models/multitask/readme.md
浏览文件 @
752f075e
...
...
@@ -44,9 +44,12 @@
## 使用教程(快速开始)
```
shell
python
-m
paddlerec.run
-m
paddlerec.models.multitask.mmoe
# mmoe
python
-m
paddlerec.run
-m
paddlerec.models.multitask.share-bottom
# share-bottom
python
-m
paddlerec.run
-m
paddlerec.models.multitask.esmm
# esmm
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd
paddle-rec
python
-m
paddlerec.run
-m
models/multitask/mmoe/config.yaml
# mmoe
python
-m
paddlerec.run
-m
models/multitask/share-bottom/config.yaml
# share-bottom
python
-m
paddlerec.run
-m
models/multitask/esmm/config.yaml
# esmm
```
## 使用教程(复现论文)
...
...
models/multitask/share-bottom/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.multitask.
share-bottom"
workspace
:
"
models/multitask/
share-bottom"
dataset
:
-
name
:
dataset_train
...
...
models/rank/AutoInt/config.yaml
浏览文件 @
752f075e
...
...
@@ -14,7 +14,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
AutoInt"
workspace
:
"
models/rank/
AutoInt"
dataset
:
...
...
models/rank/BST/config.yaml
浏览文件 @
752f075e
...
...
@@ -14,7 +14,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
BST"
workspace
:
"
models/rank/
BST"
dataset
:
-
name
:
sample_1
...
...
models/rank/afm/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
afm"
workspace
:
"
models/rank/
afm"
dataset
:
-
name
:
train_sample
...
...
models/rank/dcn/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
dcn"
workspace
:
"
models/rank/
dcn"
dataset
:
-
name
:
train_sample
...
...
models/rank/deep_crossing/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
deep_crossing"
workspace
:
"
models/rank/
deep_crossing"
dataset
:
-
name
:
train_sample
...
...
models/rank/deepfm/config.yaml
浏览文件 @
752f075e
...
...
@@ -14,7 +14,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
deepfm"
workspace
:
"
models/rank/
deepfm"
dataset
:
...
...
models/rank/dien/config.yaml
浏览文件 @
752f075e
...
...
@@ -14,7 +14,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
dien"
workspace
:
"
models/rank/
dien"
dataset
:
-
name
:
sample_1
...
...
models/rank/din/config.yaml
浏览文件 @
752f075e
...
...
@@ -14,7 +14,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
din"
workspace
:
"
models/rank/
din"
dataset
:
-
name
:
sample_1
...
...
models/rank/dnn/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace
:
"
paddlerec.models.rank.
dnn"
workspace
:
"
models/rank/
dnn"
# list of dataset
dataset
:
...
...
@@ -67,7 +67,6 @@ runner:
save_inference_path
:
"
inference"
# save inference path
save_inference_feed_varnames
:
[]
# feed vars of save inference
save_inference_fetch_varnames
:
[]
# fetch vars of save inference
init_model_path
:
"
"
# load model path
print_interval
:
10
phases
:
[
phase1
]
...
...
models/rank/ffm/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
ffm"
workspace
:
"
models/rank/
ffm"
dataset
:
-
name
:
train_sample
...
...
models/rank/fgcnn/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
fgcnn"
workspace
:
"
models/rank/
fgcnn"
dataset
:
-
name
:
train_sample
...
...
models/rank/fibinet/README.md
浏览文件 @
752f075e
...
...
@@ -132,7 +132,7 @@ CPU环境
### 运行
```
python -m paddlerec.run -m
paddlerec.models.rank.
fibinet
python -m paddlerec.run -m
models/rank/
fibinet
```
...
...
models/rank/fibinet/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace
:
"
paddlerec.models.rank.
fibinet"
workspace
:
"
models/rank/
fibinet"
# list of dataset
dataset
:
...
...
models/rank/flen/README.md
浏览文件 @
752f075e
...
...
@@ -110,7 +110,7 @@ CPU环境
### 运行
```
python -m paddlerec.run -m
paddlerec.models.rank.
flen
python -m paddlerec.run -m
models/rank/
flen
```
## 论文复现
...
...
models/rank/flen/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace
:
"
paddlerec.models.rank.
flen"
workspace
:
"
models/rank/
flen"
# list of dataset
dataset
:
...
...
models/rank/fm/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
fm"
workspace
:
"
models/rank/
fm"
dataset
:
-
name
:
train_sample
...
...
models/rank/fnn/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
fnn"
workspace
:
"
models/rank/
fnn"
dataset
:
-
name
:
train_sample
...
...
models/rank/logistic_regression/config.yaml
浏览文件 @
752f075e
...
...
@@ -14,7 +14,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
logistic_regression"
workspace
:
"
models/rank/
logistic_regression"
dataset
:
...
...
models/rank/nfm/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
nfm"
workspace
:
"
models/rank/
nfm"
dataset
:
-
name
:
train_sample
...
...
models/rank/pnn/config.yaml
浏览文件 @
752f075e
...
...
@@ -15,7 +15,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
pnn"
workspace
:
"
models/rank/
pnn"
dataset
:
-
name
:
train_sample
...
...
models/rank/readme.md
浏览文件 @
752f075e
...
...
@@ -107,7 +107,7 @@ sh run.sh
### 训练
```
cd models/rank/dnn # 进入选定好的排序模型的目录 以DNN为例
python -m paddlerec.run -m
paddlerec.models.rank.dnn
# 使用内置配置
python -m paddlerec.run -m
models/rank/dnn/config.yaml
# 使用内置配置
# 如果需要使用自定义配置,config.yaml中workspace需要使用该模型目录的绝对路径
# 自定义修改超参后,指定配置文件,使用自定义配置
python -m paddlerec.run -m ./config.yaml
...
...
models/rank/wide_deep/config.yaml
浏览文件 @
752f075e
...
...
@@ -14,7 +14,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.rank.
wide_deep"
workspace
:
"
models/rank/
wide_deep"
dataset
:
...
...
models/rank/xdeepfm/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
debug
:
false
workspace
:
"
paddlerec.models.rank.
xdeepfm"
workspace
:
"
models/rank/
xdeepfm"
dataset
:
-
name
:
sample_1
...
...
models/recall/fasttext/config.yaml
浏览文件 @
752f075e
...
...
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.recall.
fasttext"
workspace
:
"
models/recall/
fasttext"
# list of dataset
dataset
:
...
...
models/recall/gnn/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
# workspace
workspace
:
"
paddlerec.models.recall.
gnn"
workspace
:
"
models/recall/
gnn"
# list of dataset
dataset
:
...
...
models/recall/gnn/readme.md
浏览文件 @
752f075e
...
...
@@ -165,7 +165,7 @@ CPU环境
### 运行
```
python -m paddlerec.run -m
paddlerec.models.recall.gnn
python -m paddlerec.run -m
models/recall/gnn/config.yaml
```
### 结果展示
...
...
models/recall/gru4rec/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.recall.
gru4rec"
workspace
:
"
models/recall/
gru4rec"
dataset
:
-
name
:
dataset_train
...
...
models/recall/look-alike_recall/README.md
浏览文件 @
752f075e
...
...
@@ -129,7 +129,7 @@ CPU环境
### 运行
```
python -m paddlerec.run -m
paddlerec.models.recall.look-alike_recal
l
python -m paddlerec.run -m
models/recall/look-alike_recall/config.yam
l
```
...
...
models/recall/look-alike_recall/config.yaml
浏览文件 @
752f075e
...
...
@@ -14,7 +14,7 @@
# global settings
debug
:
false
workspace
:
"
paddlerec.models.recall.
look-alike_recall"
workspace
:
"
models/recall/
look-alike_recall"
dataset
:
-
name
:
sample_1
...
...
models/recall/ncf/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.recall.
ncf"
workspace
:
"
models/recall/
ncf"
dataset
:
-
name
:
dataset_train
...
...
models/recall/readme.md
浏览文件 @
752f075e
...
...
@@ -62,12 +62,15 @@
## 使用教程(快速开始)
###
```
shell
python
-m
paddlerec.run
-m
paddlerec.models.recall.word2vec
# word2vec
python
-m
paddlerec.run
-m
paddlerec.models.recall.ssr
# ssr
python
-m
paddlerec.run
-m
paddlerec.models.recall.gru4rec
# gru4rec
python
-m
paddlerec.run
-m
paddlerec.models.recall.gnn
# gnn
python
-m
paddlerec.run
-m
paddlerec.models.recall.ncf
# ncf
python
-m
paddlerec.run
-m
paddlerec.models.recall.youtube_dnn
# youtube_dnn
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd
paddle-rec
python
-m
paddlerec.run
-m
models/recall/word2vec/config.yaml
# word2vec
python
-m
paddlerec.run
-m
models/recall/ssr/config.yaml
# ssr
python
-m
paddlerec.run
-m
models/recall/gru4rec/config.yaml
# gru4rec
python
-m
paddlerec.run
-m
models/recall/gnn/config.yaml
# gnn
python
-m
paddlerec.run
-m
models/recall/ncf/config.yaml
# ncf
python
-m
paddlerec.run
-m
models/recall/youtube_dnn/config.yaml
# youtube_dnn
```
## 使用教程(复现论文)
...
...
@@ -87,6 +90,9 @@ sh data_prepare.sh
### 训练
```
bash
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd
paddle-rec
cd
models/recall/gnn
# 进入选定好的召回模型的目录 以gnn为例
python
-m
paddlerec.run
-m
./config.yaml
# 自定义修改超参后,指定配置文件,使用自定义配置
```
...
...
models/recall/ssr/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.recall.
ssr"
workspace
:
"
models/recall/
ssr"
dataset
:
-
name
:
dataset_train
...
...
models/recall/word2vec/config.yaml
浏览文件 @
752f075e
...
...
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.recall.word2vec"
workspace
:
"
models/recall/word2vec"
# list of dataset
dataset
:
...
...
models/recall/youtube_dnn/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
workspace
:
"
paddlerec.models.recall.
youtube_dnn"
workspace
:
"
models/recall/
youtube_dnn"
dataset
:
-
name
:
dataset_train
...
...
models/rerank/listwise/config.yaml
浏览文件 @
752f075e
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
workspace
:
"
paddlerec.models.rerank.
listwise"
workspace
:
"
models/rerank/
listwise"
dataset
:
-
name
:
dataset_train
...
...
models/rerank/readme.md
浏览文件 @
752f075e
...
...
@@ -28,7 +28,10 @@
## 使用教程(快速开始)
```
shell
python
-m
paddlerec.run
-m
paddlerec.models.rerank.listwise
# listwise
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd
paddle-rec
python
-m
paddlerec.run
-m
models/rerank/listwise/config.yaml
# listwise
```
## 使用教程(复现论文)
...
...
models/treebased/tdm/README.md
浏览文件 @
752f075e
...
...
@@ -8,7 +8,10 @@
2.
基于单机模型,可以进行分布式的参数服务器训练
```
shell
python
-m
paddlerec.run
-m
paddlerec.models.treebased.tdm
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd
paddle-rec
python
-m
paddlerec.run
-m
models/treebased/tdm/config.yaml
```
## 树结构的准备
...
...
models/treebased/tdm/config.yaml
浏览文件 @
752f075e
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace
:
"
paddlerec.models.treebased.
tdm"
workspace
:
"
models/treebased/
tdm"
# list of dataset
dataset
:
...
...
run.py
浏览文件 @
752f075e
...
...
@@ -16,7 +16,6 @@ import os
import
subprocess
import
sys
import
argparse
import
tempfile
import
warnings
import
copy
...
...
@@ -39,6 +38,7 @@ def engine_registry():
engines
[
"TRANSPILER"
][
"INFER"
]
=
single_infer_engine
engines
[
"TRANSPILER"
][
"LOCAL_CLUSTER_TRAIN"
]
=
local_cluster_engine
engines
[
"TRANSPILER"
][
"CLUSTER_TRAIN"
]
=
cluster_engine
engines
[
"TRANSPILER"
][
"ONLINE_LEARNING"
]
=
online_learning
engines
[
"PSLIB"
][
"TRAIN"
]
=
local_mpi_engine
engines
[
"PSLIB"
][
"LOCAL_CLUSTER_TRAIN"
]
=
local_mpi_engine
engines
[
"PSLIB"
][
"CLUSTER_TRAIN"
]
=
cluster_mpi_engine
...
...
@@ -259,6 +259,20 @@ def single_infer_engine(args):
return
trainer
def
online_learning
(
args
):
trainer
=
"OnlineLearningTrainer"
single_envs
=
{}
single_envs
[
"train.trainer.trainer"
]
=
trainer
single_envs
[
"train.trainer.threads"
]
=
"2"
single_envs
[
"train.trainer.engine"
]
=
"online_learning"
single_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
print
(
"use {} engine to run model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
single_envs
,
args
.
model
)
trainer
=
TrainerFactory
.
create
(
args
.
model
)
return
trainer
def
cluster_engine
(
args
):
def
master
():
from
paddlerec.core.engine.cluster.cluster
import
ClusterEngine
...
...
setup.cfg
已删除
100644 → 0
浏览文件 @
922776b6
[easy_install]
index_url=http://pip.baidu.com/pypi/simple
\ No newline at end of file
setup.py
浏览文件 @
752f075e
...
...
@@ -38,15 +38,18 @@ readme = ""
def
build
(
dirname
):
package_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
shutil
.
copytree
(
package_dir
,
dirname
,
ignore
=
shutil
.
ignore_patterns
(
".git"
))
package_dir
,
dirname
,
ignore
=
shutil
.
ignore_patterns
(
".git"
,
"models"
,
"build"
,
"dist"
,
"*.md"
))
os
.
mkdir
(
os
.
path
.
join
(
dirname
,
"paddlerec"
))
shutil
.
move
(
os
.
path
.
join
(
dirname
,
"core"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
))
shutil
.
move
(
os
.
path
.
join
(
dirname
,
"doc"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
))
shutil
.
move
(
os
.
path
.
join
(
dirname
,
"models"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
))
shutil
.
move
(
os
.
path
.
join
(
dirname
,
"tests"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
))
shutil
.
move
(
...
...
@@ -63,17 +66,8 @@ def build(dirname):
package_dir
=
{
''
:
dirname
}
package_data
=
{}
models_copy
=
[
'data/*.txt'
,
'data/*/*.txt'
,
'*.yaml'
,
'*.sh'
,
'tree/*.npy'
,
'tree/*.txt'
,
'data/sample_data/*'
,
'data/sample_data/train/*'
,
'data/sample_data/infer/*'
,
'data/*/*.csv'
,
'Criteo_data/*'
,
'Criteo_data/sample_data/train/*'
]
engine_copy
=
[
'*/*.sh'
,
'*/*.template'
]
for
package
in
packages
:
if
package
.
startswith
(
"paddlerec.models."
):
package_data
[
package
]
=
models_copy
if
package
.
startswith
(
"paddlerec.core.engine"
):
package_data
[
package
]
=
engine_copy
...
...
@@ -98,16 +92,6 @@ build(dirname)
shutil
.
rmtree
(
dirname
)
print
(
u
'''
\033
[32m
██████╗ █████╗ ██████╗ ██████╗ ██╗ ███████╗██████╗ ███████╗ ██████╗
██╔══██╗██╔══██╗██╔══██╗██╔══██╗██║ ██╔════╝██╔══██╗██╔════╝██╔════╝
██████╔╝███████║██║ ██║██║ ██║██║ █████╗ ██████╔╝█████╗ ██║
██╔═══╝ ██╔══██║██║ ██║██║ ██║██║ ██╔══╝ ██╔══██╗██╔══╝ ██║
██║ ██║ ██║██████╔╝██████╔╝███████╗███████╗██║ ██║███████╗╚██████╗
╚═╝ ╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚══════╝╚══════╝╚═╝ ╚═╝╚══════╝ ╚═════╝
\033
[0m
\033
[34m
Installation Complete. Congratulations!
How to use it ? Please visit our webside: https://github.com/PaddlePaddle/PaddleRec
\033
[0m
'''
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录