Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
c68f2694
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c68f2694
编写于
5月 20, 2020
作者:
T
tangwei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix code style
上级
66e1859f
变更
49
展开全部
隐藏空白更改
内联
并排
Showing
49 changed file
with
548 addition
and
190 deletion
+548
-190
core/engine/cluster/cloud/__init__.py
core/engine/cluster/cloud/__init__.py
+13
-0
core/modules/coding/layers.py
core/modules/coding/layers.py
+13
-0
core/trainers/__init__.py
core/trainers/__init__.py
+0
-3
core/trainers/ctr_coding_trainer.py
core/trainers/ctr_coding_trainer.py
+14
-9
core/trainers/ctr_modul_trainer.py
core/trainers/ctr_modul_trainer.py
+145
-71
doc/benchmark.md
doc/benchmark.md
+1
-1
doc/contribute.md
doc/contribute.md
+1
-1
doc/design.md
doc/design.md
+1
-1
doc/distributed_train.md
doc/distributed_train.md
+0
-2
doc/faq.md
doc/faq.md
+1
-1
doc/local_train.md
doc/local_train.md
+1
-1
doc/model_list.md
doc/model_list.md
+0
-1
doc/optimization_model.md
doc/optimization_model.md
+1
-1
doc/predict.md
doc/predict.md
+1
-1
doc/ps_background.md
doc/ps_background.md
+0
-1
models/contentunderstanding/__init__.py
models/contentunderstanding/__init__.py
+13
-0
models/contentunderstanding/classification/config.yaml
models/contentunderstanding/classification/config.yaml
+0
-1
models/contentunderstanding/classification/model.py
models/contentunderstanding/classification/model.py
+5
-2
models/contentunderstanding/classification/reader.py
models/contentunderstanding/classification/reader.py
+2
-2
models/contentunderstanding/readme.md
models/contentunderstanding/readme.md
+0
-1
models/contentunderstanding/tagspace/config.yaml
models/contentunderstanding/tagspace/config.yaml
+0
-1
models/match/__init__.py
models/match/__init__.py
+13
-0
models/match/readme.md
models/match/readme.md
+0
-1
models/multitask/__init__.py
models/multitask/__init__.py
+13
-0
models/multitask/readme.md
models/multitask/readme.md
+0
-1
models/rank/__init__.py
models/rank/__init__.py
+13
-0
models/rank/dcn/data/download.py
models/rank/dcn/data/download.py
+14
-0
models/rank/dcn/data/get_slot_data.py
models/rank/dcn/data/get_slot_data.py
+4
-3
models/rank/dcn/data/preprocess.py
models/rank/dcn/data/preprocess.py
+14
-0
models/rank/deepfm/data/download_preprocess.py
models/rank/deepfm/data/download_preprocess.py
+14
-0
models/rank/deepfm/data/get_slot_data.py
models/rank/deepfm/data/get_slot_data.py
+7
-3
models/rank/deepfm/data/preprocess.py
models/rank/deepfm/data/preprocess.py
+14
-0
models/rank/din/data/build_dataset.py
models/rank/din/data/build_dataset.py
+14
-0
models/rank/din/data/convert_pd.py
models/rank/din/data/convert_pd.py
+14
-0
models/rank/din/data/remap_id.py
models/rank/din/data/remap_id.py
+14
-0
models/rank/dnn/data/get_slot_data.py
models/rank/dnn/data/get_slot_data.py
+4
-2
models/rank/wide_deep/data/data_preparation.py
models/rank/wide_deep/data/data_preparation.py
+90
-43
models/rank/wide_deep/data/get_slot_data.py
models/rank/wide_deep/data/get_slot_data.py
+4
-1
models/rank/xdeepfm/data/download.py
models/rank/xdeepfm/data/download.py
+14
-0
models/rank/xdeepfm/data/get_slot_data.py
models/rank/xdeepfm/data/get_slot_data.py
+4
-1
models/recall/gnn/data_process.sh
models/recall/gnn/data_process.sh
+0
-2
models/recall/gnn/raw_data/convert_data.py
models/recall/gnn/raw_data/convert_data.py
+16
-0
models/recall/gnn/raw_data/download.py
models/recall/gnn/raw_data/download.py
+14
-0
models/recall/readme.md
models/recall/readme.md
+0
-1
models/recall/word2vec/prepare_data.sh
models/recall/word2vec/prepare_data.sh
+0
-3
models/treebased/README.md
models/treebased/README.md
+1
-1
models/treebased/tdm/tree/layer_list.txt
models/treebased/tdm/tree/layer_list.txt
+1
-1
run.py
run.py
+22
-11
setup.py
setup.py
+18
-15
未找到文件。
core/engine/cluster/cloud/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
core/modules/coding/layers.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
core/trainers/__init__.py
浏览文件 @
c68f2694
...
@@ -11,7 +11,6 @@
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""
"""
trainer implement.
trainer implement.
...
@@ -22,5 +21,3 @@ Trainer
...
@@ -22,5 +21,3 @@ Trainer
↘ (for online learning training) OnlineLearningTrainer
↘ (for online learning training) OnlineLearningTrainer
"""
"""
core/trainers/ctr_coding_trainer.py
浏览文件 @
c68f2694
...
@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
...
@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
reader_class
=
envs
.
get_global_env
(
"class"
,
None
,
namespace
)
reader_class
=
envs
.
get_global_env
(
"class"
,
None
,
namespace
)
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
reader
=
os
.
path
.
join
(
abs_dir
,
'../utils'
,
'dataset_instance.py'
)
reader
=
os
.
path
.
join
(
abs_dir
,
'../utils'
,
'dataset_instance.py'
)
pipe_cmd
=
"python {} {} {} {}"
.
format
(
reader
,
reader_class
,
"TRAIN"
,
self
.
_config_yaml
)
pipe_cmd
=
"python {} {} {} {}"
.
format
(
reader
,
reader_class
,
"TRAIN"
,
train_data_path
=
envs
.
get_global_env
(
"train_data_path"
,
None
,
namespace
)
self
.
_config_yaml
)
train_data_path
=
envs
.
get_global_env
(
"train_data_path"
,
None
,
namespace
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
()
dataset
=
fluid
.
DatasetFactory
().
create_dataset
()
dataset
.
set_use_var
(
inputs
)
dataset
.
set_use_var
(
inputs
)
...
@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
...
@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
self
.
model
.
train_net
()
self
.
model
.
train_net
()
optimizer
=
self
.
model
.
optimizer
()
optimizer
=
self
.
model
.
optimizer
()
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
{
"use_cvm"
:
False
})
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
{
"use_cvm"
:
False
})
optimizer
.
minimize
(
self
.
model
.
get_avg_cost
())
optimizer
.
minimize
(
self
.
model
.
get_avg_cost
())
if
fleet
.
is_server
():
if
fleet
.
is_server
():
...
@@ -118,16 +121,18 @@ class CtrTrainer(Trainer):
...
@@ -118,16 +121,18 @@ class CtrTrainer(Trainer):
gs
=
shuf
*
0
gs
=
shuf
*
0
fleet
.
_role_maker
.
_node_type_comm
.
Allreduce
(
shuf
,
gs
)
fleet
.
_role_maker
.
_node_type_comm
.
Allreduce
(
shuf
,
gs
)
print
(
"trainer id: {}, trainers: {}, gs: {}"
.
format
(
fleet
.
worker_index
(),
fleet
.
worker_num
(),
gs
))
print
(
"trainer id: {}, trainers: {}, gs: {}"
.
format
(
fleet
.
worker_index
(
),
fleet
.
worker_num
(),
gs
))
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
for
i
in
range
(
epochs
):
for
i
in
range
(
epochs
):
self
.
_exe
.
train_from_dataset
(
program
=
fluid
.
default_main_program
(),
self
.
_exe
.
train_from_dataset
(
dataset
=
dataset
,
program
=
fluid
.
default_main_program
(),
fetch_list
=
self
.
fetch_vars
,
dataset
=
dataset
,
fetch_info
=
self
.
fetch_alias
,
fetch_list
=
self
.
fetch_vars
,
print_period
=
self
.
fetch_period
)
fetch_info
=
self
.
fetch_alias
,
print_period
=
self
.
fetch_period
)
context
[
'status'
]
=
'terminal_pass'
context
[
'status'
]
=
'terminal_pass'
fleet
.
stop_worker
()
fleet
.
stop_worker
()
...
...
core/trainers/ctr_modul_trainer.py
浏览文件 @
c68f2694
此差异已折叠。
点击以展开。
doc/benchmark.md
浏览文件 @
c68f2694
# PaddleRec Benchmark
# PaddleRec Benchmark
> 占位
> 占位
\ No newline at end of file
doc/contribute.md
浏览文件 @
c68f2694
# PaddleRec 贡献代码
# PaddleRec 贡献代码
> 占位
> 占位
\ No newline at end of file
doc/design.md
浏览文件 @
c68f2694
...
@@ -279,4 +279,4 @@ class Metric(object):
...
@@ -279,4 +279,4 @@ class Metric(object):
pass
pass
```
```
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考
[
auc_metric.py
](
../core/metrics/auc_metrics.py
)
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考
[
auc_metric.py
](
../core/metrics/auc_metrics.py
)
\ No newline at end of file
doc/distributed_train.md
浏览文件 @
c68f2694
...
@@ -7,5 +7,3 @@
...
@@ -7,5 +7,3 @@
### K8S集群运行分布式
### K8S集群运行分布式
> 占位
> 占位
doc/faq.md
浏览文件 @
c68f2694
# 常见问题FAQ
# 常见问题FAQ
> 占位
> 占位
\ No newline at end of file
doc/local_train.md
浏览文件 @
c68f2694
# PaddleRec 单机训练
# PaddleRec 单机训练
> 占位
> 占位
\ No newline at end of file
doc/model_list.md
浏览文件 @
c68f2694
...
@@ -12,4 +12,3 @@
...
@@ -12,4 +12,3 @@
| 多任务 |
[
ESMM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 多任务 |
[
ESMM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
DSSM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
DSSM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
Multiview-Simnet
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
Multiview-Simnet
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
doc/optimization_model.md
浏览文件 @
c68f2694
# PaddleRec 模型调参
# PaddleRec 模型调参
> 占位
> 占位
\ No newline at end of file
doc/predict.md
浏览文件 @
c68f2694
# PaddleRec 离线预测
# PaddleRec 离线预测
\ No newline at end of file
doc/ps_background.md
浏览文件 @
c68f2694
...
@@ -5,4 +5,3 @@
...
@@ -5,4 +5,3 @@
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
models/contentunderstanding/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/contentunderstanding/classification/config.yaml
浏览文件 @
c68f2694
...
@@ -37,4 +37,3 @@ train:
...
@@ -37,4 +37,3 @@ train:
dirname
:
"
inference"
dirname
:
"
inference"
epoch_interval
:
100
epoch_interval
:
100
save_last
:
True
save_last
:
True
models/contentunderstanding/classification/model.py
浏览文件 @
c68f2694
...
@@ -31,7 +31,8 @@ class Model(ModelBase):
...
@@ -31,7 +31,8 @@ class Model(ModelBase):
def
train_net
(
self
):
def
train_net
(
self
):
""" network definition """
""" network definition """
data
=
fluid
.
data
(
name
=
"input"
,
shape
=
[
None
,
self
.
max_len
],
dtype
=
'int64'
)
data
=
fluid
.
data
(
name
=
"input"
,
shape
=
[
None
,
self
.
max_len
],
dtype
=
'int64'
)
label
=
fluid
.
data
(
name
=
"label"
,
shape
=
[
None
,
1
],
dtype
=
'int64'
)
label
=
fluid
.
data
(
name
=
"label"
,
shape
=
[
None
,
1
],
dtype
=
'int64'
)
seq_len
=
fluid
.
data
(
name
=
"seq_len"
,
shape
=
[
None
],
dtype
=
'int64'
)
seq_len
=
fluid
.
data
(
name
=
"seq_len"
,
shape
=
[
None
],
dtype
=
'int64'
)
...
@@ -51,7 +52,9 @@ class Model(ModelBase):
...
@@ -51,7 +52,9 @@ class Model(ModelBase):
# full connect layer
# full connect layer
fc_1
=
fluid
.
layers
.
fc
(
input
=
[
conv
],
size
=
self
.
hid_dim
)
fc_1
=
fluid
.
layers
.
fc
(
input
=
[
conv
],
size
=
self
.
hid_dim
)
# softmax layer
# softmax layer
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_1
],
size
=
self
.
class_dim
,
act
=
"softmax"
)
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_1
],
size
=
self
.
class_dim
,
act
=
"softmax"
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
acc
=
fluid
.
layers
.
accuracy
(
input
=
prediction
,
label
=
label
)
acc
=
fluid
.
layers
.
accuracy
(
input
=
prediction
,
label
=
label
)
...
...
models/contentunderstanding/classification/reader.py
浏览文件 @
c68f2694
...
@@ -12,7 +12,6 @@
...
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
sys
import
sys
from
paddlerec.core.reader
import
Reader
from
paddlerec.core.reader
import
Reader
...
@@ -38,7 +37,8 @@ class TrainReader(Reader):
...
@@ -38,7 +37,8 @@ class TrainReader(Reader):
data
=
[
int
(
i
)
for
i
in
data
]
data
=
[
int
(
i
)
for
i
in
data
]
label
=
[
int
(
i
)
for
i
in
label
]
label
=
[
int
(
i
)
for
i
in
label
]
seq_len
=
[
int
(
i
)
for
i
in
seq_len
]
seq_len
=
[
int
(
i
)
for
i
in
seq_len
]
print
>>
sys
.
stderr
,
str
([(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)])
print
>>
sys
.
stderr
,
str
(
[(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)])
yield
[(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)]
yield
[(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)]
return
data_iter
return
data_iter
models/contentunderstanding/readme.md
浏览文件 @
c68f2694
...
@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
...
@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
models/contentunderstanding/tagspace/config.yaml
浏览文件 @
c68f2694
...
@@ -47,4 +47,3 @@ train:
...
@@ -47,4 +47,3 @@ train:
dirname
:
"
inference"
dirname
:
"
inference"
epoch_interval
:
100
epoch_interval
:
100
save_last
:
True
save_last
:
True
models/match/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/match/readme.md
浏览文件 @
c68f2694
...
@@ -37,4 +37,3 @@
...
@@ -37,4 +37,3 @@
python
-m
paddlerec.run
-m
paddlerec.models.match.dssm
# dssm
python
-m
paddlerec.run
-m
paddlerec.models.match.dssm
# dssm
python
-m
paddlerec.run
-m
paddlerec.models.match.multiview-simnet
# multiview-simnet
python
-m
paddlerec.run
-m
paddlerec.models.match.multiview-simnet
# multiview-simnet
```
```
models/multitask/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/multitask/readme.md
浏览文件 @
c68f2694
...
@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
...
@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
models/rank/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/rank/dcn/data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
sys
import
sys
import
io
import
io
...
...
models/rank/dcn/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -26,8 +26,8 @@ from collections import Counter
...
@@ -26,8 +26,8 @@ from collections import Counter
import
os
import
os
import
paddle.fluid.incubate.data_generator
as
dg
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
if
idx
==
2
else
math
.
log
(
1
+
float
(
features
[
idx
])))
if
idx
==
2
else
math
.
log
(
1
+
float
(
features
[
idx
])))
for
idx
in
self
.
cat_idx_
:
for
idx
in
self
.
cat_idx_
:
if
features
[
idx
]
==
''
or
features
[
if
features
[
idx
]
==
''
or
features
[
idx
]
not
in
self
.
cat_feat_idx_dict_list
[
idx
-
14
]:
idx
]
not
in
self
.
cat_feat_idx_dict_list
[
idx
-
14
]:
label_feat_list
[
idx
].
append
(
0
)
label_feat_list
[
idx
].
append
(
0
)
else
:
else
:
label_feat_list
[
idx
].
append
(
self
.
cat_feat_idx_dict_list
[
label_feat_list
[
idx
].
append
(
self
.
cat_feat_idx_dict_list
[
idx
-
14
][
features
[
idx
]])
idx
-
14
][
features
[
idx
]])
label_feat_list
[
0
].
append
(
int
(
features
[
0
]))
label_feat_list
[
0
].
append
(
int
(
features
[
0
]))
return
label_feat_list
return
label_feat_list
...
@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
init
()
reader
.
run_from_stdin
()
reader
.
run_from_stdin
()
models/rank/dcn/data/preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
,
absolute_import
,
division
from
__future__
import
print_function
,
absolute_import
,
division
import
os
import
os
...
...
models/rank/deepfm/data/download_preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
shutil
import
shutil
import
sys
import
sys
...
...
models/rank/deepfm/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -19,8 +19,9 @@ try:
...
@@ -19,8 +19,9 @@ try:
import
cPickle
as
pickle
import
cPickle
as
pickle
except
ImportError
:
except
ImportError
:
import
pickle
import
pickle
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self
.
categorical_range_
=
range
(
14
,
40
)
self
.
categorical_range_
=
range
(
14
,
40
)
# load preprocessed feature dict
# load preprocessed feature dict
self
.
feat_dict_name
=
"aid_data/feat_dict_10.pkl2"
self
.
feat_dict_name
=
"aid_data/feat_dict_10.pkl2"
self
.
feat_dict_
=
pickle
.
load
(
open
(
self
.
feat_dict_name
,
'rb'
))
self
.
feat_dict_
=
pickle
.
load
(
open
(
self
.
feat_dict_name
,
'rb'
))
def
_process_line
(
self
,
line
):
def
_process_line
(
self
,
line
):
features
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
features
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
...
@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
def
data_iter
():
def
data_iter
():
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
s
=
""
s
=
""
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
k
=
i
[
0
]
k
=
i
[
0
]
v
=
i
[
1
]
v
=
i
[
1
]
for
j
in
v
:
for
j
in
v
:
s
+=
" "
+
k
+
":"
+
str
(
j
)
s
+=
" "
+
k
+
":"
+
str
(
j
)
print
s
.
strip
()
print
s
.
strip
()
yield
None
yield
None
return
data_iter
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
init
()
reader
.
run_from_stdin
()
reader
.
run_from_stdin
()
models/rank/deepfm/data/preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
numpy
import
numpy
from
collections
import
Counter
from
collections
import
Counter
...
...
models/rank/din/data/build_dataset.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
print_function
import
random
import
random
import
pickle
import
pickle
...
...
models/rank/din/data/convert_pd.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
print_function
import
pickle
import
pickle
import
pandas
as
pd
import
pandas
as
pd
...
...
models/rank/din/data/remap_id.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
print_function
import
random
import
random
import
pickle
import
pickle
...
...
models/rank/dnn/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
...
@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
"""
"""
Read the data line by line and process it as a dictionary
Read the data line by line and process it as a dictionary
"""
"""
def
reader
():
def
reader
():
"""
"""
This function needs to be implemented by the user, based on data format
This function needs to be implemented by the user, based on data format
...
@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
...
@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
feature_name
.
append
(
"label"
)
feature_name
.
append
(
"label"
)
s
=
"click:"
+
str
(
label
[
0
])
s
=
"click:"
+
str
(
label
[
0
])
for
i
in
dense_feature
:
for
i
in
dense_feature
:
s
+=
" dense_feature:"
+
str
(
i
)
s
+=
" dense_feature:"
+
str
(
i
)
for
i
in
range
(
1
,
1
+
len
(
categorical_range_
)):
for
i
in
range
(
1
,
1
+
len
(
categorical_range_
)):
s
+=
" "
+
str
(
i
)
+
":"
+
str
(
sparse_feature
[
i
-
1
][
0
])
s
+=
" "
+
str
(
i
)
+
":"
+
str
(
sparse_feature
[
i
-
1
][
0
])
print
s
.
strip
()
print
s
.
strip
()
yield
None
yield
None
return
reader
return
reader
...
...
models/rank/wide_deep/data/data_preparation.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
io
import
io
import
args
import
args
import
pandas
as
pd
import
pandas
as
pd
from
sklearn
import
preprocessing
from
sklearn
import
preprocessing
def
_clean_file
(
source_path
,
target_path
):
def
_clean_file
(
source_path
,
target_path
):
"""makes changes to match the CSV format."""
"""makes changes to match the CSV format."""
with
io
.
open
(
source_path
,
'r'
)
as
temp_eval_file
:
with
io
.
open
(
source_path
,
'r'
)
as
temp_eval_file
:
with
io
.
open
(
target_path
,
'w'
)
as
eval_file
:
with
io
.
open
(
target_path
,
'w'
)
as
eval_file
:
...
@@ -17,15 +32,16 @@ def _clean_file(source_path,target_path):
...
@@ -17,15 +32,16 @@ def _clean_file(source_path,target_path):
line
=
line
[:
-
1
]
line
=
line
[:
-
1
]
line
+=
'
\n
'
line
+=
'
\n
'
eval_file
.
write
(
line
)
eval_file
.
write
(
line
)
def
build_model_columns
(
train_data_path
,
test_data_path
):
def
build_model_columns
(
train_data_path
,
test_data_path
):
# The column names are from
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names
=
[
column_names
=
[
'age'
,
'workclass'
,
'fnlwgt'
,
'education'
,
'education_num'
,
'age'
,
'workclass'
,
'fnlwgt'
,
'education'
,
'education_num'
,
'marital_status'
,
'occupation'
,
'relationship'
,
'race'
,
'gender'
,
'marital_status'
,
'occupation'
,
'relationship'
,
'race'
,
'gender'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
,
'native_country'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
,
'native_country'
,
'income_bracket'
'income_bracket'
]
]
# Load the dataset in Pandas
# Load the dataset in Pandas
...
@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
...
@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
# First group of tasks according to the paper
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
#label_columns = ['income_50k', 'marital_stat']
categorical_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
]
categorical_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
]
for
col
in
categorical_columns
:
for
col
in
categorical_columns
:
label_train
=
preprocessing
.
LabelEncoder
()
label_train
=
preprocessing
.
LabelEncoder
()
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
label_test
=
preprocessing
.
LabelEncoder
()
label_test
=
preprocessing
.
LabelEncoder
()
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
bins
=
[
18
,
25
,
30
,
35
,
40
,
45
,
50
,
55
,
60
,
65
]
bins
=
[
18
,
25
,
30
,
35
,
40
,
45
,
50
,
55
,
60
,
65
]
train_df
[
'age_buckets'
]
=
pd
.
cut
(
train_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
train_df
[
'age_buckets'
]
=
pd
.
cut
(
train_df
[
'age'
].
values
.
tolist
(),
test_df
[
'age_buckets'
]
=
pd
.
cut
(
test_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
bins
,
labels
=
False
)
base_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
,
'age_buckets'
]
test_df
[
'age_buckets'
]
=
pd
.
cut
(
test_df
[
'age'
].
values
.
tolist
(),
bins
,
train_df
[
'education_occupation'
]
=
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
labels
=
False
)
test_df
[
'education_occupation'
]
=
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
train_df
[
'age_buckets_education_occupation'
]
=
train_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
base_columns
=
[
test_df
[
'age_buckets_education_occupation'
]
=
test_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
crossed_columns
=
[
'education_occupation'
,
'age_buckets_education_occupation'
]
'occupation'
,
'age_buckets'
]
train_df
[
'education_occupation'
]
=
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
test_df
[
'education_occupation'
]
=
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
train_df
[
'age_buckets_education_occupation'
]
=
train_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
test_df
[
'age_buckets_education_occupation'
]
=
test_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
crossed_columns
=
[
'education_occupation'
,
'age_buckets_education_occupation'
]
for
col
in
crossed_columns
:
for
col
in
crossed_columns
:
label_train
=
preprocessing
.
LabelEncoder
()
label_train
=
preprocessing
.
LabelEncoder
()
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
label_test
=
preprocessing
.
LabelEncoder
()
label_test
=
preprocessing
.
LabelEncoder
()
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
wide_columns
=
base_columns
+
crossed_columns
wide_columns
=
base_columns
+
crossed_columns
train_df_temp
=
pd
.
get_dummies
(
train_df
[
categorical_columns
],
columns
=
categorical_columns
)
train_df_temp
=
pd
.
get_dummies
(
test_df_temp
=
pd
.
get_dummies
(
test_df
[
categorical_columns
],
columns
=
categorical_columns
)
train_df
[
categorical_columns
],
columns
=
categorical_columns
)
test_df_temp
=
pd
.
get_dummies
(
test_df
[
categorical_columns
],
columns
=
categorical_columns
)
train_df
=
train_df
.
join
(
train_df_temp
)
train_df
=
train_df
.
join
(
train_df_temp
)
test_df
=
test_df
.
join
(
test_df_temp
)
test_df
=
test_df
.
join
(
test_df_temp
)
deep_columns
=
list
(
train_df_temp
.
columns
)
+
[
'age'
,
'education_num'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
]
deep_columns
=
list
(
train_df_temp
.
columns
)
+
[
'age'
,
'education_num'
,
'capital_gain'
,
'capital_loss'
,
train_df
[
'label'
]
=
train_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
'hours_per_week'
test_df
[
'label'
]
=
test_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
]
with
io
.
open
(
'train_data/columns.txt'
,
'w'
)
as
f
:
train_df
[
'label'
]
=
train_df
[
'income_bracket'
].
apply
(
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
lambda
x
:
1
if
x
==
'>50K'
else
0
)
test_df
[
'label'
]
=
test_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
with
io
.
open
(
'train_data/columns.txt'
,
'w'
)
as
f
:
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
f
.
write
(
write_str
)
f
.
write
(
write_str
)
f
.
close
()
f
.
close
()
with
io
.
open
(
'test_data/columns.txt'
,
'w'
)
as
f
:
with
io
.
open
(
'test_data/columns.txt'
,
'w'
)
as
f
:
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
f
.
write
(
write_str
)
f
.
write
(
write_str
)
f
.
close
()
f
.
close
()
train_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
train_data_path
,
index
=
False
)
train_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
test_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
test_data_path
,
index
=
False
)
train_data_path
,
index
=
False
)
test_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
test_data_path
,
index
=
False
)
def
clean_file
(
train_path
,
test_path
,
train_data_path
,
test_data_path
):
def
clean_file
(
train_path
,
test_path
,
train_data_path
,
test_data_path
):
_clean_file
(
train_path
,
train_data_path
)
_clean_file
(
train_path
,
train_data_path
)
_clean_file
(
test_path
,
test_data_path
)
_clean_file
(
test_path
,
test_data_path
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
args
=
args
.
parse_args
()
args
=
args
.
parse_args
()
clean_file
(
args
.
train_path
,
args
.
test_path
,
args
.
train_data_path
,
args
.
test_data_path
)
clean_file
(
args
.
train_path
,
args
.
test_path
,
args
.
train_data_path
,
args
.
test_data_path
)
build_model_columns
(
args
.
train_data_path
,
args
.
test_data_path
)
build_model_columns
(
args
.
train_data_path
,
args
.
test_data_path
)
models/rank/wide_deep/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -20,6 +20,7 @@ except ImportError:
...
@@ -20,6 +20,7 @@ except ImportError:
import
pickle
import
pickle
import
paddle.fluid.incubate.data_generator
as
dg
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
wide_feat
,
deep_deat
,
label
=
self
.
_process_line
(
line
)
wide_feat
,
deep_deat
,
label
=
self
.
_process_line
(
line
)
s
=
""
s
=
""
for
i
in
[(
'wide_input'
,
wide_feat
),
(
'deep_input'
,
deep_deat
),
(
'label'
,
label
)]:
for
i
in
[(
'wide_input'
,
wide_feat
),
(
'deep_input'
,
deep_deat
),
(
'label'
,
label
)]:
k
=
i
[
0
]
k
=
i
[
0
]
v
=
i
[
1
]
v
=
i
[
1
]
for
j
in
v
:
for
j
in
v
:
...
@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
init
()
reader
.
run_from_stdin
()
reader
.
run_from_stdin
()
models/rank/xdeepfm/data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
shutil
import
shutil
import
sys
import
sys
...
...
models/rank/xdeepfm/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -21,6 +21,7 @@ except ImportError:
...
@@ -21,6 +21,7 @@ except ImportError:
import
pickle
import
pickle
import
paddle.fluid.incubate.data_generator
as
dg
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
s
=
""
s
=
""
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
k
=
i
[
0
]
k
=
i
[
0
]
v
=
i
[
1
]
v
=
i
[
1
]
for
j
in
v
:
for
j
in
v
:
...
@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
init
()
reader
.
run_from_stdin
()
reader
.
run_from_stdin
()
models/recall/gnn/data_process.sh
浏览文件 @
c68f2694
...
@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
...
@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
mkdir
test_data
mkdir
test_data
mv
diginetica/test.txt test_data
mv
diginetica/test.txt test_data
models/recall/gnn/raw_data/convert_data.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
argparse
import
time
import
time
import
pickle
import
pickle
...
@@ -10,6 +24,7 @@ parser.add_argument(
...
@@ -10,6 +24,7 @@ parser.add_argument(
help
=
'dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample'
)
help
=
'dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample'
)
opt
=
parser
.
parse_args
()
opt
=
parser
.
parse_args
()
def
process_data
(
file_type
):
def
process_data
(
file_type
):
path
=
os
.
path
.
join
(
opt
.
data_dir
,
file_type
)
path
=
os
.
path
.
join
(
opt
.
data_dir
,
file_type
)
output_path
=
os
.
path
.
splitext
(
path
)[
0
]
+
".txt"
output_path
=
os
.
path
.
splitext
(
path
)[
0
]
+
".txt"
...
@@ -23,6 +38,7 @@ def process_data(file_type):
...
@@ -23,6 +38,7 @@ def process_data(file_type):
fout
.
write
(
str
(
data
[
i
][
1
]))
fout
.
write
(
str
(
data
[
i
][
1
]))
fout
.
write
(
"
\n
"
)
fout
.
write
(
"
\n
"
)
process_data
(
"train"
)
process_data
(
"train"
)
process_data
(
"test"
)
process_data
(
"test"
)
...
...
models/recall/gnn/raw_data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
requests
import
requests
import
sys
import
sys
import
time
import
time
...
...
models/recall/readme.md
浏览文件 @
c68f2694
...
@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
...
@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
| MOVIELENS | NCF | 0.688 | -- |
| MOVIELENS | NCF | 0.688 | -- |
| -- | Youtube | -- | -- |
| -- | Youtube | -- | -- |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
models/recall/word2vec/prepare_data.sh
浏览文件 @
c68f2694
...
@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
...
@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
tar
xzvf test_dir.tar
-C
raw_data
tar
xzvf test_dir.tar
-C
raw_data
mv
raw_data/data/test_dir test_data/
mv
raw_data/data/test_dir test_data/
rm
-rf
raw_data
rm
-rf
raw_data
models/treebased/README.md
浏览文件 @
c68f2694
...
@@ -24,4 +24,4 @@ TDM是为大规模推荐系统设计的、能承载任意先进模型来高效
...
@@ -24,4 +24,4 @@ TDM是为大规模推荐系统设计的、能承载任意先进模型来高效
-
如何组网?答:paddle封装了大量的深度学习OP,用户可以根据需求设计自己的网络结构。
-
如何组网?答:paddle封装了大量的深度学习OP,用户可以根据需求设计自己的网络结构。
-
训练数据如何组织?答:tdm的训练数据主要为:
`user/query emb`
加
`item`
的正样本,
`item`
需要映射到树的某个叶子节点。用户只需准备符合该构成的数据即可。负样本的生成,会基于用户提供的树结构,以及paddle提供的
`tdm-sampler op`
完成高效的负采样,并自动添加相应的label,参与tdm中深度学习模型的训练。
-
训练数据如何组织?答:tdm的训练数据主要为:
`user/query emb`
加
`item`
的正样本,
`item`
需要映射到树的某个叶子节点。用户只需准备符合该构成的数据即可。负样本的生成,会基于用户提供的树结构,以及paddle提供的
`tdm-sampler op`
完成高效的负采样,并自动添加相应的label,参与tdm中深度学习模型的训练。
-
大规模的数据与模型训练如何实现?答:基于paddle优秀的大规模参数服务器分布式能力,可以实现高效的分布式训练。基于paddle-fleet api,学习门槛极低,且可以灵活的支持增量训练,流式训练等业务需求。
-
大规模的数据与模型训练如何实现?答:基于paddle优秀的大规模参数服务器分布式能力,可以实现高效的分布式训练。基于paddle-fleet api,学习门槛极低,且可以灵活的支持增量训练,流式训练等业务需求。
3.
训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
3.
训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
\ No newline at end of file
models/treebased/tdm/tree/layer_list.txt
浏览文件 @
c68f2694
1,2
1,2
3,4,5,6
3,4,5,6
7,8,9,10,11,12,13
7,8,9,10,11,12,13
14,15,16,17,18,19,20,21,22,23,24,25
14,15,16,17,18,19,20,21,22,23,24,25
\ No newline at end of file
run.py
浏览文件 @
c68f2694
...
@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
...
@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
engines
=
{}
engines
=
{}
device
=
[
"CPU"
,
"GPU"
]
device
=
[
"CPU"
,
"GPU"
]
clusters
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
]
clusters
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
]
engine_choices
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
,
engine_choices
=
[
"TDM_SINGLE"
,
"TDM_LOCAL_CLUSTER"
,
"TDM_CLUSTER"
]
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
,
"TDM_SINGLE"
,
"TDM_LOCAL_CLUSTER"
,
"TDM_CLUSTER"
]
custom_model
=
[
'TDM'
]
custom_model
=
[
'TDM'
]
model_name
=
""
model_name
=
""
...
@@ -66,7 +68,8 @@ def get_engine(args):
...
@@ -66,7 +68,8 @@ def get_engine(args):
engine
=
engine
.
upper
()
engine
=
engine
.
upper
()
if
engine
not
in
engine_choices
:
if
engine
not
in
engine_choices
:
raise
ValueError
(
"train.engin can not be chosen in {}"
.
format
(
engine_choices
))
raise
ValueError
(
"train.engin can not be chosen in {}"
.
format
(
engine_choices
))
print
(
"engines:
\n
{}"
.
format
(
engines
))
print
(
"engines:
\n
{}"
.
format
(
engines
))
...
@@ -77,8 +80,10 @@ def get_engine(args):
...
@@ -77,8 +80,10 @@ def get_engine(args):
def
get_transpiler
():
def
get_transpiler
():
FNULL
=
open
(
os
.
devnull
,
'w'
)
FNULL
=
open
(
os
.
devnull
,
'w'
)
cmd
=
[
"python"
,
"-c"
,
cmd
=
[
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
"python"
,
"-c"
,
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
proc
=
subprocess
.
Popen
(
cmd
,
stdout
=
FNULL
,
stderr
=
FNULL
,
cwd
=
os
.
getcwd
())
proc
=
subprocess
.
Popen
(
cmd
,
stdout
=
FNULL
,
stderr
=
FNULL
,
cwd
=
os
.
getcwd
())
ret
=
proc
.
wait
()
ret
=
proc
.
wait
()
if
ret
==
-
11
:
if
ret
==
-
11
:
...
@@ -152,7 +157,8 @@ def cluster_engine(args):
...
@@ -152,7 +157,8 @@ def cluster_engine(args):
update_workspace
(
flattens
)
update_workspace
(
flattens
)
envs
.
set_runtime_environs
(
flattens
)
envs
.
set_runtime_environs
(
flattens
)
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Submit Runtime Envs"
,
"Value"
)))
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Submit Runtime Envs"
,
"Value"
)))
launch
=
ClusterEngine
(
None
,
args
.
model
)
launch
=
ClusterEngine
(
None
,
args
.
model
)
return
launch
return
launch
...
@@ -163,7 +169,8 @@ def cluster_engine(args):
...
@@ -163,7 +169,8 @@ def cluster_engine(args):
cluster_envs
=
{}
cluster_envs
=
{}
cluster_envs
[
"train.trainer.trainer"
]
=
trainer
cluster_envs
[
"train.trainer.trainer"
]
=
trainer
cluster_envs
[
"train.trainer.engine"
]
=
"cluster"
cluster_envs
[
"train.trainer.engine"
]
=
"cluster"
cluster_envs
[
"train.trainer.threads"
]
=
envs
.
get_runtime_environ
(
"CPU_NUM"
)
cluster_envs
[
"train.trainer.threads"
]
=
envs
.
get_runtime_environ
(
"CPU_NUM"
)
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
print
(
"launch {} engine with cluster to with model: {}"
.
format
(
print
(
"launch {} engine with cluster to with model: {}"
.
format
(
trainer
,
args
.
model
))
trainer
,
args
.
model
))
...
@@ -181,7 +188,8 @@ def cluster_engine(args):
...
@@ -181,7 +188,8 @@ def cluster_engine(args):
def
cluster_mpi_engine
(
args
):
def
cluster_mpi_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
cluster_envs
=
{}
cluster_envs
=
{}
cluster_envs
[
"train.trainer.trainer"
]
=
"CtrCodingTrainer"
cluster_envs
[
"train.trainer.trainer"
]
=
"CtrCodingTrainer"
...
@@ -209,7 +217,8 @@ def local_cluster_engine(args):
...
@@ -209,7 +217,8 @@ def local_cluster_engine(args):
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"CPU_NUM"
]
=
"2"
cluster_envs
[
"CPU_NUM"
]
=
"2"
print
(
"launch {} engine with cluster to run model: {}"
.
format
(
trainer
,
args
.
model
))
print
(
"launch {} engine with cluster to run model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
cluster_envs
,
args
.
model
)
set_runtime_envs
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
...
@@ -217,10 +226,12 @@ def local_cluster_engine(args):
...
@@ -217,10 +226,12 @@ def local_cluster_engine(args):
def
local_mpi_engine
(
args
):
def
local_mpi_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
from
paddlerec.core.engine.local_mpi
import
LocalMPIEngine
from
paddlerec.core.engine.local_mpi
import
LocalMPIEngine
print
(
"use 1X1 MPI ClusterTraining at localhost to run model: {}"
.
format
(
args
.
model
))
print
(
"use 1X1 MPI ClusterTraining at localhost to run model: {}"
.
format
(
args
.
model
))
mpi
=
util
.
run_which
(
"mpirun"
)
mpi
=
util
.
run_which
(
"mpirun"
)
if
not
mpi
:
if
not
mpi
:
...
...
setup.py
浏览文件 @
c68f2694
...
@@ -11,7 +11,6 @@
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""
"""
setup for paddle-rec.
setup for paddle-rec.
"""
"""
...
@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
...
@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
import
shutil
import
shutil
import
tempfile
import
tempfile
requires
=
[
"paddlepaddle == 1.7.2"
,
"pyyaml >= 5.1.1"
]
requires
=
[
"paddlepaddle == 1.7.2"
,
"pyyaml >= 5.1.1"
]
about
=
{}
about
=
{}
about
[
"__title__"
]
=
"paddle-rec"
about
[
"__title__"
]
=
"paddle-rec"
...
@@ -48,18 +43,27 @@ def build(dirname):
...
@@ -48,18 +43,27 @@ def build(dirname):
package_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
package_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
run_cmd
(
"cp -r {}/* {}"
.
format
(
package_dir
,
dirname
))
run_cmd
(
"cp -r {}/* {}"
.
format
(
package_dir
,
dirname
))
run_cmd
(
"mkdir {}"
.
format
(
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mkdir {}"
.
format
(
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"core"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"doc"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
os
.
path
.
join
(
dirname
,
"core"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"models"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tests"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
os
.
path
.
join
(
dirname
,
"doc"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tools"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"*.py"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
os
.
path
.
join
(
dirname
,
"models"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tests"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tools"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"*.py"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
packages
=
find_packages
(
dirname
,
include
=
(
'paddlerec.*'
))
packages
=
find_packages
(
dirname
,
include
=
(
'paddlerec.*'
))
package_dir
=
{
''
:
dirname
}
package_dir
=
{
''
:
dirname
}
package_data
=
{}
package_data
=
{}
models_copy
=
[
'data/*.txt'
,
'data/*/*.txt'
,
'*.yaml'
,
'*.sh'
,
'tree/*.npy'
,
'tree/*.txt'
]
models_copy
=
[
'data/*.txt'
,
'data/*/*.txt'
,
'*.yaml'
,
'*.sh'
,
'tree/*.npy'
,
'tree/*.txt'
]
engine_copy
=
[
'*/*.sh'
]
engine_copy
=
[
'*/*.sh'
]
for
package
in
packages
:
for
package
in
packages
:
if
package
.
startswith
(
"paddlerec.models."
):
if
package
.
startswith
(
"paddlerec.models."
):
...
@@ -80,8 +84,7 @@ def build(dirname):
...
@@ -80,8 +84,7 @@ def build(dirname):
package_data
=
package_data
,
package_data
=
package_data
,
python_requires
=
">=2.7"
,
python_requires
=
">=2.7"
,
install_requires
=
requires
,
install_requires
=
requires
,
zip_safe
=
False
zip_safe
=
False
)
)
dirname
=
tempfile
.
mkdtemp
()
dirname
=
tempfile
.
mkdtemp
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录