Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
c68f2694
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c68f2694
编写于
5月 20, 2020
作者:
T
tangwei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix code style
上级
66e1859f
变更
49
展开全部
显示空白变更内容
内联
并排
Showing
49 changed file
with
548 addition
and
190 deletion
+548
-190
core/engine/cluster/cloud/__init__.py
core/engine/cluster/cloud/__init__.py
+13
-0
core/modules/coding/layers.py
core/modules/coding/layers.py
+13
-0
core/trainers/__init__.py
core/trainers/__init__.py
+0
-3
core/trainers/ctr_coding_trainer.py
core/trainers/ctr_coding_trainer.py
+14
-9
core/trainers/ctr_modul_trainer.py
core/trainers/ctr_modul_trainer.py
+145
-71
doc/benchmark.md
doc/benchmark.md
+1
-1
doc/contribute.md
doc/contribute.md
+1
-1
doc/design.md
doc/design.md
+1
-1
doc/distributed_train.md
doc/distributed_train.md
+0
-2
doc/faq.md
doc/faq.md
+1
-1
doc/local_train.md
doc/local_train.md
+1
-1
doc/model_list.md
doc/model_list.md
+0
-1
doc/optimization_model.md
doc/optimization_model.md
+1
-1
doc/predict.md
doc/predict.md
+1
-1
doc/ps_background.md
doc/ps_background.md
+0
-1
models/contentunderstanding/__init__.py
models/contentunderstanding/__init__.py
+13
-0
models/contentunderstanding/classification/config.yaml
models/contentunderstanding/classification/config.yaml
+0
-1
models/contentunderstanding/classification/model.py
models/contentunderstanding/classification/model.py
+5
-2
models/contentunderstanding/classification/reader.py
models/contentunderstanding/classification/reader.py
+2
-2
models/contentunderstanding/readme.md
models/contentunderstanding/readme.md
+0
-1
models/contentunderstanding/tagspace/config.yaml
models/contentunderstanding/tagspace/config.yaml
+0
-1
models/match/__init__.py
models/match/__init__.py
+13
-0
models/match/readme.md
models/match/readme.md
+0
-1
models/multitask/__init__.py
models/multitask/__init__.py
+13
-0
models/multitask/readme.md
models/multitask/readme.md
+0
-1
models/rank/__init__.py
models/rank/__init__.py
+13
-0
models/rank/dcn/data/download.py
models/rank/dcn/data/download.py
+14
-0
models/rank/dcn/data/get_slot_data.py
models/rank/dcn/data/get_slot_data.py
+4
-3
models/rank/dcn/data/preprocess.py
models/rank/dcn/data/preprocess.py
+14
-0
models/rank/deepfm/data/download_preprocess.py
models/rank/deepfm/data/download_preprocess.py
+14
-0
models/rank/deepfm/data/get_slot_data.py
models/rank/deepfm/data/get_slot_data.py
+7
-3
models/rank/deepfm/data/preprocess.py
models/rank/deepfm/data/preprocess.py
+14
-0
models/rank/din/data/build_dataset.py
models/rank/din/data/build_dataset.py
+14
-0
models/rank/din/data/convert_pd.py
models/rank/din/data/convert_pd.py
+14
-0
models/rank/din/data/remap_id.py
models/rank/din/data/remap_id.py
+14
-0
models/rank/dnn/data/get_slot_data.py
models/rank/dnn/data/get_slot_data.py
+4
-2
models/rank/wide_deep/data/data_preparation.py
models/rank/wide_deep/data/data_preparation.py
+90
-43
models/rank/wide_deep/data/get_slot_data.py
models/rank/wide_deep/data/get_slot_data.py
+4
-1
models/rank/xdeepfm/data/download.py
models/rank/xdeepfm/data/download.py
+14
-0
models/rank/xdeepfm/data/get_slot_data.py
models/rank/xdeepfm/data/get_slot_data.py
+4
-1
models/recall/gnn/data_process.sh
models/recall/gnn/data_process.sh
+0
-2
models/recall/gnn/raw_data/convert_data.py
models/recall/gnn/raw_data/convert_data.py
+16
-0
models/recall/gnn/raw_data/download.py
models/recall/gnn/raw_data/download.py
+14
-0
models/recall/readme.md
models/recall/readme.md
+0
-1
models/recall/word2vec/prepare_data.sh
models/recall/word2vec/prepare_data.sh
+0
-3
models/treebased/README.md
models/treebased/README.md
+1
-1
models/treebased/tdm/tree/layer_list.txt
models/treebased/tdm/tree/layer_list.txt
+1
-1
run.py
run.py
+22
-11
setup.py
setup.py
+18
-15
未找到文件。
core/engine/cluster/cloud/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
core/modules/coding/layers.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
core/trainers/__init__.py
浏览文件 @
c68f2694
...
@@ -11,7 +11,6 @@
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""
"""
trainer implement.
trainer implement.
...
@@ -22,5 +21,3 @@ Trainer
...
@@ -22,5 +21,3 @@ Trainer
↘ (for online learning training) OnlineLearningTrainer
↘ (for online learning training) OnlineLearningTrainer
"""
"""
core/trainers/ctr_coding_trainer.py
浏览文件 @
c68f2694
...
@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
...
@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
reader_class
=
envs
.
get_global_env
(
"class"
,
None
,
namespace
)
reader_class
=
envs
.
get_global_env
(
"class"
,
None
,
namespace
)
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
reader
=
os
.
path
.
join
(
abs_dir
,
'../utils'
,
'dataset_instance.py'
)
reader
=
os
.
path
.
join
(
abs_dir
,
'../utils'
,
'dataset_instance.py'
)
pipe_cmd
=
"python {} {} {} {}"
.
format
(
reader
,
reader_class
,
"TRAIN"
,
self
.
_config_yaml
)
pipe_cmd
=
"python {} {} {} {}"
.
format
(
reader
,
reader_class
,
"TRAIN"
,
train_data_path
=
envs
.
get_global_env
(
"train_data_path"
,
None
,
namespace
)
self
.
_config_yaml
)
train_data_path
=
envs
.
get_global_env
(
"train_data_path"
,
None
,
namespace
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
()
dataset
=
fluid
.
DatasetFactory
().
create_dataset
()
dataset
.
set_use_var
(
inputs
)
dataset
.
set_use_var
(
inputs
)
...
@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
...
@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
self
.
model
.
train_net
()
self
.
model
.
train_net
()
optimizer
=
self
.
model
.
optimizer
()
optimizer
=
self
.
model
.
optimizer
()
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
{
"use_cvm"
:
False
})
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
{
"use_cvm"
:
False
})
optimizer
.
minimize
(
self
.
model
.
get_avg_cost
())
optimizer
.
minimize
(
self
.
model
.
get_avg_cost
())
if
fleet
.
is_server
():
if
fleet
.
is_server
():
...
@@ -118,12 +121,14 @@ class CtrTrainer(Trainer):
...
@@ -118,12 +121,14 @@ class CtrTrainer(Trainer):
gs
=
shuf
*
0
gs
=
shuf
*
0
fleet
.
_role_maker
.
_node_type_comm
.
Allreduce
(
shuf
,
gs
)
fleet
.
_role_maker
.
_node_type_comm
.
Allreduce
(
shuf
,
gs
)
print
(
"trainer id: {}, trainers: {}, gs: {}"
.
format
(
fleet
.
worker_index
(),
fleet
.
worker_num
(),
gs
))
print
(
"trainer id: {}, trainers: {}, gs: {}"
.
format
(
fleet
.
worker_index
(
),
fleet
.
worker_num
(),
gs
))
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
for
i
in
range
(
epochs
):
for
i
in
range
(
epochs
):
self
.
_exe
.
train_from_dataset
(
program
=
fluid
.
default_main_program
(),
self
.
_exe
.
train_from_dataset
(
program
=
fluid
.
default_main_program
(),
dataset
=
dataset
,
dataset
=
dataset
,
fetch_list
=
self
.
fetch_vars
,
fetch_list
=
self
.
fetch_vars
,
fetch_info
=
self
.
fetch_alias
,
fetch_info
=
self
.
fetch_alias
,
...
...
core/trainers/ctr_modul_trainer.py
浏览文件 @
c68f2694
此差异已折叠。
点击以展开。
doc/benchmark.md
浏览文件 @
c68f2694
doc/contribute.md
浏览文件 @
c68f2694
doc/design.md
浏览文件 @
c68f2694
doc/distributed_train.md
浏览文件 @
c68f2694
...
@@ -7,5 +7,3 @@
...
@@ -7,5 +7,3 @@
### K8S集群运行分布式
### K8S集群运行分布式
> 占位
> 占位
doc/faq.md
浏览文件 @
c68f2694
doc/local_train.md
浏览文件 @
c68f2694
doc/model_list.md
浏览文件 @
c68f2694
...
@@ -12,4 +12,3 @@
...
@@ -12,4 +12,3 @@
| 多任务 |
[
ESMM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 多任务 |
[
ESMM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
DSSM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
DSSM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
Multiview-Simnet
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
Multiview-Simnet
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
doc/optimization_model.md
浏览文件 @
c68f2694
doc/predict.md
浏览文件 @
c68f2694
doc/ps_background.md
浏览文件 @
c68f2694
...
@@ -5,4 +5,3 @@
...
@@ -5,4 +5,3 @@
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
models/contentunderstanding/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/contentunderstanding/classification/config.yaml
浏览文件 @
c68f2694
...
@@ -37,4 +37,3 @@ train:
...
@@ -37,4 +37,3 @@ train:
dirname
:
"
inference"
dirname
:
"
inference"
epoch_interval
:
100
epoch_interval
:
100
save_last
:
True
save_last
:
True
models/contentunderstanding/classification/model.py
浏览文件 @
c68f2694
...
@@ -31,7 +31,8 @@ class Model(ModelBase):
...
@@ -31,7 +31,8 @@ class Model(ModelBase):
def
train_net
(
self
):
def
train_net
(
self
):
""" network definition """
""" network definition """
data
=
fluid
.
data
(
name
=
"input"
,
shape
=
[
None
,
self
.
max_len
],
dtype
=
'int64'
)
data
=
fluid
.
data
(
name
=
"input"
,
shape
=
[
None
,
self
.
max_len
],
dtype
=
'int64'
)
label
=
fluid
.
data
(
name
=
"label"
,
shape
=
[
None
,
1
],
dtype
=
'int64'
)
label
=
fluid
.
data
(
name
=
"label"
,
shape
=
[
None
,
1
],
dtype
=
'int64'
)
seq_len
=
fluid
.
data
(
name
=
"seq_len"
,
shape
=
[
None
],
dtype
=
'int64'
)
seq_len
=
fluid
.
data
(
name
=
"seq_len"
,
shape
=
[
None
],
dtype
=
'int64'
)
...
@@ -51,7 +52,9 @@ class Model(ModelBase):
...
@@ -51,7 +52,9 @@ class Model(ModelBase):
# full connect layer
# full connect layer
fc_1
=
fluid
.
layers
.
fc
(
input
=
[
conv
],
size
=
self
.
hid_dim
)
fc_1
=
fluid
.
layers
.
fc
(
input
=
[
conv
],
size
=
self
.
hid_dim
)
# softmax layer
# softmax layer
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_1
],
size
=
self
.
class_dim
,
act
=
"softmax"
)
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_1
],
size
=
self
.
class_dim
,
act
=
"softmax"
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
acc
=
fluid
.
layers
.
accuracy
(
input
=
prediction
,
label
=
label
)
acc
=
fluid
.
layers
.
accuracy
(
input
=
prediction
,
label
=
label
)
...
...
models/contentunderstanding/classification/reader.py
浏览文件 @
c68f2694
...
@@ -12,7 +12,6 @@
...
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
sys
import
sys
from
paddlerec.core.reader
import
Reader
from
paddlerec.core.reader
import
Reader
...
@@ -38,7 +37,8 @@ class TrainReader(Reader):
...
@@ -38,7 +37,8 @@ class TrainReader(Reader):
data
=
[
int
(
i
)
for
i
in
data
]
data
=
[
int
(
i
)
for
i
in
data
]
label
=
[
int
(
i
)
for
i
in
label
]
label
=
[
int
(
i
)
for
i
in
label
]
seq_len
=
[
int
(
i
)
for
i
in
seq_len
]
seq_len
=
[
int
(
i
)
for
i
in
seq_len
]
print
>>
sys
.
stderr
,
str
([(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)])
print
>>
sys
.
stderr
,
str
(
[(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)])
yield
[(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)]
yield
[(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)]
return
data_iter
return
data_iter
models/contentunderstanding/readme.md
浏览文件 @
c68f2694
...
@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
...
@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
models/contentunderstanding/tagspace/config.yaml
浏览文件 @
c68f2694
...
@@ -47,4 +47,3 @@ train:
...
@@ -47,4 +47,3 @@ train:
dirname
:
"
inference"
dirname
:
"
inference"
epoch_interval
:
100
epoch_interval
:
100
save_last
:
True
save_last
:
True
models/match/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/match/readme.md
浏览文件 @
c68f2694
...
@@ -37,4 +37,3 @@
...
@@ -37,4 +37,3 @@
python
-m
paddlerec.run
-m
paddlerec.models.match.dssm
# dssm
python
-m
paddlerec.run
-m
paddlerec.models.match.dssm
# dssm
python
-m
paddlerec.run
-m
paddlerec.models.match.multiview-simnet
# multiview-simnet
python
-m
paddlerec.run
-m
paddlerec.models.match.multiview-simnet
# multiview-simnet
```
```
models/multitask/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/multitask/readme.md
浏览文件 @
c68f2694
...
@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
...
@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
models/rank/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/rank/dcn/data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
sys
import
sys
import
io
import
io
...
...
models/rank/dcn/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -26,8 +26,8 @@ from collections import Counter
...
@@ -26,8 +26,8 @@ from collections import Counter
import
os
import
os
import
paddle.fluid.incubate.data_generator
as
dg
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
init
()
reader
.
run_from_stdin
()
reader
.
run_from_stdin
()
models/rank/dcn/data/preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
,
absolute_import
,
division
from
__future__
import
print_function
,
absolute_import
,
division
import
os
import
os
...
...
models/rank/deepfm/data/download_preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
shutil
import
shutil
import
sys
import
sys
...
...
models/rank/deepfm/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -19,8 +19,9 @@ try:
...
@@ -19,8 +19,9 @@ try:
import
cPickle
as
pickle
import
cPickle
as
pickle
except
ImportError
:
except
ImportError
:
import
pickle
import
pickle
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
def
data_iter
():
def
data_iter
():
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
s
=
""
s
=
""
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
k
=
i
[
0
]
k
=
i
[
0
]
v
=
i
[
1
]
v
=
i
[
1
]
for
j
in
v
:
for
j
in
v
:
s
+=
" "
+
k
+
":"
+
str
(
j
)
s
+=
" "
+
k
+
":"
+
str
(
j
)
print
s
.
strip
()
print
s
.
strip
()
yield
None
yield
None
return
data_iter
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
init
()
reader
.
run_from_stdin
()
reader
.
run_from_stdin
()
models/rank/deepfm/data/preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
numpy
import
numpy
from
collections
import
Counter
from
collections
import
Counter
...
...
models/rank/din/data/build_dataset.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
print_function
import
random
import
random
import
pickle
import
pickle
...
...
models/rank/din/data/convert_pd.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
print_function
import
pickle
import
pickle
import
pandas
as
pd
import
pandas
as
pd
...
...
models/rank/din/data/remap_id.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
print_function
import
random
import
random
import
pickle
import
pickle
...
...
models/rank/dnn/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
...
@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
"""
"""
Read the data line by line and process it as a dictionary
Read the data line by line and process it as a dictionary
"""
"""
def
reader
():
def
reader
():
"""
"""
This function needs to be implemented by the user, based on data format
This function needs to be implemented by the user, based on data format
...
@@ -59,9 +60,10 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
...
@@ -59,9 +60,10 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
for
i
in
dense_feature
:
for
i
in
dense_feature
:
s
+=
" dense_feature:"
+
str
(
i
)
s
+=
" dense_feature:"
+
str
(
i
)
for
i
in
range
(
1
,
1
+
len
(
categorical_range_
)):
for
i
in
range
(
1
,
1
+
len
(
categorical_range_
)):
s
+=
" "
+
str
(
i
)
+
":"
+
str
(
sparse_feature
[
i
-
1
][
0
])
s
+=
" "
+
str
(
i
)
+
":"
+
str
(
sparse_feature
[
i
-
1
][
0
])
print
s
.
strip
()
print
s
.
strip
()
yield
None
yield
None
return
reader
return
reader
...
...
models/rank/wide_deep/data/data_preparation.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
io
import
io
import
args
import
args
import
pandas
as
pd
import
pandas
as
pd
from
sklearn
import
preprocessing
from
sklearn
import
preprocessing
def
_clean_file
(
source_path
,
target_path
):
def
_clean_file
(
source_path
,
target_path
):
"""makes changes to match the CSV format."""
"""makes changes to match the CSV format."""
with
io
.
open
(
source_path
,
'r'
)
as
temp_eval_file
:
with
io
.
open
(
source_path
,
'r'
)
as
temp_eval_file
:
with
io
.
open
(
target_path
,
'w'
)
as
eval_file
:
with
io
.
open
(
target_path
,
'w'
)
as
eval_file
:
...
@@ -18,6 +33,7 @@ def _clean_file(source_path,target_path):
...
@@ -18,6 +33,7 @@ def _clean_file(source_path,target_path):
line
+=
'
\n
'
line
+=
'
\n
'
eval_file
.
write
(
line
)
eval_file
.
write
(
line
)
def
build_model_columns
(
train_data_path
,
test_data_path
):
def
build_model_columns
(
train_data_path
,
test_data_path
):
# The column names are from
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
...
@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
...
@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
# First group of tasks according to the paper
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
#label_columns = ['income_50k', 'marital_stat']
categorical_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
]
categorical_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
]
for
col
in
categorical_columns
:
for
col
in
categorical_columns
:
label_train
=
preprocessing
.
LabelEncoder
()
label_train
=
preprocessing
.
LabelEncoder
()
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
label_test
=
preprocessing
.
LabelEncoder
()
label_test
=
preprocessing
.
LabelEncoder
()
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
bins
=
[
18
,
25
,
30
,
35
,
40
,
45
,
50
,
55
,
60
,
65
]
bins
=
[
18
,
25
,
30
,
35
,
40
,
45
,
50
,
55
,
60
,
65
]
train_df
[
'age_buckets'
]
=
pd
.
cut
(
train_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
train_df
[
'age_buckets'
]
=
pd
.
cut
(
train_df
[
'age'
].
values
.
tolist
(),
test_df
[
'age_buckets'
]
=
pd
.
cut
(
test_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
bins
,
labels
=
False
)
base_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
,
'age_buckets'
]
test_df
[
'age_buckets'
]
=
pd
.
cut
(
test_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
base_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
,
'age_buckets'
]
train_df
[
'education_occupation'
]
=
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
train_df
[
'education_occupation'
]
=
train_df
[
'education'
].
astype
(
test_df
[
'education_occupation'
]
=
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
train_df
[
'age_buckets_education_occupation'
]
=
train_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
test_df
[
'education_occupation'
]
=
test_df
[
'education'
].
astype
(
test_df
[
'age_buckets_education_occupation'
]
=
test_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
crossed_columns
=
[
'education_occupation'
,
'age_buckets_education_occupation'
]
train_df
[
'age_buckets_education_occupation'
]
=
train_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
test_df
[
'age_buckets_education_occupation'
]
=
test_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
crossed_columns
=
[
'education_occupation'
,
'age_buckets_education_occupation'
]
for
col
in
crossed_columns
:
for
col
in
crossed_columns
:
label_train
=
preprocessing
.
LabelEncoder
()
label_train
=
preprocessing
.
LabelEncoder
()
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
label_test
=
preprocessing
.
LabelEncoder
()
label_test
=
preprocessing
.
LabelEncoder
()
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
wide_columns
=
base_columns
+
crossed_columns
wide_columns
=
base_columns
+
crossed_columns
train_df_temp
=
pd
.
get_dummies
(
train_df
[
categorical_columns
],
columns
=
categorical_columns
)
train_df_temp
=
pd
.
get_dummies
(
test_df_temp
=
pd
.
get_dummies
(
test_df
[
categorical_columns
],
columns
=
categorical_columns
)
train_df
[
categorical_columns
],
columns
=
categorical_columns
)
test_df_temp
=
pd
.
get_dummies
(
test_df
[
categorical_columns
],
columns
=
categorical_columns
)
train_df
=
train_df
.
join
(
train_df_temp
)
train_df
=
train_df
.
join
(
train_df_temp
)
test_df
=
test_df
.
join
(
test_df_temp
)
test_df
=
test_df
.
join
(
test_df_temp
)
deep_columns
=
list
(
train_df_temp
.
columns
)
+
[
'age'
,
'education_num'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
]
deep_columns
=
list
(
train_df_temp
.
columns
)
+
[
'age'
,
'education_num'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
]
train_df
[
'label'
]
=
train_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
train_df
[
'label'
]
=
train_df
[
'income_bracket'
].
apply
(
test_df
[
'label'
]
=
test_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
lambda
x
:
1
if
x
==
'>50K'
else
0
)
test_df
[
'label'
]
=
test_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
with
io
.
open
(
'train_data/columns.txt'
,
'w'
)
as
f
:
with
io
.
open
(
'train_data/columns.txt'
,
'w'
)
as
f
:
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
f
.
write
(
write_str
)
f
.
write
(
write_str
)
f
.
close
()
f
.
close
()
with
io
.
open
(
'test_data/columns.txt'
,
'w'
)
as
f
:
with
io
.
open
(
'test_data/columns.txt'
,
'w'
)
as
f
:
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
f
.
write
(
write_str
)
f
.
write
(
write_str
)
f
.
close
()
f
.
close
()
train_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
train_data_path
,
index
=
False
)
train_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
test_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
test_data_path
,
index
=
False
)
train_data_path
,
index
=
False
)
test_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
test_data_path
,
index
=
False
)
def
clean_file
(
train_path
,
test_path
,
train_data_path
,
test_data_path
):
def
clean_file
(
train_path
,
test_path
,
train_data_path
,
test_data_path
):
_clean_file
(
train_path
,
train_data_path
)
_clean_file
(
train_path
,
train_data_path
)
_clean_file
(
test_path
,
test_data_path
)
_clean_file
(
test_path
,
test_data_path
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
args
=
args
.
parse_args
()
args
=
args
.
parse_args
()
clean_file
(
args
.
train_path
,
args
.
test_path
,
args
.
train_data_path
,
args
.
test_data_path
)
clean_file
(
args
.
train_path
,
args
.
test_path
,
args
.
train_data_path
,
args
.
test_data_path
)
build_model_columns
(
args
.
train_data_path
,
args
.
test_data_path
)
build_model_columns
(
args
.
train_data_path
,
args
.
test_data_path
)
models/rank/wide_deep/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -20,6 +20,7 @@ except ImportError:
...
@@ -20,6 +20,7 @@ except ImportError:
import
pickle
import
pickle
import
paddle.fluid.incubate.data_generator
as
dg
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
wide_feat
,
deep_deat
,
label
=
self
.
_process_line
(
line
)
wide_feat
,
deep_deat
,
label
=
self
.
_process_line
(
line
)
s
=
""
s
=
""
for
i
in
[(
'wide_input'
,
wide_feat
),
(
'deep_input'
,
deep_deat
),
(
'label'
,
label
)]:
for
i
in
[(
'wide_input'
,
wide_feat
),
(
'deep_input'
,
deep_deat
),
(
'label'
,
label
)]:
k
=
i
[
0
]
k
=
i
[
0
]
v
=
i
[
1
]
v
=
i
[
1
]
for
j
in
v
:
for
j
in
v
:
...
@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
init
()
reader
.
run_from_stdin
()
reader
.
run_from_stdin
()
models/rank/xdeepfm/data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
os
import
shutil
import
shutil
import
sys
import
sys
...
...
models/rank/xdeepfm/data/get_slot_data.py
浏览文件 @
c68f2694
...
@@ -21,6 +21,7 @@ except ImportError:
...
@@ -21,6 +21,7 @@ except ImportError:
import
pickle
import
pickle
import
paddle.fluid.incubate.data_generator
as
dg
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
s
=
""
s
=
""
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
k
=
i
[
0
]
k
=
i
[
0
]
v
=
i
[
1
]
v
=
i
[
1
]
for
j
in
v
:
for
j
in
v
:
...
@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
...
@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
init
()
reader
.
run_from_stdin
()
reader
.
run_from_stdin
()
models/recall/gnn/data_process.sh
浏览文件 @
c68f2694
...
@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
...
@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
mkdir
test_data
mkdir
test_data
mv
diginetica/test.txt test_data
mv
diginetica/test.txt test_data
models/recall/gnn/raw_data/convert_data.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
argparse
import
time
import
time
import
pickle
import
pickle
...
@@ -10,6 +24,7 @@ parser.add_argument(
...
@@ -10,6 +24,7 @@ parser.add_argument(
help
=
'dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample'
)
help
=
'dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample'
)
opt
=
parser
.
parse_args
()
opt
=
parser
.
parse_args
()
def
process_data
(
file_type
):
def
process_data
(
file_type
):
path
=
os
.
path
.
join
(
opt
.
data_dir
,
file_type
)
path
=
os
.
path
.
join
(
opt
.
data_dir
,
file_type
)
output_path
=
os
.
path
.
splitext
(
path
)[
0
]
+
".txt"
output_path
=
os
.
path
.
splitext
(
path
)[
0
]
+
".txt"
...
@@ -23,6 +38,7 @@ def process_data(file_type):
...
@@ -23,6 +38,7 @@ def process_data(file_type):
fout
.
write
(
str
(
data
[
i
][
1
]))
fout
.
write
(
str
(
data
[
i
][
1
]))
fout
.
write
(
"
\n
"
)
fout
.
write
(
"
\n
"
)
process_data
(
"train"
)
process_data
(
"train"
)
process_data
(
"test"
)
process_data
(
"test"
)
...
...
models/recall/gnn/raw_data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
requests
import
requests
import
sys
import
sys
import
time
import
time
...
...
models/recall/readme.md
浏览文件 @
c68f2694
...
@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
...
@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
| MOVIELENS | NCF | 0.688 | -- |
| MOVIELENS | NCF | 0.688 | -- |
| -- | Youtube | -- | -- |
| -- | Youtube | -- | -- |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
models/recall/word2vec/prepare_data.sh
浏览文件 @
c68f2694
...
@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
...
@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
tar
xzvf test_dir.tar
-C
raw_data
tar
xzvf test_dir.tar
-C
raw_data
mv
raw_data/data/test_dir test_data/
mv
raw_data/data/test_dir test_data/
rm
-rf
raw_data
rm
-rf
raw_data
models/treebased/README.md
浏览文件 @
c68f2694
models/treebased/tdm/tree/layer_list.txt
浏览文件 @
c68f2694
run.py
浏览文件 @
c68f2694
...
@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
...
@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
engines
=
{}
engines
=
{}
device
=
[
"CPU"
,
"GPU"
]
device
=
[
"CPU"
,
"GPU"
]
clusters
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
]
clusters
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
]
engine_choices
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
,
engine_choices
=
[
"TDM_SINGLE"
,
"TDM_LOCAL_CLUSTER"
,
"TDM_CLUSTER"
]
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
,
"TDM_SINGLE"
,
"TDM_LOCAL_CLUSTER"
,
"TDM_CLUSTER"
]
custom_model
=
[
'TDM'
]
custom_model
=
[
'TDM'
]
model_name
=
""
model_name
=
""
...
@@ -66,7 +68,8 @@ def get_engine(args):
...
@@ -66,7 +68,8 @@ def get_engine(args):
engine
=
engine
.
upper
()
engine
=
engine
.
upper
()
if
engine
not
in
engine_choices
:
if
engine
not
in
engine_choices
:
raise
ValueError
(
"train.engin can not be chosen in {}"
.
format
(
engine_choices
))
raise
ValueError
(
"train.engin can not be chosen in {}"
.
format
(
engine_choices
))
print
(
"engines:
\n
{}"
.
format
(
engines
))
print
(
"engines:
\n
{}"
.
format
(
engines
))
...
@@ -77,8 +80,10 @@ def get_engine(args):
...
@@ -77,8 +80,10 @@ def get_engine(args):
def
get_transpiler
():
def
get_transpiler
():
FNULL
=
open
(
os
.
devnull
,
'w'
)
FNULL
=
open
(
os
.
devnull
,
'w'
)
cmd
=
[
"python"
,
"-c"
,
cmd
=
[
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
"python"
,
"-c"
,
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
proc
=
subprocess
.
Popen
(
cmd
,
stdout
=
FNULL
,
stderr
=
FNULL
,
cwd
=
os
.
getcwd
())
proc
=
subprocess
.
Popen
(
cmd
,
stdout
=
FNULL
,
stderr
=
FNULL
,
cwd
=
os
.
getcwd
())
ret
=
proc
.
wait
()
ret
=
proc
.
wait
()
if
ret
==
-
11
:
if
ret
==
-
11
:
...
@@ -152,7 +157,8 @@ def cluster_engine(args):
...
@@ -152,7 +157,8 @@ def cluster_engine(args):
update_workspace
(
flattens
)
update_workspace
(
flattens
)
envs
.
set_runtime_environs
(
flattens
)
envs
.
set_runtime_environs
(
flattens
)
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Submit Runtime Envs"
,
"Value"
)))
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Submit Runtime Envs"
,
"Value"
)))
launch
=
ClusterEngine
(
None
,
args
.
model
)
launch
=
ClusterEngine
(
None
,
args
.
model
)
return
launch
return
launch
...
@@ -163,7 +169,8 @@ def cluster_engine(args):
...
@@ -163,7 +169,8 @@ def cluster_engine(args):
cluster_envs
=
{}
cluster_envs
=
{}
cluster_envs
[
"train.trainer.trainer"
]
=
trainer
cluster_envs
[
"train.trainer.trainer"
]
=
trainer
cluster_envs
[
"train.trainer.engine"
]
=
"cluster"
cluster_envs
[
"train.trainer.engine"
]
=
"cluster"
cluster_envs
[
"train.trainer.threads"
]
=
envs
.
get_runtime_environ
(
"CPU_NUM"
)
cluster_envs
[
"train.trainer.threads"
]
=
envs
.
get_runtime_environ
(
"CPU_NUM"
)
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
print
(
"launch {} engine with cluster to with model: {}"
.
format
(
print
(
"launch {} engine with cluster to with model: {}"
.
format
(
trainer
,
args
.
model
))
trainer
,
args
.
model
))
...
@@ -181,7 +188,8 @@ def cluster_engine(args):
...
@@ -181,7 +188,8 @@ def cluster_engine(args):
def
cluster_mpi_engine
(
args
):
def
cluster_mpi_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
cluster_envs
=
{}
cluster_envs
=
{}
cluster_envs
[
"train.trainer.trainer"
]
=
"CtrCodingTrainer"
cluster_envs
[
"train.trainer.trainer"
]
=
"CtrCodingTrainer"
...
@@ -209,7 +217,8 @@ def local_cluster_engine(args):
...
@@ -209,7 +217,8 @@ def local_cluster_engine(args):
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"CPU_NUM"
]
=
"2"
cluster_envs
[
"CPU_NUM"
]
=
"2"
print
(
"launch {} engine with cluster to run model: {}"
.
format
(
trainer
,
args
.
model
))
print
(
"launch {} engine with cluster to run model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
cluster_envs
,
args
.
model
)
set_runtime_envs
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
...
@@ -217,10 +226,12 @@ def local_cluster_engine(args):
...
@@ -217,10 +226,12 @@ def local_cluster_engine(args):
def
local_mpi_engine
(
args
):
def
local_mpi_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
from
paddlerec.core.engine.local_mpi
import
LocalMPIEngine
from
paddlerec.core.engine.local_mpi
import
LocalMPIEngine
print
(
"use 1X1 MPI ClusterTraining at localhost to run model: {}"
.
format
(
args
.
model
))
print
(
"use 1X1 MPI ClusterTraining at localhost to run model: {}"
.
format
(
args
.
model
))
mpi
=
util
.
run_which
(
"mpirun"
)
mpi
=
util
.
run_which
(
"mpirun"
)
if
not
mpi
:
if
not
mpi
:
...
...
setup.py
浏览文件 @
c68f2694
...
@@ -11,7 +11,6 @@
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""
"""
setup for paddle-rec.
setup for paddle-rec.
"""
"""
...
@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
...
@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
import
shutil
import
shutil
import
tempfile
import
tempfile
requires
=
[
"paddlepaddle == 1.7.2"
,
"pyyaml >= 5.1.1"
]
requires
=
[
"paddlepaddle == 1.7.2"
,
"pyyaml >= 5.1.1"
]
about
=
{}
about
=
{}
about
[
"__title__"
]
=
"paddle-rec"
about
[
"__title__"
]
=
"paddle-rec"
...
@@ -48,18 +43,27 @@ def build(dirname):
...
@@ -48,18 +43,27 @@ def build(dirname):
package_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
package_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
run_cmd
(
"cp -r {}/* {}"
.
format
(
package_dir
,
dirname
))
run_cmd
(
"cp -r {}/* {}"
.
format
(
package_dir
,
dirname
))
run_cmd
(
"mkdir {}"
.
format
(
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mkdir {}"
.
format
(
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"core"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"doc"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
os
.
path
.
join
(
dirname
,
"core"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"models"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tests"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
os
.
path
.
join
(
dirname
,
"doc"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tools"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"*.py"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
os
.
path
.
join
(
dirname
,
"models"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tests"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tools"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"*.py"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
packages
=
find_packages
(
dirname
,
include
=
(
'paddlerec.*'
))
packages
=
find_packages
(
dirname
,
include
=
(
'paddlerec.*'
))
package_dir
=
{
''
:
dirname
}
package_dir
=
{
''
:
dirname
}
package_data
=
{}
package_data
=
{}
models_copy
=
[
'data/*.txt'
,
'data/*/*.txt'
,
'*.yaml'
,
'*.sh'
,
'tree/*.npy'
,
'tree/*.txt'
]
models_copy
=
[
'data/*.txt'
,
'data/*/*.txt'
,
'*.yaml'
,
'*.sh'
,
'tree/*.npy'
,
'tree/*.txt'
]
engine_copy
=
[
'*/*.sh'
]
engine_copy
=
[
'*/*.sh'
]
for
package
in
packages
:
for
package
in
packages
:
if
package
.
startswith
(
"paddlerec.models."
):
if
package
.
startswith
(
"paddlerec.models."
):
...
@@ -80,8 +84,7 @@ def build(dirname):
...
@@ -80,8 +84,7 @@ def build(dirname):
package_data
=
package_data
,
package_data
=
package_data
,
python_requires
=
">=2.7"
,
python_requires
=
">=2.7"
,
install_requires
=
requires
,
install_requires
=
requires
,
zip_safe
=
False
zip_safe
=
False
)
)
dirname
=
tempfile
.
mkdtemp
()
dirname
=
tempfile
.
mkdtemp
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录