Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
c68f2694
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c68f2694
编写于
5月 20, 2020
作者:
T
tangwei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix code style
上级
66e1859f
变更
49
展开全部
隐藏空白更改
内联
并排
Showing
49 changed file
with
548 addition
and
190 deletion
+548
-190
core/engine/cluster/cloud/__init__.py
core/engine/cluster/cloud/__init__.py
+13
-0
core/modules/coding/layers.py
core/modules/coding/layers.py
+13
-0
core/trainers/__init__.py
core/trainers/__init__.py
+0
-3
core/trainers/ctr_coding_trainer.py
core/trainers/ctr_coding_trainer.py
+14
-9
core/trainers/ctr_modul_trainer.py
core/trainers/ctr_modul_trainer.py
+145
-71
doc/benchmark.md
doc/benchmark.md
+1
-1
doc/contribute.md
doc/contribute.md
+1
-1
doc/design.md
doc/design.md
+1
-1
doc/distributed_train.md
doc/distributed_train.md
+0
-2
doc/faq.md
doc/faq.md
+1
-1
doc/local_train.md
doc/local_train.md
+1
-1
doc/model_list.md
doc/model_list.md
+0
-1
doc/optimization_model.md
doc/optimization_model.md
+1
-1
doc/predict.md
doc/predict.md
+1
-1
doc/ps_background.md
doc/ps_background.md
+0
-1
models/contentunderstanding/__init__.py
models/contentunderstanding/__init__.py
+13
-0
models/contentunderstanding/classification/config.yaml
models/contentunderstanding/classification/config.yaml
+0
-1
models/contentunderstanding/classification/model.py
models/contentunderstanding/classification/model.py
+5
-2
models/contentunderstanding/classification/reader.py
models/contentunderstanding/classification/reader.py
+2
-2
models/contentunderstanding/readme.md
models/contentunderstanding/readme.md
+0
-1
models/contentunderstanding/tagspace/config.yaml
models/contentunderstanding/tagspace/config.yaml
+0
-1
models/match/__init__.py
models/match/__init__.py
+13
-0
models/match/readme.md
models/match/readme.md
+0
-1
models/multitask/__init__.py
models/multitask/__init__.py
+13
-0
models/multitask/readme.md
models/multitask/readme.md
+0
-1
models/rank/__init__.py
models/rank/__init__.py
+13
-0
models/rank/dcn/data/download.py
models/rank/dcn/data/download.py
+14
-0
models/rank/dcn/data/get_slot_data.py
models/rank/dcn/data/get_slot_data.py
+4
-3
models/rank/dcn/data/preprocess.py
models/rank/dcn/data/preprocess.py
+14
-0
models/rank/deepfm/data/download_preprocess.py
models/rank/deepfm/data/download_preprocess.py
+14
-0
models/rank/deepfm/data/get_slot_data.py
models/rank/deepfm/data/get_slot_data.py
+7
-3
models/rank/deepfm/data/preprocess.py
models/rank/deepfm/data/preprocess.py
+14
-0
models/rank/din/data/build_dataset.py
models/rank/din/data/build_dataset.py
+14
-0
models/rank/din/data/convert_pd.py
models/rank/din/data/convert_pd.py
+14
-0
models/rank/din/data/remap_id.py
models/rank/din/data/remap_id.py
+14
-0
models/rank/dnn/data/get_slot_data.py
models/rank/dnn/data/get_slot_data.py
+4
-2
models/rank/wide_deep/data/data_preparation.py
models/rank/wide_deep/data/data_preparation.py
+90
-43
models/rank/wide_deep/data/get_slot_data.py
models/rank/wide_deep/data/get_slot_data.py
+4
-1
models/rank/xdeepfm/data/download.py
models/rank/xdeepfm/data/download.py
+14
-0
models/rank/xdeepfm/data/get_slot_data.py
models/rank/xdeepfm/data/get_slot_data.py
+4
-1
models/recall/gnn/data_process.sh
models/recall/gnn/data_process.sh
+0
-2
models/recall/gnn/raw_data/convert_data.py
models/recall/gnn/raw_data/convert_data.py
+16
-0
models/recall/gnn/raw_data/download.py
models/recall/gnn/raw_data/download.py
+14
-0
models/recall/readme.md
models/recall/readme.md
+0
-1
models/recall/word2vec/prepare_data.sh
models/recall/word2vec/prepare_data.sh
+0
-3
models/treebased/README.md
models/treebased/README.md
+1
-1
models/treebased/tdm/tree/layer_list.txt
models/treebased/tdm/tree/layer_list.txt
+1
-1
run.py
run.py
+22
-11
setup.py
setup.py
+18
-15
未找到文件。
core/engine/cluster/cloud/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
core/modules/coding/layers.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
core/trainers/__init__.py
浏览文件 @
c68f2694
...
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
trainer implement.
...
...
@@ -22,5 +21,3 @@ Trainer
↘ (for online learning training) OnlineLearningTrainer
"""
core/trainers/ctr_coding_trainer.py
浏览文件 @
c68f2694
...
...
@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
reader_class
=
envs
.
get_global_env
(
"class"
,
None
,
namespace
)
abs_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
reader
=
os
.
path
.
join
(
abs_dir
,
'../utils'
,
'dataset_instance.py'
)
pipe_cmd
=
"python {} {} {} {}"
.
format
(
reader
,
reader_class
,
"TRAIN"
,
self
.
_config_yaml
)
train_data_path
=
envs
.
get_global_env
(
"train_data_path"
,
None
,
namespace
)
pipe_cmd
=
"python {} {} {} {}"
.
format
(
reader
,
reader_class
,
"TRAIN"
,
self
.
_config_yaml
)
train_data_path
=
envs
.
get_global_env
(
"train_data_path"
,
None
,
namespace
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
()
dataset
.
set_use_var
(
inputs
)
...
...
@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
self
.
model
.
train_net
()
optimizer
=
self
.
model
.
optimizer
()
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
{
"use_cvm"
:
False
})
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
{
"use_cvm"
:
False
})
optimizer
.
minimize
(
self
.
model
.
get_avg_cost
())
if
fleet
.
is_server
():
...
...
@@ -118,16 +121,18 @@ class CtrTrainer(Trainer):
gs
=
shuf
*
0
fleet
.
_role_maker
.
_node_type_comm
.
Allreduce
(
shuf
,
gs
)
print
(
"trainer id: {}, trainers: {}, gs: {}"
.
format
(
fleet
.
worker_index
(),
fleet
.
worker_num
(),
gs
))
print
(
"trainer id: {}, trainers: {}, gs: {}"
.
format
(
fleet
.
worker_index
(
),
fleet
.
worker_num
(),
gs
))
epochs
=
envs
.
get_global_env
(
"train.epochs"
)
for
i
in
range
(
epochs
):
self
.
_exe
.
train_from_dataset
(
program
=
fluid
.
default_main_program
(),
dataset
=
dataset
,
fetch_list
=
self
.
fetch_vars
,
fetch_info
=
self
.
fetch_alias
,
print_period
=
self
.
fetch_period
)
self
.
_exe
.
train_from_dataset
(
program
=
fluid
.
default_main_program
(),
dataset
=
dataset
,
fetch_list
=
self
.
fetch_vars
,
fetch_info
=
self
.
fetch_alias
,
print_period
=
self
.
fetch_period
)
context
[
'status'
]
=
'terminal_pass'
fleet
.
stop_worker
()
...
...
core/trainers/ctr_modul_trainer.py
浏览文件 @
c68f2694
此差异已折叠。
点击以展开。
doc/benchmark.md
浏览文件 @
c68f2694
# PaddleRec Benchmark
> 占位
\ No newline at end of file
> 占位
doc/contribute.md
浏览文件 @
c68f2694
# PaddleRec 贡献代码
> 占位
\ No newline at end of file
> 占位
doc/design.md
浏览文件 @
c68f2694
...
...
@@ -279,4 +279,4 @@ class Metric(object):
pass
```
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考
[
auc_metric.py
](
../core/metrics/auc_metrics.py
)
\ No newline at end of file
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考
[
auc_metric.py
](
../core/metrics/auc_metrics.py
)
doc/distributed_train.md
浏览文件 @
c68f2694
...
...
@@ -7,5 +7,3 @@
### K8S集群运行分布式
> 占位
doc/faq.md
浏览文件 @
c68f2694
# 常见问题FAQ
> 占位
\ No newline at end of file
> 占位
doc/local_train.md
浏览文件 @
c68f2694
# PaddleRec 单机训练
> 占位
\ No newline at end of file
> 占位
doc/model_list.md
浏览文件 @
c68f2694
...
...
@@ -12,4 +12,3 @@
| 多任务 |
[
ESMM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
DSSM
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 |
[
Multiview-Simnet
](
)
| ✓ | x | ✓ | x | ✓ | ✓ |
doc/optimization_model.md
浏览文件 @
c68f2694
# PaddleRec 模型调参
> 占位
\ No newline at end of file
> 占位
doc/predict.md
浏览文件 @
c68f2694
# PaddleRec 离线预测
\ No newline at end of file
# PaddleRec 离线预测
doc/ps_background.md
浏览文件 @
c68f2694
...
...
@@ -5,4 +5,3 @@
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
models/contentunderstanding/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/contentunderstanding/classification/config.yaml
浏览文件 @
c68f2694
...
...
@@ -37,4 +37,3 @@ train:
dirname
:
"
inference"
epoch_interval
:
100
save_last
:
True
models/contentunderstanding/classification/model.py
浏览文件 @
c68f2694
...
...
@@ -31,7 +31,8 @@ class Model(ModelBase):
def
train_net
(
self
):
""" network definition """
data
=
fluid
.
data
(
name
=
"input"
,
shape
=
[
None
,
self
.
max_len
],
dtype
=
'int64'
)
data
=
fluid
.
data
(
name
=
"input"
,
shape
=
[
None
,
self
.
max_len
],
dtype
=
'int64'
)
label
=
fluid
.
data
(
name
=
"label"
,
shape
=
[
None
,
1
],
dtype
=
'int64'
)
seq_len
=
fluid
.
data
(
name
=
"seq_len"
,
shape
=
[
None
],
dtype
=
'int64'
)
...
...
@@ -51,7 +52,9 @@ class Model(ModelBase):
# full connect layer
fc_1
=
fluid
.
layers
.
fc
(
input
=
[
conv
],
size
=
self
.
hid_dim
)
# softmax layer
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_1
],
size
=
self
.
class_dim
,
act
=
"softmax"
)
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_1
],
size
=
self
.
class_dim
,
act
=
"softmax"
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
acc
=
fluid
.
layers
.
accuracy
(
input
=
prediction
,
label
=
label
)
...
...
models/contentunderstanding/classification/reader.py
浏览文件 @
c68f2694
...
...
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
sys
from
paddlerec.core.reader
import
Reader
...
...
@@ -38,7 +37,8 @@ class TrainReader(Reader):
data
=
[
int
(
i
)
for
i
in
data
]
label
=
[
int
(
i
)
for
i
in
label
]
seq_len
=
[
int
(
i
)
for
i
in
seq_len
]
print
>>
sys
.
stderr
,
str
([(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)])
print
>>
sys
.
stderr
,
str
(
[(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)])
yield
[(
'data'
,
data
),
(
'label'
,
label
),
(
'seq_len'
,
seq_len
)]
return
data_iter
models/contentunderstanding/readme.md
浏览文件 @
c68f2694
...
...
@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
models/contentunderstanding/tagspace/config.yaml
浏览文件 @
c68f2694
...
...
@@ -47,4 +47,3 @@ train:
dirname
:
"
inference"
epoch_interval
:
100
save_last
:
True
models/match/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/match/readme.md
浏览文件 @
c68f2694
...
...
@@ -37,4 +37,3 @@
python
-m
paddlerec.run
-m
paddlerec.models.match.dssm
# dssm
python
-m
paddlerec.run
-m
paddlerec.models.match.multiview-simnet
# multiview-simnet
```
models/multitask/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/multitask/readme.md
浏览文件 @
c68f2694
...
...
@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
models/rank/__init__.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models/rank/dcn/data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
import
io
...
...
models/rank/dcn/data/get_slot_data.py
浏览文件 @
c68f2694
...
...
@@ -26,8 +26,8 @@ from collections import Counter
import
os
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
...
@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
if
idx
==
2
else
math
.
log
(
1
+
float
(
features
[
idx
])))
for
idx
in
self
.
cat_idx_
:
if
features
[
idx
]
==
''
or
features
[
idx
]
not
in
self
.
cat_feat_idx_dict_list
[
idx
-
14
]:
idx
]
not
in
self
.
cat_feat_idx_dict_list
[
idx
-
14
]:
label_feat_list
[
idx
].
append
(
0
)
else
:
label_feat_list
[
idx
].
append
(
self
.
cat_feat_idx_dict_list
[
idx
-
14
][
features
[
idx
]])
idx
-
14
][
features
[
idx
]])
label_feat_list
[
0
].
append
(
int
(
features
[
0
]))
return
label_feat_list
...
...
@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
run_from_stdin
()
models/rank/dcn/data/preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
,
absolute_import
,
division
import
os
...
...
models/rank/deepfm/data/download_preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
sys
...
...
models/rank/deepfm/data/get_slot_data.py
浏览文件 @
c68f2694
...
...
@@ -19,8 +19,9 @@ try:
import
cPickle
as
pickle
except
ImportError
:
import
pickle
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
...
@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self
.
categorical_range_
=
range
(
14
,
40
)
# load preprocessed feature dict
self
.
feat_dict_name
=
"aid_data/feat_dict_10.pkl2"
self
.
feat_dict_
=
pickle
.
load
(
open
(
self
.
feat_dict_name
,
'rb'
))
self
.
feat_dict_
=
pickle
.
load
(
open
(
self
.
feat_dict_name
,
'rb'
))
def
_process_line
(
self
,
line
):
features
=
line
.
rstrip
(
'
\n
'
).
split
(
'
\t
'
)
...
...
@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
def
data_iter
():
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
s
=
""
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
k
=
i
[
0
]
v
=
i
[
1
]
for
j
in
v
:
s
+=
" "
+
k
+
":"
+
str
(
j
)
print
s
.
strip
()
yield
None
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
run_from_stdin
()
models/rank/deepfm/data/preprocess.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
numpy
from
collections
import
Counter
...
...
models/rank/din/data/build_dataset.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
random
import
pickle
...
...
models/rank/din/data/convert_pd.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
pickle
import
pandas
as
pd
...
...
models/rank/din/data/remap_id.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
random
import
pickle
...
...
models/rank/dnn/data/get_slot_data.py
浏览文件 @
c68f2694
...
...
@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
"""
Read the data line by line and process it as a dictionary
"""
def
reader
():
"""
This function needs to be implemented by the user, based on data format
...
...
@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
feature_name
.
append
(
"label"
)
s
=
"click:"
+
str
(
label
[
0
])
for
i
in
dense_feature
:
s
+=
" dense_feature:"
+
str
(
i
)
s
+=
" dense_feature:"
+
str
(
i
)
for
i
in
range
(
1
,
1
+
len
(
categorical_range_
)):
s
+=
" "
+
str
(
i
)
+
":"
+
str
(
sparse_feature
[
i
-
1
][
0
])
s
+=
" "
+
str
(
i
)
+
":"
+
str
(
sparse_feature
[
i
-
1
][
0
])
print
s
.
strip
()
yield
None
return
reader
...
...
models/rank/wide_deep/data/data_preparation.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
io
import
args
import
pandas
as
pd
from
sklearn
import
preprocessing
from
sklearn
import
preprocessing
def
_clean_file
(
source_path
,
target_path
):
def
_clean_file
(
source_path
,
target_path
):
"""makes changes to match the CSV format."""
with
io
.
open
(
source_path
,
'r'
)
as
temp_eval_file
:
with
io
.
open
(
target_path
,
'w'
)
as
eval_file
:
...
...
@@ -17,15 +32,16 @@ def _clean_file(source_path,target_path):
line
=
line
[:
-
1
]
line
+=
'
\n
'
eval_file
.
write
(
line
)
def
build_model_columns
(
train_data_path
,
test_data_path
):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names
=
[
'age'
,
'workclass'
,
'fnlwgt'
,
'education'
,
'education_num'
,
'marital_status'
,
'occupation'
,
'relationship'
,
'race'
,
'gender'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
,
'native_country'
,
'income_bracket'
'age'
,
'workclass'
,
'fnlwgt'
,
'education'
,
'education_num'
,
'marital_status'
,
'occupation'
,
'relationship'
,
'race'
,
'gender'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
,
'native_country'
,
'income_bracket'
]
# Load the dataset in Pandas
...
...
@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
categorical_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
]
categorical_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
]
for
col
in
categorical_columns
:
label_train
=
preprocessing
.
LabelEncoder
()
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
label_test
=
preprocessing
.
LabelEncoder
()
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
bins
=
[
18
,
25
,
30
,
35
,
40
,
45
,
50
,
55
,
60
,
65
]
train_df
[
'age_buckets'
]
=
pd
.
cut
(
train_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
test_df
[
'age_buckets'
]
=
pd
.
cut
(
test_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
base_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
,
'age_buckets'
]
train_df
[
'education_occupation'
]
=
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
test_df
[
'education_occupation'
]
=
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
train_df
[
'age_buckets_education_occupation'
]
=
train_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
test_df
[
'age_buckets_education_occupation'
]
=
test_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
crossed_columns
=
[
'education_occupation'
,
'age_buckets_education_occupation'
]
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
bins
=
[
18
,
25
,
30
,
35
,
40
,
45
,
50
,
55
,
60
,
65
]
train_df
[
'age_buckets'
]
=
pd
.
cut
(
train_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
test_df
[
'age_buckets'
]
=
pd
.
cut
(
test_df
[
'age'
].
values
.
tolist
(),
bins
,
labels
=
False
)
base_columns
=
[
'education'
,
'marital_status'
,
'relationship'
,
'workclass'
,
'occupation'
,
'age_buckets'
]
train_df
[
'education_occupation'
]
=
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
test_df
[
'education_occupation'
]
=
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
train_df
[
'age_buckets_education_occupation'
]
=
train_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
train_df
[
'education'
].
astype
(
str
)
+
'_'
+
train_df
[
'occupation'
].
astype
(
str
)
test_df
[
'age_buckets_education_occupation'
]
=
test_df
[
'age_buckets'
].
astype
(
str
)
+
'_'
+
test_df
[
'education'
].
astype
(
str
)
+
'_'
+
test_df
[
'occupation'
].
astype
(
str
)
crossed_columns
=
[
'education_occupation'
,
'age_buckets_education_occupation'
]
for
col
in
crossed_columns
:
label_train
=
preprocessing
.
LabelEncoder
()
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
train_df
[
col
]
=
label_train
.
fit_transform
(
train_df
[
col
])
label_test
=
preprocessing
.
LabelEncoder
()
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
test_df
[
col
]
=
label_test
.
fit_transform
(
test_df
[
col
])
wide_columns
=
base_columns
+
crossed_columns
train_df_temp
=
pd
.
get_dummies
(
train_df
[
categorical_columns
],
columns
=
categorical_columns
)
test_df_temp
=
pd
.
get_dummies
(
test_df
[
categorical_columns
],
columns
=
categorical_columns
)
train_df_temp
=
pd
.
get_dummies
(
train_df
[
categorical_columns
],
columns
=
categorical_columns
)
test_df_temp
=
pd
.
get_dummies
(
test_df
[
categorical_columns
],
columns
=
categorical_columns
)
train_df
=
train_df
.
join
(
train_df_temp
)
test_df
=
test_df
.
join
(
test_df_temp
)
deep_columns
=
list
(
train_df_temp
.
columns
)
+
[
'age'
,
'education_num'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
]
train_df
[
'label'
]
=
train_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
test_df
[
'label'
]
=
test_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
with
io
.
open
(
'train_data/columns.txt'
,
'w'
)
as
f
:
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
deep_columns
=
list
(
train_df_temp
.
columns
)
+
[
'age'
,
'education_num'
,
'capital_gain'
,
'capital_loss'
,
'hours_per_week'
]
train_df
[
'label'
]
=
train_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
test_df
[
'label'
]
=
test_df
[
'income_bracket'
].
apply
(
lambda
x
:
1
if
x
==
'>50K'
else
0
)
with
io
.
open
(
'train_data/columns.txt'
,
'w'
)
as
f
:
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
f
.
write
(
write_str
)
f
.
close
()
with
io
.
open
(
'test_data/columns.txt'
,
'w'
)
as
f
:
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
with
io
.
open
(
'test_data/columns.txt'
,
'w'
)
as
f
:
write_str
=
str
(
len
(
wide_columns
))
+
'
\n
'
+
str
(
len
(
deep_columns
))
+
'
\n
'
f
.
write
(
write_str
)
f
.
close
()
train_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
train_data_path
,
index
=
False
)
test_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
test_data_path
,
index
=
False
)
train_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
train_data_path
,
index
=
False
)
test_df
[
wide_columns
+
deep_columns
+
[
'label'
]].
fillna
(
0
).
to_csv
(
test_data_path
,
index
=
False
)
def
clean_file
(
train_path
,
test_path
,
train_data_path
,
test_data_path
):
_clean_file
(
train_path
,
train_data_path
)
_clean_file
(
test_path
,
test_data_path
)
if
__name__
==
'__main__'
:
args
=
args
.
parse_args
()
clean_file
(
args
.
train_path
,
args
.
test_path
,
args
.
train_data_path
,
args
.
test_data_path
)
clean_file
(
args
.
train_path
,
args
.
test_path
,
args
.
train_data_path
,
args
.
test_data_path
)
build_model_columns
(
args
.
train_data_path
,
args
.
test_data_path
)
models/rank/wide_deep/data/get_slot_data.py
浏览文件 @
c68f2694
...
...
@@ -20,6 +20,7 @@ except ImportError:
import
pickle
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
...
@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
wide_feat
,
deep_deat
,
label
=
self
.
_process_line
(
line
)
s
=
""
for
i
in
[(
'wide_input'
,
wide_feat
),
(
'deep_input'
,
deep_deat
),
(
'label'
,
label
)]:
for
i
in
[(
'wide_input'
,
wide_feat
),
(
'deep_input'
,
deep_deat
),
(
'label'
,
label
)]:
k
=
i
[
0
]
v
=
i
[
1
]
for
j
in
v
:
...
...
@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
run_from_stdin
()
models/rank/xdeepfm/data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
sys
...
...
models/rank/xdeepfm/data/get_slot_data.py
浏览文件 @
c68f2694
...
...
@@ -21,6 +21,7 @@ except ImportError:
import
pickle
import
paddle.fluid.incubate.data_generator
as
dg
class
TrainReader
(
dg
.
MultiSlotDataGenerator
):
def
__init__
(
self
,
config
):
dg
.
MultiSlotDataGenerator
.
__init__
(
self
)
...
...
@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
feat_idx
,
feat_value
,
label
=
self
.
_process_line
(
line
)
s
=
""
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
for
i
in
[(
'feat_idx'
,
feat_idx
),
(
'feat_value'
,
feat_value
),
(
'label'
,
label
)]:
k
=
i
[
0
]
v
=
i
[
1
]
for
j
in
v
:
...
...
@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return
data_iter
reader
=
TrainReader
(
"../config.yaml"
)
reader
.
init
()
reader
.
run_from_stdin
()
models/recall/gnn/data_process.sh
浏览文件 @
c68f2694
...
...
@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
mkdir
test_data
mv
diginetica/test.txt test_data
models/recall/gnn/raw_data/convert_data.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
time
import
pickle
...
...
@@ -10,6 +24,7 @@ parser.add_argument(
help
=
'dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample'
)
opt
=
parser
.
parse_args
()
def
process_data
(
file_type
):
path
=
os
.
path
.
join
(
opt
.
data_dir
,
file_type
)
output_path
=
os
.
path
.
splitext
(
path
)[
0
]
+
".txt"
...
...
@@ -23,6 +38,7 @@ def process_data(file_type):
fout
.
write
(
str
(
data
[
i
][
1
]))
fout
.
write
(
"
\n
"
)
process_data
(
"train"
)
process_data
(
"test"
)
...
...
models/recall/gnn/raw_data/download.py
浏览文件 @
c68f2694
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
requests
import
sys
import
time
...
...
models/recall/readme.md
浏览文件 @
c68f2694
...
...
@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
| MOVIELENS | NCF | 0.688 | -- |
| -- | Youtube | -- | -- |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
models/recall/word2vec/prepare_data.sh
浏览文件 @
c68f2694
...
...
@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
tar
xzvf test_dir.tar
-C
raw_data
mv
raw_data/data/test_dir test_data/
rm
-rf
raw_data
models/treebased/README.md
浏览文件 @
c68f2694
...
...
@@ -24,4 +24,4 @@ TDM是为大规模推荐系统设计的、能承载任意先进模型来高效
-
如何组网?答:paddle封装了大量的深度学习OP,用户可以根据需求设计自己的网络结构。
-
训练数据如何组织?答:tdm的训练数据主要为:
`user/query emb`
加
`item`
的正样本,
`item`
需要映射到树的某个叶子节点。用户只需准备符合该构成的数据即可。负样本的生成,会基于用户提供的树结构,以及paddle提供的
`tdm-sampler op`
完成高效的负采样,并自动添加相应的label,参与tdm中深度学习模型的训练。
-
大规模的数据与模型训练如何实现?答:基于paddle优秀的大规模参数服务器分布式能力,可以实现高效的分布式训练。基于paddle-fleet api,学习门槛极低,且可以灵活的支持增量训练,流式训练等业务需求。
3.
训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
\ No newline at end of file
3.
训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
models/treebased/tdm/tree/layer_list.txt
浏览文件 @
c68f2694
1,2
3,4,5,6
7,8,9,10,11,12,13
14,15,16,17,18,19,20,21,22,23,24,25
\ No newline at end of file
14,15,16,17,18,19,20,21,22,23,24,25
run.py
浏览文件 @
c68f2694
...
...
@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
engines
=
{}
device
=
[
"CPU"
,
"GPU"
]
clusters
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
]
engine_choices
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
,
"TDM_SINGLE"
,
"TDM_LOCAL_CLUSTER"
,
"TDM_CLUSTER"
]
engine_choices
=
[
"SINGLE"
,
"LOCAL_CLUSTER"
,
"CLUSTER"
,
"TDM_SINGLE"
,
"TDM_LOCAL_CLUSTER"
,
"TDM_CLUSTER"
]
custom_model
=
[
'TDM'
]
model_name
=
""
...
...
@@ -66,7 +68,8 @@ def get_engine(args):
engine
=
engine
.
upper
()
if
engine
not
in
engine_choices
:
raise
ValueError
(
"train.engin can not be chosen in {}"
.
format
(
engine_choices
))
raise
ValueError
(
"train.engin can not be chosen in {}"
.
format
(
engine_choices
))
print
(
"engines:
\n
{}"
.
format
(
engines
))
...
...
@@ -77,8 +80,10 @@ def get_engine(args):
def
get_transpiler
():
FNULL
=
open
(
os
.
devnull
,
'w'
)
cmd
=
[
"python"
,
"-c"
,
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
cmd
=
[
"python"
,
"-c"
,
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
proc
=
subprocess
.
Popen
(
cmd
,
stdout
=
FNULL
,
stderr
=
FNULL
,
cwd
=
os
.
getcwd
())
ret
=
proc
.
wait
()
if
ret
==
-
11
:
...
...
@@ -152,7 +157,8 @@ def cluster_engine(args):
update_workspace
(
flattens
)
envs
.
set_runtime_environs
(
flattens
)
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Submit Runtime Envs"
,
"Value"
)))
print
(
envs
.
pretty_print_envs
(
flattens
,
(
"Submit Runtime Envs"
,
"Value"
)))
launch
=
ClusterEngine
(
None
,
args
.
model
)
return
launch
...
...
@@ -163,7 +169,8 @@ def cluster_engine(args):
cluster_envs
=
{}
cluster_envs
[
"train.trainer.trainer"
]
=
trainer
cluster_envs
[
"train.trainer.engine"
]
=
"cluster"
cluster_envs
[
"train.trainer.threads"
]
=
envs
.
get_runtime_environ
(
"CPU_NUM"
)
cluster_envs
[
"train.trainer.threads"
]
=
envs
.
get_runtime_environ
(
"CPU_NUM"
)
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
print
(
"launch {} engine with cluster to with model: {}"
.
format
(
trainer
,
args
.
model
))
...
...
@@ -181,7 +188,8 @@ def cluster_engine(args):
def
cluster_mpi_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
cluster_envs
=
{}
cluster_envs
[
"train.trainer.trainer"
]
=
"CtrCodingTrainer"
...
...
@@ -209,7 +217,8 @@ def local_cluster_engine(args):
cluster_envs
[
"train.trainer.platform"
]
=
envs
.
get_platform
()
cluster_envs
[
"CPU_NUM"
]
=
"2"
print
(
"launch {} engine with cluster to run model: {}"
.
format
(
trainer
,
args
.
model
))
print
(
"launch {} engine with cluster to run model: {}"
.
format
(
trainer
,
args
.
model
))
set_runtime_envs
(
cluster_envs
,
args
.
model
)
launch
=
LocalClusterEngine
(
cluster_envs
,
args
.
model
)
...
...
@@ -217,10 +226,12 @@ def local_cluster_engine(args):
def
local_mpi_engine
(
args
):
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
print
(
"launch cluster engine with cluster to run model: {}"
.
format
(
args
.
model
))
from
paddlerec.core.engine.local_mpi
import
LocalMPIEngine
print
(
"use 1X1 MPI ClusterTraining at localhost to run model: {}"
.
format
(
args
.
model
))
print
(
"use 1X1 MPI ClusterTraining at localhost to run model: {}"
.
format
(
args
.
model
))
mpi
=
util
.
run_which
(
"mpirun"
)
if
not
mpi
:
...
...
setup.py
浏览文件 @
c68f2694
...
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
setup for paddle-rec.
"""
...
...
@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
import
shutil
import
tempfile
requires
=
[
"paddlepaddle == 1.7.2"
,
"pyyaml >= 5.1.1"
]
requires
=
[
"paddlepaddle == 1.7.2"
,
"pyyaml >= 5.1.1"
]
about
=
{}
about
[
"__title__"
]
=
"paddle-rec"
...
...
@@ -48,18 +43,27 @@ def build(dirname):
package_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
run_cmd
(
"cp -r {}/* {}"
.
format
(
package_dir
,
dirname
))
run_cmd
(
"mkdir {}"
.
format
(
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"core"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"doc"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"models"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tests"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tools"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"*.py"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"core"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"doc"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"models"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tests"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"tools"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
run_cmd
(
"mv {} {}"
.
format
(
os
.
path
.
join
(
dirname
,
"*.py"
),
os
.
path
.
join
(
dirname
,
"paddlerec"
)))
packages
=
find_packages
(
dirname
,
include
=
(
'paddlerec.*'
))
package_dir
=
{
''
:
dirname
}
package_data
=
{}
models_copy
=
[
'data/*.txt'
,
'data/*/*.txt'
,
'*.yaml'
,
'*.sh'
,
'tree/*.npy'
,
'tree/*.txt'
]
models_copy
=
[
'data/*.txt'
,
'data/*/*.txt'
,
'*.yaml'
,
'*.sh'
,
'tree/*.npy'
,
'tree/*.txt'
]
engine_copy
=
[
'*/*.sh'
]
for
package
in
packages
:
if
package
.
startswith
(
"paddlerec.models."
):
...
...
@@ -80,8 +84,7 @@ def build(dirname):
package_data
=
package_data
,
python_requires
=
">=2.7"
,
install_requires
=
requires
,
zip_safe
=
False
)
zip_safe
=
False
)
dirname
=
tempfile
.
mkdtemp
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录