情韵~ / PaddleRec (forked from PaddlePaddle / PaddleRec, in sync with the fork source)
Commit 7a3ec4e6
Authored May 19, 2020 by tangwei
Commit message: for mat
Parent: 801dfd34

Showing 55 changed files with 515 additions and 491 deletions (+515 -491)
core/engine/engine.py  +0 -1
core/engine/local_cluster.py  +0 -1
core/metric.py  +1 -1
core/metrics/auc_metrics.py  +1 -1
core/trainers/__init__.py  +26 -0
core/trainers/ctr_coding_trainer.py  +1 -1
core/trainers/ctr_modul_trainer.py  +1 -1
core/trainers/online_learning_trainer.py  +1 -1
core/trainers/single_trainer.py  +4 -3
core/trainers/tdm_cluster_trainer.py  +0 -1
core/trainers/tdm_single_trainer.py  +0 -1
core/trainers/transpiler_trainer.py  +3 -3
core/utils/dataset_holder.py  +2 -12
core/utils/fs.py  +13 -11
core/utils/table.py  +1 -1
models/contentunderstanding/classification/model.py  +2 -2
models/contentunderstanding/classification/reader.py  +4 -3
models/contentunderstanding/tagspace/model.py  +9 -11
models/contentunderstanding/tagspace/reader.py  +4 -5
models/match/dssm/model.py  +29 -28
models/match/dssm/synthetic_reader.py  +1 -1
models/match/multiview-simnet/evaluate_reader.py  +4 -3
models/match/multiview-simnet/generate_synthetic_data.py  +7 -3
models/match/multiview-simnet/model.py  +34 -31
models/match/multiview-simnet/reader.py  +4 -3
models/multitask/esmm/esmm_infer_reader.py  +12 -11
models/multitask/esmm/esmm_reader.py  +15 -13
models/multitask/esmm/model.py  +34 -36
models/multitask/mmoe/census_reader.py  +2 -2
models/multitask/mmoe/model.py  +31 -31
models/multitask/share-bottom/census_reader.py  +2 -2
models/multitask/share-bottom/model.py  +26 -26
models/rank/dcn/criteo_reader.py  +7 -7
models/rank/dcn/model.py  +10 -11
models/rank/deepfm/criteo_reader.py  +4 -3
models/rank/deepfm/model.py  +27 -27
models/rank/din/reader.py  +10 -15
models/rank/wide_deep/model.py  +38 -29
models/rank/wide_deep/reader.py  +4 -3
models/rank/xdeepfm/criteo_reader.py  +4 -4
models/rank/xdeepfm/model.py  +14 -14
models/recall/gnn/evaluate_reader.py  +9 -7
models/recall/gnn/model.py  +64 -64
models/recall/gnn/reader.py  +9 -7
models/recall/gru4rec/model.py  +0 -2
models/recall/ssr/model.py  +4 -6
models/recall/ssr/ssr_infer_reader.py  +1 -3
models/recall/ssr/ssr_reader.py  +0 -2
models/recall/word2vec/preprocess.py  +14 -16
models/recall/word2vec/w2v_evaluate_reader.py  +7 -8
models/recall/word2vec/w2v_reader.py  +4 -4
models/treebased/tdm/model.py  +8 -8
models/treebased/tdm/tdm_evaluate_reader.py  +1 -0
models/treebased/tdm/tdm_reader.py  +1 -0
setup.py  +1 -1
core/engine/engine.py

@@ -29,4 +29,3 @@ class Engine:
    @abc.abstractmethod
    def run(self):
        pass
core/engine/local_cluster.py

@@ -20,7 +20,6 @@ import os
import sys
import subprocess

from paddlerec.core.engine.engine import Engine
from paddlerec.core.utils import envs
core/metric.py

@@ -53,7 +53,7 @@ class Metric(object):
        pass

    @abc.abstractmethod
-    def get_result_to_string(self):
+    def __str__(self):
        """
        Return:
            result(string) : calculate result with string format, for output
core/metrics/auc_metrics.py

@@ -200,7 +200,7 @@ class AUCMetric(Metric):
        """ """
        return self._result

-    def get_result_to_string(self):
+    def __str__(self):
        """ """
        result = self.get_result()
        result_str = "%s AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f " \
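Both metric classes above swap the bespoke get_result_to_string method for Python's __str__ hook, so a metric object can be handed straight to print() or "%s" formatting. A minimal sketch of the pattern; the DummyMetric class and its fields are illustrative, not part of PaddleRec:

    class DummyMetric(object):
        """Illustrative stand-in for a Metric subclass."""

        def __init__(self, name, auc):
            self._name = name
            self._auc = auc

        def __str__(self):
            # Same idea as AUCMetric.__str__: format the computed result.
            return "%s AUC=%.6f" % (self._name, self._auc)

    metric = DummyMetric("train", 0.873215)
    print(metric)         # str() runs implicitly -> "train AUC=0.873215"
    line = "%s" % metric  # works anywhere a string conversion happens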
core/trainers/__init__.py (new file)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
trainer implement.
↗ (single/cluster) CtrTrainer
Trainer
↗ (for single training) SingleTrainer/TDMSingleTrainer
↘ TranspilerTrainer → (for cluster training) ClusterTrainer/TDMClusterTrainer
↘ (for online learning training) OnlineLearningTrainer
"""
core/trainers/ctr_coding_trainer.py

@@ -23,7 +23,7 @@ from paddlerec.core.utils import envs
from paddlerec.core.trainer import Trainer


-class CtrPaddleTrainer(Trainer):
+class CtrTrainer(Trainer):
    """R
    """
core/trainers/ctr_modul_trainer.py

@@ -72,7 +72,7 @@ def worker_numric_max(value, env="mpi"):
    return wroker_numric_opt(value, env, "max")


-class CtrPaddleTrainer(Trainer):
+class CtrTrainer(Trainer):
    """R
    """
core/trainers/online_learning_trainer.py

@@ -31,7 +31,7 @@ from paddlerec.core.utils import envs
from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer


-class ClusterTrainer(TranspileTrainer):
+class OnlineLearningTrainer(TranspileTrainer):
    def processor_register(self):
        role = PaddleCloudRoleMaker()
        fleet.init(role)
core/trainers/single_trainer.py

@@ -36,7 +36,8 @@ class SingleTrainer(TranspileTrainer):
        self.regist_context_processor('uninit', self.instance)
        self.regist_context_processor('init_pass', self.init)
        self.regist_context_processor('startup_pass', self.startup)
        if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
            self.regist_context_processor('train_pass', self.dataset_train)
        else:
            self.regist_context_processor('train_pass', self.dataloader_train)

@@ -122,8 +123,8 @@ class SingleTrainer(TranspileTrainer):
                fetch_info=self.fetch_alias,
                print_period=self.fetch_period)
            end_time = time.time()
            times = end_time - begin_time
            print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
            self.save(i, "train", is_fleet=False)
        context['status'] = 'infer_pass'
core/trainers/tdm_cluster_trainer.py

@@ -27,7 +27,6 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import f
from paddlerec.core.utils import envs
from paddlerec.core.trainers.cluster_trainer import ClusterTrainer

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
core/trainers/tdm_single_trainer.py

@@ -24,7 +24,6 @@ import paddle.fluid as fluid
from paddlerec.core.trainers.single_trainer import SingleTrainer
from paddlerec.core.utils import envs

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
core/trainers/transpiler_trainer.py

@@ -147,8 +147,8 @@ class TranspileTrainer(Trainer):
        if not need_save(epoch_id, save_interval, False):
            return

        # print("save inference model is not supported now.")
        # return
        feed_varnames = envs.get_global_env("save.inference.feed_varnames", None, namespace)

@@ -248,7 +248,7 @@ class TranspileTrainer(Trainer):
            'evaluate_model_path', "", namespace='evaluate'))]

        is_return_numpy = envs.get_global_env('is_return_numpy', True, namespace='evaluate')

        for (epoch, model_dir) in model_list:
            print("Begin to infer No.{} model, model_dir: {}".format(
core/utils/dataset.py → core/utils/dataset_holder.py (renamed)

@@ -22,7 +22,7 @@ from paddlerec.core.utils import fs as fs
from paddlerec.core.utils import util as util


-class Dataset(object):
+class DatasetHolder(object):
    """
    Dataset Base
    """

@@ -62,7 +62,7 @@ class Dataset(object):
        pass


-class TimeSplitDataset(Dataset):
+class TimeSplitDatasetHolder(DatasetHolder):
    """
    Dataset with time split dir. root_path/$DAY/$HOUR
    """

@@ -142,16 +142,6 @@ class TimeSplitDataset(Dataset):
            data_time = data_time + datetime.timedelta(minutes=self._split_interval)
        return data_file_list

-class FluidTimeSplitDataset(TimeSplitDataset):
-    """
-    A Dataset with time split for PaddleFluid
-    """
-
-    def __init__(self, config):
-        """ """
-        TimeSplitDataset.__init__(self, config)
-
-    def _alloc_dataset(self, file_list):
-        """ """
-        dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type'])
core/utils/fs.py

@@ -29,12 +29,12 @@ class LocalFSClient(object):
    """
    Util for local disk file_system io
    """

    def __init__(self):
        """R
        """
        pass

    def write(self, content, path, mode):
        """
        write to file

@@ -44,7 +44,7 @@ class LocalFSClient(object):
        mode(string): w/a  w:clear_write a:append_write
        """
        temp_dir = os.path.dirname(path)
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        f = open(path, mode)
        f.write(content)

@@ -76,7 +76,7 @@ class LocalFSClient(object):
        """R
        """
        os.system("rm -rf " + path)

    def is_exist(self, path):
        """R
        """

@@ -95,13 +95,14 @@ class FileHandler(object):
    """
    A Smart file handler. auto judge local/afs by path
    """

    def __init__(self, config):
        """R
        """
        if 'fs_name' in config:
            hadoop_home = "$HADOOP_HOME"
            hdfs_configs = {
                "hadoop.job.ugi": config['fs_ugi'],
                "fs.default.name": config['fs_name']
            }
            self._hdfs_client = HDFSClient(hadoop_home, hdfs_configs)

@@ -132,7 +133,8 @@ class FileHandler(object):
            if mode.find('a') >= 0:
                org_content = self._hdfs_client.cat(dest_path)
                content = content + org_content
-            self._local_fs_client.write(content, temp_local_file, mode)  #fleet hdfs_client only support upload, so write tmp file
+            self._local_fs_client.write(content, temp_local_file, mode)  # fleet hdfs_client only support upload, so write tmp file
            self._hdfs_client.delete(dest_path + ".tmp")
            self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
            self._hdfs_client.delete(dest_path + ".bak")

@@ -140,7 +142,7 @@ class FileHandler(object):
            self._hdfs_client.rename(dest_path + ".tmp", dest_path)
        else:
            self._local_fs_client.write(content, dest_path, mode)

    def cat(self, path):
        """R
        """

@@ -149,7 +151,7 @@ class FileHandler(object):
            return hdfs_cat
        else:
            return self._local_fs_client.cat(path)

    def ls(self, path):
        """R
        """

@@ -161,7 +163,7 @@ class FileHandler(object):
        files = self._local_fs_client.ls(path)
        files = [path + '/' + fi for fi in files]  # absulte path
        return files

    def cp(self, org_path, dest_path):
        """R
        """

@@ -171,6 +173,6 @@ class FileHandler(object):
            return self._local_fs_client.cp(org_path, dest_path)
        if not org_is_afs and dest_is_afs:
            return self._hdfs_client.upload(dest_path, org_path)
        if org_is_afs and not dest_is_afs:
            return self._hdfs_client.download(org_path, dest_path)
        print("Not Suppor hdfs cp currently")
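FileHandler keys its behavior off the path itself: anything that looks like an AFS/HDFS URI goes through the HDFS client, everything else through the local client. A minimal sketch of that dispatch idea, with a hypothetical is_afs_path helper (PaddleRec's real check lives inside FileHandler and is not shown in this diff):

    def is_afs_path(path):
        # Hypothetical check: treat AFS/HDFS URIs as remote, all else as local.
        return path.startswith("afs:") or path.startswith("hdfs:")

    class DemoFileHandler(object):
        """Illustrative local/remote dispatcher in the spirit of FileHandler."""

        def __init__(self, local_client, hdfs_client):
            self._local = local_client
            self._hdfs = hdfs_client

        def cat(self, path):
            # Pick the backend from the path prefix, mirroring
            # "auto judge local/afs by path" in the docstring above.
            client = self._hdfs if is_afs_path(path) else self._local
            return client.cat(path)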
core/utils/table.py

@@ -18,7 +18,7 @@ class TableMeta(object):
    Simple ParamTable Meta, Contain table_id
    """
    TableId = 1

    @staticmethod
    def alloc_new_table(table_id):
        """
models/contentunderstanding/classification/model.py

@@ -30,7 +30,7 @@ class Model(ModelBase):
    def train_net(self):
        """ network definition """
        data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64')
        label = fluid.data(name="label", shape=[None, 1], dtype='int64')
        seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')

@@ -54,7 +54,7 @@ class Model(ModelBase):
        prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax")
        cost = fluid.layers.cross_entropy(input=prediction, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        acc = fluid.layers.accuracy(input=prediction, label=label)
        self.cost = avg_cost
        self._metrics["acc"] = acc
models/contentunderstanding/classification/reader.py

@@ -22,12 +22,12 @@ class TrainReader(Reader):
    def init(self):
        pass

    def _process_line(self, l):
        l = l.strip().split(" ")
        data = l[0:10]
        seq_len = l[10:11]
        label = l[11:]
        return data, label, seq_len

    def generate_sample(self, line):
        def data_iter():

@@ -38,6 +38,7 @@ class TrainReader(Reader):
            data = [int(i) for i in data]
            label = [int(i) for i in label]
            seq_len = [int(i) for i in seq_len]
            print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
            yield [('data', data), ('label', label), ('seq_len', seq_len)]

        return data_iter
models/contentunderstanding/tagspace/model.py

@@ -18,6 +18,7 @@ import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf

from paddlerec.core.model import Model as ModelBase
+from paddlerec.core.utils import envs


class Model(ModelBase):

@@ -25,14 +26,13 @@ class Model(ModelBase):
        ModelBase.__init__(self, config)
        self.cost = None
        self.metrics = {}
-        self.vocab_text_size = 11447  #envs.get_global_env("vocab_text_size", None, self._namespace)
-        self.vocab_tag_size = 4  #envs.get_global_env("vocab_tag_size", None, self._namespace)
-        self.emb_dim = 10  #envs.get_global_env("emb_dim", None, self._namespace)
-        self.hid_dim = 1000  #envs.get_global_env("hid_dim", None, self._namespace)
-        self.win_size = 5  #envs.get_global_env("win_size", None, self._namespace)
-        self.margin = 0.1  #envs.get_global_env("margin", None, self._namespace)
-        self.neg_size = 3  #envs.get_global_env("neg_size", None, self._namespace)
-        print self.emb_dim
+        self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace)
+        self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace)
+        self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
+        self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
+        self.win_size = envs.get_global_env("win_size", None, self._namespace)
+        self.margin = envs.get_global_env("margin", None, self._namespace)
+        self.neg_size = envs.get_global_env("neg_size", None, self._namespace)

    def train_net(self):
        """ network definition """

@@ -96,11 +96,9 @@ class Model(ModelBase):
        return self.metrics

    def optimizer(self):
-        learning_rate = 0.01  # envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
+        learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
-        #sgd_optimizer.minimize(avg_cost)
        return sgd_optimizer

    def infer_net(self, parameter_list):
        self.train_net()
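This hunk swaps hard-coded hyperparameters for config lookups, so values like emb_dim come from the YAML namespace instead of the source. A minimal sketch of what such a namespaced lookup with a default might look like; this is illustrative, not PaddleRec's actual envs implementation:

    # Illustrative flat config store keyed by "namespace.key".
    _GLOBAL_ENVS = {
        "train.model.hyper_parameters.emb_dim": 10,
        "train.model.hyper_parameters.base_lr": 0.01,
    }

    def get_global_env(key, default=None, namespace=""):
        # Fall back to the default when the config omits the key,
        # matching the (key, default, namespace) call sites above.
        full_key = "%s.%s" % (namespace, key) if namespace else key
        return _GLOBAL_ENVS.get(full_key, default)

    emb_dim = get_global_env("hyper_parameters.emb_dim", None, "train.model")  # -> 10
    margin = get_global_env("hyper_parameters.margin", 0.1, "train.model")     # -> 0.1 (default)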
models/contentunderstanding/tagspace/reader.py

@@ -19,11 +19,12 @@ import numpy as np
from paddlerec.core.reader import Reader


class TrainReader(Reader):
    def init(self):
        pass

    def _process_line(self, l):
        tag_size = 4
        neg_size = 3
        l = l.strip().split(",")

@@ -46,10 +47,7 @@ class TrainReader(Reader):
                neg_index = rand_i
                neg_tag.append(neg_index)
                sum_n += 1
-        # if n > 0 and len(text) > n:
-        #     #yield None
-        #     return None, None, None
        return text, pos_tag, neg_tag

    def generate_sample(self, line):
        def data_iter():

@@ -58,4 +56,5 @@ class TrainReader(Reader):
                yield None
                return
            yield [('text', text), ('pos_tag', pos_tag), ('neg_tag', neg_tag)]

        return data_iter
models/match/dssm/model.py

@@ -24,11 +24,12 @@ class Model(ModelBase):
    def input(self):
        TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
        Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace)

        self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
        self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
        self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i in range(Neg)]
        self._data_var.append(self.query)
        self._data_var.append(self.doc_pos)
        for input in self.doc_negs:

@@ -37,40 +38,40 @@ class Model(ModelBase):
        if self._platform != "LINUX":
            self._data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)

    def net(self, is_infer=False):
        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
        hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace)

        def fc(data, hidden_layers, hidden_acts, names):
            fc_inputs = [data]
            for i in range(len(hidden_layers)):
                xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
                out = fluid.layers.fc(input=fc_inputs[-1],
                                      size=hidden_layers[i],
                                      act=hidden_acts[i],
                                      param_attr=xavier,
                                      bias_attr=xavier,
                                      name=names[i])
                fc_inputs.append(out)
            return fc_inputs[-1]

        query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3'])
        doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
        self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)

        if is_infer:
            return

        R_Q_D_ns = []
        for i, doc_neg in enumerate(self.doc_negs):
            doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts,
                              ['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)])
            R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
        concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
        prob = fluid.layers.softmax(concat_Rs, axis=1)
        hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
        loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
        self.avg_cost = fluid.layers.mean(x=loss)

@@ -100,10 +101,10 @@ class Model(ModelBase):
        self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
        self._infer_data_var = [self.query, self.doc_pos]
        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)

    def infer_net(self):
        self.infer_input()
        self.net(is_infer=True)
        self.infer_results()
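The loss in net above is the standard DSSM objective: softmax over the cosine similarities of the query against one positive and Neg negative docs, then negative log-likelihood of the positive slot (fluid.layers.mean over the already-reduced scalar leaves it unchanged, matching avg_cost). A small numpy sketch of the same arithmetic, with illustrative random similarities for a batch of 4 and Neg=3:

    import numpy as np

    rng = np.random.default_rng(0)
    # Column 0 is R(Q, D+); columns 1..3 are R(Q, D-_i), as in concat_Rs.
    concat_Rs = rng.uniform(-1.0, 1.0, size=(4, 4))

    # fluid.layers.softmax(concat_Rs, axis=1) over the candidate axis
    exp = np.exp(concat_Rs - concat_Rs.max(axis=1, keepdims=True))
    prob = exp / exp.sum(axis=1, keepdims=True)

    # hit_prob: probability mass on the positive document (column 0)
    hit_prob = prob[:, 0:1]
    loss = -np.log(hit_prob).sum()  # -reduce_sum(log(hit_prob))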
models/match/dssm/synthetic_reader.py

@@ -37,7 +37,7 @@ class TrainReader(Reader):
            neg_docs = []
            for i in range(len(features) - 2):
                feature_names.append('doc_neg_' + str(i))
                neg_docs.append(map(float, features[i + 2].split(',')))

            yield zip(feature_names, [query] + [pos_doc] + neg_docs)
models/match/multiview-simnet/evaluate_reader.py

@@ -18,8 +18,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
    def init(self):
        self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
        self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
        self.all_slots = []
        for i in range(self.query_slots):

@@ -49,6 +49,7 @@ class EvaluateReader(Reader):
                if visit:
                    self._all_slots_dict[slot][0] = False
                else:
                    output[index][1].append(padding)

            yield output

        return data_iter
models/match/multiview-simnet/generate_synthetic_data.py

@@ -14,10 +14,12 @@
import random


class Dataset:
    def __init__(self):
        pass


class SyntheticDataset(Dataset):
    def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000):
        # ids are randomly generated

@@ -39,7 +41,7 @@ class SyntheticDataset(Dataset):
                for i in range(self.query_slot_num):
                    qslot = generate_ids(self.ids_per_slot,
                                         self.sparse_feature_dim)
                    qslot = [str(fea) + ':' + str(i) for fea in qslot]
                    query_slots += qslot
                for i in range(self.title_slot_num):
                    pt_slot = generate_ids(self.ids_per_slot,

@@ -50,7 +52,8 @@ class SyntheticDataset(Dataset):
                    for i in range(self.title_slot_num):
                        nt_slot = generate_ids(self.ids_per_slot,
                                               self.sparse_feature_dim)
                        nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in nt_slot]
                        neg_title_slots += nt_slot
                    yield query_slots + pos_title_slots + neg_title_slots
                else:

@@ -67,6 +70,7 @@ class SyntheticDataset(Dataset):
    def test(self):
        return self._reader_creator(False)


if __name__ == '__main__':
    sparse_feature_dim = 1000001
    query_slots = 1

@@ -75,7 +79,7 @@ if __name__ == '__main__':
    dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size)
    train_reader = dataset.train()
    test_reader = dataset.test()

    with open("data/train/train.txt", 'w') as fout:
        for data in train_reader():
            fout.write(' '.join(data))
models/match/multiview-simnet/model.py

@@ -19,6 +19,7 @@ import paddle.fluid.layers.control_flow as cf
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase


class BowEncoder(object):
    """ bow-encoder """

@@ -94,13 +95,14 @@ class SimpleEncoderFactory(object):
            rnn_encode = GrnnEncoder(hidden_size=enc_hid_size)
            return rnn_encode


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)
        self.init_config()

    def init_config(self):
        self._fetch_interval = 1
        query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace)
        title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace)
        query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace)

@@ -112,19 +114,19 @@ class Model(ModelBase):
            factory.create(query_encoder, query_encode_dim)
            for i in range(query_slots)
        ]
        self.title_encoders = [
            factory.create(title_encoder, title_encode_dim)
            for i in range(title_slots)
        ]
        self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
        self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace)
        self.emb_shape = [self.emb_size, self.emb_dim]
        self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
        self.margin = 0.1

    def input(self, is_train=True):
        self.q_slots = [
            fluid.data(
                name="%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
            for i in range(len(self.query_encoders))

@@ -135,22 +137,23 @@ class Model(ModelBase):
            for i in range(len(self.title_encoders))
        ]

        if is_train == False:
            return self.q_slots + self.pt_slots

        self.nt_slots = [
            fluid.data(
                name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)),
                shape=[None, 1],
                lod_level=1,
                dtype='int64')
            for i in range(len(self.title_encoders))
        ]

        return self.q_slots + self.pt_slots + self.nt_slots

    def train_input(self):
        res = self.input()
        self._data_var = res

        use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)

        if self._platform != "LINUX" or use_dataloader:
            self._data_loader = fluid.io.DataLoader.from_generator(

@@ -158,15 +161,15 @@ class Model(ModelBase):
    def get_acc(self, x, y):
        less = tensor.cast(cf.less_than(x, y), dtype='float32')
        label_ones = fluid.layers.fill_constant_batch_size_like(
            input=x, dtype='float32', shape=[-1, 1], value=1.0)
        correct = fluid.layers.reduce_sum(less)
        total = fluid.layers.reduce_sum(label_ones)
        acc = fluid.layers.elementwise_div(correct, total)
        return acc

    def net(self):
        q_embs = [
            fluid.embedding(
                input=query, size=self.emb_shape, param_attr="emb")
            for query in self.q_slots

@@ -181,8 +184,8 @@ class Model(ModelBase):
                input=title, size=self.emb_shape, param_attr="emb")
            for title in self.nt_slots
        ]

        # encode each embedding field with encoder
        q_encodes = [
            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
        ]

@@ -198,7 +201,7 @@ class Model(ModelBase):
        pt_concat = fluid.layers.concat(pt_encodes)
        nt_concat = fluid.layers.concat(nt_encodes)

        # projection of hidden layer
        q_hid = fluid.layers.fc(q_concat,
                                size=self.hidden_size,
                                param_attr='q_fc.w',

@@ -216,7 +219,7 @@ class Model(ModelBase):
        cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
        cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)

        # pairwise hinge_loss
        loss_part1 = fluid.layers.elementwise_sub(
            tensor.fill_constant_batch_size_like(
                input=cos_pos,

@@ -233,7 +236,7 @@ class Model(ModelBase):
            loss_part2)

        self.avg_cost = fluid.layers.mean(loss_part3)
        self.acc = self.get_acc(cos_neg, cos_pos)

    def avg_loss(self):
        self._cost = self.avg_cost

@@ -250,19 +253,19 @@ class Model(ModelBase):
    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
        return optimizer

    def infer_input(self):
        res = self.input(is_train=False)
        self._infer_data_var = res

        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)

    def infer_net(self):
        self.infer_input()
        # lookup embedding for each slot
        q_embs = [
            fluid.embedding(
                input=query, size=self.emb_shape, param_attr="emb")

@@ -273,14 +276,14 @@ class Model(ModelBase):
                input=title, size=self.emb_shape, param_attr="emb")
            for title in self.pt_slots
        ]
        # encode each embedding field with encoder
        q_encodes = [
            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
        ]
        pt_encodes = [
            self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
        ]
        # concat multi view for query, pos_title, neg_title
        q_concat = fluid.layers.concat(q_encodes)
        pt_concat = fluid.layers.concat(pt_encodes)
        # projection of hidden layer
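get_acc(cos_neg, cos_pos) above is just the fraction of pairs ranked correctly, i.e. how often the positive title scores above the negative one. The same computation in plain numpy, with illustrative values:

    import numpy as np

    cos_pos = np.array([[0.8], [0.4], [0.6]])  # similarity to the positive title
    cos_neg = np.array([[0.3], [0.5], [0.1]])  # similarity to the negative title

    # cast(less_than(x, y)) with x=cos_neg, y=cos_pos: 1.0 where neg < pos
    less = (cos_neg < cos_pos).astype(np.float32)
    acc = less.sum() / len(less)  # reduce_sum(less) / reduce_sum(ones) -> 2/3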
models/match/multiview-simnet/reader.py

@@ -18,8 +18,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
    def init(self):
        self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
        self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
        self.all_slots = []
        for i in range(self.query_slots):

@@ -52,6 +52,7 @@ class TrainReader(Reader):
                if visit:
                    self._all_slots_dict[slot][0] = False
                else:
                    output[index][1].append(padding)

            yield output

        return data_iter
models/multitask/esmm/esmm_infer_reader.py

@@ -18,14 +18,14 @@ from collections import defaultdict
from paddlerec.core.reader import Reader


class EvaluateReader(Reader):
    def init(self):
        all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
                        '125', '126', '127', '128', '129', '205', '206', '207', '210', '216',
                        '508', '509', '702', '853', '301']
        self.all_field_id_dict = defaultdict(int)
        for i, field_id in enumerate(all_field_id):
            self.all_field_id_dict[field_id] = [False, i]

    def generate_sample(self, line):
        """

@@ -39,25 +39,26 @@ class EvaluateReader(Reader):
            features = line.strip().split(',')
            ctr = int(features[1])
            cvr = int(features[2])

            padding = 0
-            output = [(field_id,[]) for field_id in self.all_field_id_dict]
+            output = [(field_id, []) for field_id in self.all_field_id_dict]

            for elem in features[4:]:
                field_id, feat_id = elem.strip().split(':')
                if field_id not in self.all_field_id_dict:
                    continue
                self.all_field_id_dict[field_id][0] = True
                index = self.all_field_id_dict[field_id][1]
                output[index][1].append(int(feat_id))

            for field_id in self.all_field_id_dict:
                visited, index = self.all_field_id_dict[field_id]
                if visited:
                    self.all_field_id_dict[field_id][0] = False
                else:
                    output[index][1].append(padding)
            output.append(('ctr', [ctr]))
            output.append(('cvr', [cvr]))
            yield output

        return reader
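Each sample line carries field_id:feat_id tokens; the reader buckets feature ids by field and pads fields that never appear, so every sample exposes the same 23 slots plus the two labels. A small illustrative run of that parsing logic on a made-up line (fake ids, two fields only):

    all_field_id = ['101', '205']                  # toy subset of the real 23 fields
    field_index = {f: i for i, f in enumerate(all_field_id)}

    line = "sid,1,0,extra,101:7,101:9,205:3"
    features = line.strip().split(',')
    ctr, cvr = int(features[1]), int(features[2])  # -> 1, 0

    output = [(field_id, []) for field_id in all_field_id]
    for elem in features[4:]:
        field_id, feat_id = elem.strip().split(':')
        if field_id in field_index:
            output[field_index[field_id]][1].append(int(feat_id))

    for field_id, feats in output:
        if not feats:  # pad fields that never appeared, as the reader does
            feats.append(0)

    assert output == [('101', [7, 9]), ('205', [3])]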
models/multitask/esmm/esmm_reader.py

@@ -21,11 +21,12 @@ from paddlerec.core.reader import Reader


class TrainReader(Reader):
    def init(self):
        all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
                        '125', '126', '127', '128', '129', '205', '206', '207', '210', '216',
                        '508', '509', '702', '853', '301']
        self.all_field_id_dict = defaultdict(int)
        for i, field_id in enumerate(all_field_id):
            self.all_field_id_dict[field_id] = [False, i]

    def generate_sample(self, line):
        """

@@ -37,30 +38,31 @@ class TrainReader(Reader):
        This function needs to be implemented by the user, based on data format
        """
        features = line.strip().split(',')

-        #ctr = list(map(int, features[1]))
-        #cvr = list(map(int, features[2]))
+        # ctr = list(map(int, features[1]))
+        # cvr = list(map(int, features[2]))
        ctr = int(features[1])
        cvr = int(features[2])

        padding = 0
-        output = [(field_id,[]) for field_id in self.all_field_id_dict]
+        output = [(field_id, []) for field_id in self.all_field_id_dict]

        for elem in features[4:]:
            field_id, feat_id = elem.strip().split(':')
            if field_id not in self.all_field_id_dict:
                continue
            self.all_field_id_dict[field_id][0] = True
            index = self.all_field_id_dict[field_id][1]
-            #feat_id = list(map(int, feat_id))
+            # feat_id = list(map(int, feat_id))
            output[index][1].append(int(feat_id))

        for field_id in self.all_field_id_dict:
            visited, index = self.all_field_id_dict[field_id]
            if visited:
                self.all_field_id_dict[field_id][0] = False
            else:
                output[index][1].append(padding)
        output.append(('ctr', [ctr]))
        output.append(('cvr', [cvr]))
        yield output

        return reader
models/multitask/esmm/model.py

@@ -23,71 +23,73 @@ class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def fc(self, tag, data, out_dim, active='prelu'):
        init_stddev = 1.0
        scales = 1.0 / np.sqrt(data.shape[1])

        p_attr = fluid.param_attr.ParamAttr(
            name='%s_weight' % tag,
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=init_stddev * scales))

        b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))

        out = fluid.layers.fc(input=data,
                              size=out_dim,
                              act=active,
                              param_attr=p_attr,
                              bias_attr=b_attr,
                              name=tag)
        return out

    def input_data(self):
        sparse_input_ids = [
            fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1)
            for i in range(0, 23)
        ]
        label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
        label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
        inputs = sparse_input_ids + [label_ctr] + [label_cvr]
        self._data_var.extend(inputs)
        return inputs

    def net(self, inputs, is_infer=False):
        vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
        embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace)

        emb = []
        for data in inputs[0:-2]:
            feat_emb = fluid.embedding(
                input=data,
                size=[vocab_size, embed_size],
                param_attr=fluid.ParamAttr(
                    name='dis_emb',
                    learning_rate=5,
                    initializer=fluid.initializer.Xavier(fan_in=embed_size, fan_out=embed_size)),
                is_sparse=True)
            field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
            emb.append(field_emb)
        concat_emb = fluid.layers.concat(emb, axis=1)

        # ctr
        active = 'relu'
        ctr_fc1 = self.fc('ctr_fc1', concat_emb, 200, active)
        ctr_fc2 = self.fc('ctr_fc2', ctr_fc1, 80, active)
        ctr_out = self.fc('ctr_out', ctr_fc2, 2, 'softmax')

        # cvr
        cvr_fc1 = self.fc('cvr_fc1', concat_emb, 200, active)
        cvr_fc2 = self.fc('cvr_fc2', cvr_fc1, 80, active)
        cvr_out = self.fc('cvr_out', cvr_fc2, 2, 'softmax')

        ctr_clk = inputs[-2]
        ctcvr_buy = inputs[-1]

        ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
        cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])

        ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
        ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)

        auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
        auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)

@@ -97,27 +99,23 @@ class Model(ModelBase):
            self._infer_results["AUC_ctcvr"] = auc_ctcvr
            return

        loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
        loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
        cost = loss_ctr + loss_ctcvr
        avg_cost = fluid.layers.mean(cost)

        self._cost = avg_cost
        self._metrics["AUC_ctr"] = auc_ctr
        self._metrics["BATCH_AUC_ctr"] = batch_auc_ctr
        self._metrics["AUC_ctcvr"] = auc_ctcvr
        self._metrics["BATCH_AUC_ctcvr"] = batch_auc_ctcvr

    def train_net(self):
        input_data = self.input_data()
        self.net(input_data)

    def infer_net(self):
        self._infer_data_var = self.input_data()
        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
        self.net(self._infer_data_var, is_infer=True)
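The key ESMM identity in net is p(ctcvr) = p(ctr) * p(cvr | ctr): the model multiplies the two tower outputs instead of training CVR on clicked samples only. A tiny numeric check of the probability bookkeeping, with illustrative values:

    import numpy as np

    ctr_prop_one = np.array([[0.30], [0.10]])  # P(click), column 1 of ctr_out
    cvr_prop_one = np.array([[0.20], [0.50]])  # P(buy | click), column 1 of cvr_out

    # elementwise_mul: P(click and buy)
    ctcvr_prop_one = ctr_prop_one * cvr_prop_one  # [[0.06], [0.05]]

    # concat([1 - p, p]) rebuilds a 2-class distribution for cross_entropy/auc
    ctcvr_prop = np.concatenate([1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
    assert np.allclose(ctcvr_prop.sum(axis=1), 1.0)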
models/multitask/mmoe/census_reader.py

@@ -43,8 +43,8 @@ class TrainReader(Reader):
                label_marital = [1, 0]
            elif int(l[0]) == 1:
                label_marital = [0, 1]
-            #label_income = np.array(label_income)
-            #label_marital = np.array(label_marital)
+            # label_income = np.array(label_income)
+            # label_marital = np.array(label_marital)
            feature_name = ["input", "label_income", "label_marital"]
            yield zip(feature_name, [data] + [label_income] + [label_marital])
models/multitask/mmoe/model.py

@@ -36,22 +36,21 @@ class Model(ModelBase):
        if is_infer:
            self._infer_data_var = [input_data, label_income, label_marital]
            self._infer_data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)

        self._data_var.extend([input_data, label_income, label_marital])
        # f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper
        expert_outputs = []
        for i in range(0, expert_num):
            expert_output = fluid.layers.fc(input=input_data,
                                            size=expert_size,
                                            act='relu',
                                            bias_attr=fluid.ParamAttr(learning_rate=1.0),
                                            name='expert_' + str(i))
            expert_outputs.append(expert_output)
        expert_concat = fluid.layers.concat(expert_outputs, axis=1)
-        expert_concat = fluid.layers.reshape(expert_concat,[-1, expert_num, expert_size])
+        expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size])

        # g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
        output_layers = []
        for i in range(0, gate_num):

@@ -61,52 +60,53 @@ class Model(ModelBase):
                bias_attr=fluid.ParamAttr(learning_rate=1.0),
                name='gate_' + str(i))
            # f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x))
            cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0)
            cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
            # Build tower layer
            cur_tower = fluid.layers.fc(input=cur_gate_expert,
                                        size=tower_size,
                                        act='relu',
                                        name='task_layer_' + str(i))
            out = fluid.layers.fc(input=cur_tower,
                                  size=2,
                                  act='softmax',
                                  name='out_' + str(i))
            output_layers.append(out)

        pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
        pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)

        label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
        label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])

        auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
            input=pred_income,
            label=fluid.layers.cast(x=label_income_1, dtype='int64'))
        auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
            input=pred_marital,
            label=fluid.layers.cast(x=label_marital_1, dtype='int64'))

        if is_infer:
            self._infer_results["AUC_income"] = auc_income
            self._infer_results["AUC_marital"] = auc_marital
            return

        cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
        cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)

        avg_cost_income = fluid.layers.mean(x=cost_income)
        avg_cost_marital = fluid.layers.mean(x=cost_marital)

        cost = avg_cost_income + avg_cost_marital

        self._cost = cost
        self._metrics["AUC_income"] = auc_income
        self._metrics["BATCH_AUC_income"] = batch_auc_1
        self._metrics["AUC_marital"] = auc_marital
        self._metrics["BATCH_AUC_marital"] = batch_auc_2

    def train_net(self):
        self.MMOE()

    def infer_net(self):
        self.MMOE(is_infer=True)
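The commented equations are the MMoE core: each task k mixes the shared experts with its own softmax gate, y_k = sum_i g^k(x)_i * f_i(x). A numpy sketch of that mixing step for one sample, with illustrative shapes (3 experts of size 4):

    import numpy as np

    expert_num, expert_size = 3, 4
    experts = np.arange(expert_num * expert_size, dtype=np.float32).reshape(
        expert_num, expert_size)                     # f_i(x), one row per expert
    gate = np.array([0.2, 0.5, 0.3], dtype=np.float32)  # g^k(x), softmax output

    # elementwise_mul(expert_concat, cur_gate, axis=0) then reduce_sum(dim=1):
    mixed = (experts * gate[:, None]).sum(axis=0)    # task k's input, shape (4,)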
models/multitask/share-bottom/census_reader.py

@@ -43,8 +43,8 @@ class TrainReader(Reader):
                label_marital = [1, 0]
            elif int(l[0]) == 1:
                label_marital = [0, 1]
-            #label_income = np.array(label_income)
-            #label_marital = np.array(label_marital)
+            # label_income = np.array(label_income)
+            # label_marital = np.array(label_marital)
            feature_name = ["input", "label_income", "label_marital"]
            yield zip(feature_name, [data] + [label_income] + [label_marital])
models/multitask/share-bottom/model.py

@@ -32,65 +32,65 @@ class Model(ModelBase):
        input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
        label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
        label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)

        if is_infer:
            self._infer_data_var = [input_data, label_income, label_marital]
            self._infer_data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)

        self._data_var.extend([input_data, label_income, label_marital])

        bottom_output = fluid.layers.fc(input=input_data,
                                        size=bottom_size,
                                        act='relu',
                                        bias_attr=fluid.ParamAttr(learning_rate=1.0),
                                        name='bottom_output')

        # Build tower layer from bottom layer
        output_layers = []
        for index in range(tower_nums):
            tower_layer = fluid.layers.fc(input=bottom_output,
                                          size=tower_size,
                                          act='relu',
                                          name='task_layer_' + str(index))
            output_layer = fluid.layers.fc(input=tower_layer,
                                           size=2,
                                           act='softmax',
                                           name='output_layer_' + str(index))
            output_layers.append(output_layer)

        pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
        pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)

        label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
        label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])

        auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
            input=pred_income,
            label=fluid.layers.cast(x=label_income_1, dtype='int64'))
        auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
            input=pred_marital,
            label=fluid.layers.cast(x=label_marital_1, dtype='int64'))

        if is_infer:
            self._infer_results["AUC_income"] = auc_income
            self._infer_results["AUC_marital"] = auc_marital
            return

        cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
        cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
        cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)

        avg_cost = fluid.layers.mean(x=cost)

        self._cost = avg_cost
        self._metrics["AUC_income"] = auc_income
        self._metrics["BATCH_AUC_income"] = batch_auc_1
        self._metrics["AUC_marital"] = auc_marital
        self._metrics["BATCH_AUC_marital"] = batch_auc_2

    def train_net(self):
        self.model()

    def infer_net(self):
        self.model(is_infer=True)
models/rank/dcn/criteo_reader.py

@@ -21,7 +21,6 @@ try:
except ImportError:
    import pickle

from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs

@@ -47,7 +46,7 @@ class TrainReader(Reader):
        self.label_feat_names = target + dense_feat_names + sparse_feat_names

        self.cat_feat_idx_dict_list = [{} for _ in range(26)]

        # TODO: set vocabulary dictionary
        vocab_dir = envs.get_global_env("feat_dict_name", None, "train.reader")
        for i in range(26):

@@ -55,7 +54,7 @@ class TrainReader(Reader):
            for line in open(os.path.join(vocab_dir, 'C' + str(i + 1) + '.txt')):
                self.cat_feat_idx_dict_list[i][line.strip()] = lookup_idx
                lookup_idx += 1

    def _process_line(self, line):
        features = line.rstrip('\n').split('\t')

@@ -73,20 +72,21 @@ class TrainReader(Reader):
                    if idx == 2 else math.log(1 + float(features[idx])))
        for idx in self.cat_idx_:
            if features[idx] == '' or features[idx] not in self.cat_feat_idx_dict_list[idx - 14]:
                label_feat_list[idx].append(0)
            else:
                label_feat_list[idx].append(self.cat_feat_idx_dict_list[idx - 14][features[idx]])
        label_feat_list[0].append(int(features[0]))
        return label_feat_list

    def generate_sample(self, line):
        """
        Read the data line by line and process it as a dictionary
        """

        def data_iter():
            label_feat_list = self._process_line(line)
            yield list(zip(self.label_feat_names, label_feat_list))

-        return data_iter
\ No newline at end of file
+        return data_iter
models/rank/dcn/model.py

@@ -23,7 +23,7 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def init_network(self):
        self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace)
        self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace)

@@ -50,7 +50,7 @@ class Model(ModelBase):
        self.net_input = None
        self.loss = None

    def _create_embedding_input(self, data_dict):
        # sparse embedding
        sparse_emb_dict = OrderedDict((name, fluid.embedding(

@@ -78,7 +78,7 @@ class Model(ModelBase):
        net_input = fluid.layers.concat([dense_input, sparse_input], axis=-1)
        return net_input

    def _deep_net(self, input, hidden_units, use_bn=False, is_test=False):
        for units in hidden_units:
            input = fluid.layers.fc(input=input, size=units)

@@ -95,7 +95,7 @@ class Model(ModelBase):
            [input_dim], dtype='float32', name=prefix + "_b")
        xw = fluid.layers.reduce_sum(x * w, dim=1, keep_dim=True)  # (N, 1)
        return x0 * xw + b + x, w

    def _cross_net(self, input, num_corss_layers):
        x = x0 = input
        l2_reg_cross_list = []

@@ -106,10 +106,10 @@ class Model(ModelBase):
                fluid.layers.concat(l2_reg_cross_list, axis=-1))
        return x, l2_reg_cross_loss

    def _l2_loss(self, w):
        return fluid.layers.reduce_sum(fluid.layers.square(w))

    def train_net(self):
        self.init_network()
        self.target_input = fluid.data(

@@ -118,14 +118,14 @@ class Model(ModelBase):
        for feat_name in self.feat_dims_dict:
            data_dict[feat_name] = fluid.data(name=feat_name, shape=[None, 1], dtype='float32')

        self.net_input = self._create_embedding_input(data_dict)

        deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False)
        cross_out, l2_reg_cross_loss = self._cross_net(self.net_input, self.cross_num)

        last_out = fluid.layers.concat([deep_out, cross_out], axis=-1)
        logit = fluid.layers.fc(last_out, 1)

@@ -141,7 +141,6 @@ class Model(ModelBase):
            input=prob_2d, label=label_int, slide_steps=0)
        self._metrics["AUC"] = auc_var
        self._metrics["BATCH_AUC"] = batch_auc_var

        # logloss
        logloss = fluid.layers.log_loss(self.prob, self.target_input)
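_cross_net stacks the DCN cross layer x_{l+1} = x_0 * (x_l^T w_l) + b_l + x_l, where xw = reduce_sum(x * w, dim=1, keep_dim=True) is the per-sample scalar x_l^T w_l. A numpy sketch of one cross layer, with illustrative shapes:

    import numpy as np

    rng = np.random.default_rng(1)
    x0 = rng.normal(size=(2, 5)).astype(np.float32)  # batch of 2, dim 5
    x = x0.copy()                                    # layer input x_l
    w = rng.normal(size=(5,)).astype(np.float32)
    b = np.zeros(5, dtype=np.float32)

    xw = (x * w).sum(axis=1, keepdims=True)  # x_l^T w, shape (2, 1)
    x_next = x0 * xw + b + x                 # broadcast: rank-1 update + residual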
models/rank/deepfm/criteo_reader.py

@@ -38,7 +38,7 @@ class TrainReader(Reader):
        self.categorical_range_ = range(14, 40)
        # load preprocessed feature dict
        self.feat_dict_name = envs.get_global_env("feat_dict_name", None,
                                                  "train.reader")
        self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))

    def _process_line(self, line):
        features = line.rstrip('\n').split('\t')

@@ -62,13 +62,14 @@ class TrainReader(Reader):
                feat_value.append(1.0)
        label = [int(features[0])]
        return feat_idx, feat_value, label

    def generate_sample(self, line):
        """
        Read the data line by line and process it as a dictionary
        """

        def data_iter():
            feat_idx, feat_value, label = self._process_line(line)
            yield [('feat_idx', feat_idx), ('feat_value', feat_value),
                   ('label', label)]

        return data_iter
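The slot format these readers yield, a list of (slot_name, values) pairs per input line, can be seen end to end in a self-contained sketch; the hash-based index below is only a stand-in for the real feat_dict_ lookup, and the demo line is fabricated:

def _process_line(line):
    fields = line.rstrip("\n").split("\t")
    label = [int(fields[0])]
    # stand-in for the preprocessed feature dict: map each raw categorical
    # string to a bounded integer id
    feat_idx = [hash(f) % 1000000 for f in fields[14:40]]
    feat_value = [1.0] * len(feat_idx)
    return feat_idx, feat_value, label

def generate_sample(line):
    def data_iter():
        feat_idx, feat_value, label = _process_line(line)
        yield [("feat_idx", feat_idx), ("feat_value", feat_value),
               ("label", label)]
    return data_iter

demo = "1\t" + "\t".join(["0.1"] * 13 + ["cat"] * 26)  # label + 13 dense + 26 categorical
print(next(generate_sample(demo)()))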
models/rank/deepfm/model.py

@@ -29,26 +29,27 @@ class Model(ModelBase):
        is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
        sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number", None, self._namespace)
        sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim", None, self._namespace)

        # ------------------------- network input --------------------------
        num_field = envs.get_global_env("hyper_parameters.num_field", None,
                                        self._namespace)
        raw_feat_idx = fluid.data(
            name='feat_idx', shape=[None, num_field],
            dtype='int64')  # None * num_field (default: 39)
        raw_feat_value = fluid.data(
            name='feat_value', shape=[None, num_field],
            dtype='float32')  # None * num_field
        self.label = fluid.data(
            name='label', shape=[None, 1], dtype='float32')  # None * 1

        feat_idx = fluid.layers.reshape(raw_feat_idx,
                                        [-1, 1])  # (None * num_field) * 1
        feat_value = fluid.layers.reshape(
            raw_feat_value, [-1, num_field, 1])  # None * num_field * 1

        # ------------------------- set _data_var --------------------------
        self._data_var.append(raw_feat_idx)
        self._data_var.append(raw_feat_value)
        self._data_var.append(self.label)
        if self._platform != "LINUX":
            self._data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._data_var, capacity=64,
                use_double_buffer=False, iterable=False)

        # ------------------------- first order term --------------------------
        reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
        first_weights_re = fluid.embedding(

@@ -66,7 +67,7 @@ class Model(ModelBase):
            first_weights_re, shape=[-1, num_field, 1])  # None * num_field * 1
        y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1)

        # ------------------------- second order term --------------------------
        feat_embeddings_re = fluid.embedding(
            input=feat_idx,

@@ -81,12 +82,12 @@ class Model(ModelBase):
        feat_embeddings = fluid.layers.reshape(
            feat_embeddings_re,
            shape=[-1, num_field,
                   sparse_feature_dim])  # None * num_field * embedding_size
        feat_embeddings = feat_embeddings * feat_value  # None * num_field * embedding_size

        # sum_square part
        summed_features_emb = fluid.layers.reduce_sum(
            feat_embeddings, 1)  # None * embedding_size
        summed_features_emb_square = fluid.layers.square(
            summed_features_emb)  # None * embedding_size

@@ -100,13 +101,12 @@ class Model(ModelBase):
            summed_features_emb_square - squared_sum_features_emb,
            1, keep_dim=True)  # None * 1

        # ------------------------- DNN --------------------------
        layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None,
                                          self._namespace)
        act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
        y_dnn = fluid.layers.reshape(feat_embeddings,
                                     [-1, num_field * sparse_feature_dim])
        for s in layer_sizes:
            y_dnn = fluid.layers.fc(
                input=y_dnn,

@@ -128,28 +128,28 @@ class Model(ModelBase):
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.TruncatedNormalInitializer(
                    loc=0.0, scale=init_value_)))

        # ------------------------- DeepFM --------------------------
        self.predict = fluid.layers.sigmoid(y_first_order + y_second_order +
                                            y_dnn)

    def train_net(self):
        self.deepfm_net()

        # ------------------------- Cost(logloss) --------------------------
        cost = fluid.layers.log_loss(input=self.predict, label=self.label)
        avg_cost = fluid.layers.reduce_sum(cost)
        self._cost = avg_cost

        # ------------------------- Metric(Auc) --------------------------
        predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
        label_int = fluid.layers.cast(self.label, 'int64')
        auc_var, batch_auc_var, _ = fluid.layers.auc(
            input=predict_2d, label=label_int, slide_steps=0)
        self._metrics["AUC"] = auc_var
        self._metrics["BATCH_AUC"] = batch_auc_var

@@ -159,4 +159,4 @@ class Model(ModelBase):
        return optimizer

    def infer_net(self, parameter_list):
        self.deepfm_net()
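The sum_square/square_sum pair above implements the FM identity sum_{i<j} <e_i, e_j> = 0.5 * ((sum_i e_i)^2 - sum_i (e_i)^2), applied per embedding dimension; a NumPy check with fabricated shapes:

import numpy as np

emb = np.random.rand(4, 39, 10)        # batch, num_field, embedding_size
summed_sq = emb.sum(axis=1) ** 2       # square of the field sum, (4, 10)
sq_summed = (emb ** 2).sum(axis=1)     # sum of squared fields, (4, 10)
y_second_order = 0.5 * (summed_sq - sq_summed).sum(axis=1, keepdims=True)

# brute-force comparison over all field pairs
brute = np.zeros((4, 1))
for i in range(39):
    for j in range(i + 1, 39):
        brute[:, 0] += (emb[:, i] * emb[:, j]).sum(axis=1)
print(np.allclose(y_second_order, brute))  # True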
models/rank/din/reader.py

@@ -32,9 +32,9 @@ class TrainReader(Reader):
        self.train_data_path = envs.get_global_env("train_data_path", None,
                                                   "train.reader")
        self.res = []
        self.max_len = 0

        data_file_list = os.listdir(self.train_data_path)
        for i in range(0, len(data_file_list)):
            train_data_file = os.path.join(self.train_data_path,
                                           data_file_list[i])
            with open(train_data_file, "r") as fin:
                for line in fin:

@@ -47,9 +47,6 @@ class TrainReader(Reader):
        self.batch_size = envs.get_global_env("batch_size", 32, "train.reader")
        self.group_size = self.batch_size * 20

    def _process_line(self, line):
        line = line.strip().split(';')
        hist = line[0].split()

@@ -58,22 +55,22 @@ class TrainReader(Reader):
        cate = [int(i) for i in cate]
        return [hist, cate, [int(line[2])], [int(line[3])], [float(line[4])]]

    def generate_sample(self, line):
        """
        Read the data line by line and process it as a dictionary
        """

        def data_iter():
            # feat_idx, feat_value, label = self._process_line(line)
            yield self._process_line(line)

        return data_iter

    def pad_batch_data(self, input, max_len):
        res = np.array([x + [0] * (max_len - len(x)) for x in input])
        res = res.astype("int64").reshape([-1, max_len])
        return res

    def make_data(self, b):
        max_len = max(len(x[0]) for x in b)
        item = self.pad_batch_data([x[0] for x in b], max_len)

@@ -81,7 +78,7 @@ class TrainReader(Reader):
        len_array = [len(x[0]) for x in b]
        mask = np.array(
            [[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape(
                [-1, max_len, 1])
        target_item_seq = np.array(
            [[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
        target_cat_seq = np.array(

@@ -93,7 +90,7 @@ class TrainReader(Reader):
                target_item_seq[i], target_cat_seq[i]
            ])
        return res

    def batch_reader(self, reader, batch_size, group_size):
        def batch_reader():
            bg = []

@@ -115,7 +112,7 @@ class TrainReader(Reader):
                    yield self.make_data(b)

        return batch_reader

    def base_read(self, file_dir):
        res = []
        for train_file in file_dir:

@@ -126,10 +123,8 @@ class TrainReader(Reader):
                    cate = line[1].split()
                    res.append([hist, cate, line[2], line[3], float(line[4])])
        return res

    def generate_batch_from_trainfiles(self, files):
        data_set = self.base_read(files)
        random.shuffle(data_set)
        return self.batch_reader(data_set, self.batch_size,
                                 self.batch_size * 20)
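pad_batch_data and the mask built in make_data work together: histories are right-padded with 0, and padded slots receive -1e9 so a later softmax over attention scores effectively ignores them. A tiny sketch with fabricated histories:

import numpy as np

histories = [[3, 7, 9], [5]]               # two user behavior sequences
max_len = max(len(h) for h in histories)

padded = np.array([h + [0] * (max_len - len(h)) for h in histories],
                  dtype="int64")
mask = np.array([[0] * len(h) + [-1e9] * (max_len - len(h))
                 for h in histories]).reshape([-1, max_len, 1])

print(padded)          # [[3 7 9] [5 0 0]]
print(mask[:, :, 0])   # [[ 0.e+00  0.e+00  0.e+00] [ 0.e+00 -1.e+09 -1.e+09]]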
models/rank/wide_deep/model.py

@@ -23,32 +23,39 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def wide_part(self, data):
        out = fluid.layers.fc(
            input=data,
            size=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.TruncatedNormal(
                    loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)),
            act=None,
            name='wide')
        return out

    def fc(self, data, hidden_units, active, tag):
        output = fluid.layers.fc(
            input=data,
            size=hidden_units,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.TruncatedNormal(
                    loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
            act=active,
            name=tag)
        return output

    def deep_part(self, data, hidden1_units, hidden2_units, hidden3_units):
        l1 = self.fc(data, hidden1_units, 'relu', 'l1')
        l2 = self.fc(l1, hidden2_units, 'relu', 'l2')
        l3 = self.fc(l2, hidden3_units, 'relu', 'l3')
        return l3

    def train_net(self):
        wide_input = fluid.data(
            name='wide_input', shape=[None, 8], dtype='float32')
        deep_input = fluid.data(
            name='deep_input', shape=[None, 58], dtype='float32')

@@ -62,31 +69,33 @@ class Model(ModelBase):
        hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units",
                                            25, self._namespace)
        wide_output = self.wide_part(wide_input)
        deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units,
                                     hidden3_units)

        wide_model = fluid.layers.fc(
            input=wide_output,
            size=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.TruncatedNormal(
                    loc=0.0, scale=1.0)),
            act=None,
            name='w_wide')
        deep_model = fluid.layers.fc(
            input=deep_output,
            size=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.TruncatedNormal(
                    loc=0.0, scale=1.0)),
            act=None,
            name='w_deep')

        prediction = fluid.layers.elementwise_add(wide_model, deep_model)
        pred = fluid.layers.sigmoid(
            fluid.layers.clip(prediction, min=-15.0, max=15.0),
            name="prediction")

        num_seqs = fluid.layers.create_tensor(dtype='int64')
        acc = fluid.layers.accuracy(
            input=pred,
            label=fluid.layers.cast(x=label, dtype='int64'),
            total=num_seqs)
        auc_var, batch_auc, auc_states = fluid.layers.auc(
            input=pred, label=fluid.layers.cast(x=label, dtype='int64'))

        self._metrics["AUC"] = auc_var
        self._metrics["BATCH_AUC"] = batch_auc
        self._metrics["ACC"] = acc

        cost = fluid.layers.sigmoid_cross_entropy_with_logits(
            x=prediction, label=label)
        avg_cost = fluid.layers.mean(cost)
        self._cost = avg_cost

@@ -96,4 +105,4 @@ class Model(ModelBase):
        return optimizer

    def infer_net(self, parameter_list):
        self.deepfm_net()
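The fusion of the two branches is a sum of single-logit heads, clipped before the sigmoid so neither branch can push the probability into a numerically saturated region; a toy rendering with invented logits:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

wide_logit = np.array([[0.7], [-20.0]])   # hypothetical branch outputs
deep_logit = np.array([[1.1], [2.0]])
logit = np.clip(wide_logit + deep_logit, -15.0, 15.0)  # mirrors the clip above
print(sigmoid(logit))   # [[~0.86], [~3.1e-07]]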
models/rank/wide_deep/reader.py

@@ -30,16 +30,17 @@ class TrainReader(Reader):
        line = line.strip().split(',')
        features = list(map(float, line))
        wide_feat = features[0:8]
        deep_feat = features[8:58 + 8]
        label = features[-1]
        return wide_feat, deep_feat, [label]

    def generate_sample(self, line):
        """
        Read the data line by line and process it as a dictionary
        """

        def data_iter():
            wide_feat, deep_deat, label = self._process_line(line)
            yield [('wide_input', wide_feat), ('deep_input', deep_deat),
                   ('label', label)]

        return data_iter
models/rank/xdeepfm/criteo_reader.py

@@ -22,10 +22,10 @@ except ImportError:
from paddlerec.core.reader import Reader


class TrainReader(Reader):
    def init(self):
        pass

    def _process_line(self, line):
        features = line.strip('\n').split('\t')
        feat_idx = []

@@ -35,11 +35,11 @@ class TrainReader(Reader):
            feat_value.append(1.0)
        label = [int(features[0])]
        return feat_idx, feat_value, label

    def generate_sample(self, line):
        def data_iter():
            feat_idx, feat_value, label = self._process_line(line)
            yield [('feat_idx', feat_idx), ('feat_value', feat_value),
                   ('label', label)]

        return data_iter
models/rank/xdeepfm/model.py

@@ -26,13 +26,13 @@ class Model(ModelBase):
        init_value_ = 0.1
        initer = fluid.initializer.TruncatedNormalInitializer(
            loc=0.0, scale=init_value_)

        is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
        sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number", None, self._namespace)
        sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim", None, self._namespace)

        # ------------------------- network input --------------------------
        num_field = envs.get_global_env("hyper_parameters.num_field", None,
                                        self._namespace)
        raw_feat_idx = fluid.data(
            name='feat_idx', shape=[None, num_field], dtype='int64')
        raw_feat_value = fluid.data(
            name='feat_value', shape=[None, num_field], dtype='float32')

@@ -51,16 +51,16 @@ class Model(ModelBase):
            feat_embeddings,
            [-1, num_field, sparse_feature_dim])  # None * num_field * embedding_size
        feat_embeddings = feat_embeddings * feat_value  # None * num_field * embedding_size

        # ------------------------- set _data_var --------------------------
        self._data_var.append(raw_feat_idx)
        self._data_var.append(raw_feat_value)
        self._data_var.append(self.label)
        if self._platform != "LINUX":
            self._data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._data_var, capacity=64,
                use_double_buffer=False, iterable=False)

        # -------------------- linear --------------------
        weights_linear = fluid.embedding(

@@ -78,7 +78,7 @@ class Model(ModelBase):
            default_initializer=fluid.initializer.ConstantInitializer(value=0))
        y_linear = fluid.layers.reduce_sum(
            (weights_linear * feat_value), 1) + b_linear

        # -------------------- CIN --------------------
        layer_sizes_cin = envs.get_global_env(
            "hyper_parameters.layer_sizes_cin", None, self._namespace)

@@ -89,7 +89,7 @@ class Model(ModelBase):
            X_0 = fluid.layers.reshape(
                fluid.layers.transpose(Xs[0], [0, 2, 1]),
                [-1, sparse_feature_dim, num_field,
                 1])  # None, embedding_size, num_field, 1
            X_k = fluid.layers.reshape(
                fluid.layers.transpose(Xs[-1], [0, 2, 1]),
                [-1, sparse_feature_dim, 1, last_s])  # None, embedding_size, 1, last_s

@@ -135,7 +135,7 @@ class Model(ModelBase):
        layer_sizes_dnn = envs.get_global_env(
            "hyper_parameters.layer_sizes_dnn", None, self._namespace)
        act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
        y_dnn = fluid.layers.reshape(feat_embeddings,
                                     [-1, num_field * sparse_feature_dim])
        for s in layer_sizes_dnn:
            y_dnn = fluid.layers.fc(
                input=y_dnn,
                size=s,

@@ -151,7 +151,7 @@ class Model(ModelBase):
        # ------------------- xDeepFM ------------------
        self.predict = fluid.layers.sigmoid(y_linear + y_cin + y_dnn)

    def train_net(self):
        self.xdeepfm_net()

@@ -163,15 +163,15 @@ class Model(ModelBase):
        predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
        label_int = fluid.layers.cast(self.label, 'int64')
        auc_var, batch_auc_var, _ = fluid.layers.auc(
            input=predict_2d, label=label_int, slide_steps=0)
        self._metrics["AUC"] = auc_var
        self._metrics["BATCH_AUC"] = batch_auc_var

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
                                            None, self._namespace)
        optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
        return optimizer

    def infer_net(self, parameter_list):
        self.xdeepfm_net()
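The X_0 / X_k reshapes above set up the CIN interaction as a broadcast outer product per embedding dimension: (None, emb, num_field, 1) times (None, emb, 1, last_s). A NumPy sketch with fabricated sizes, where the learned per-channel weights are a random stand-in:

import numpy as np

batch, emb_dim, num_field, last_s = 2, 4, 3, 5
X0 = np.random.rand(batch, emb_dim, num_field, 1)
Xk = np.random.rand(batch, emb_dim, 1, last_s)
Z = X0 * Xk                                  # (batch, emb_dim, num_field, last_s)

# each CIN output channel is a learned weighted sum over field pairs;
# 7 output channels here are arbitrary
W = np.random.rand(num_field * last_s, 7)
out = Z.reshape(batch, emb_dim, -1) @ W      # (batch, emb_dim, 7)
print(out.shape)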
models/recall/gnn/evaluate_reader.py

@@ -24,17 +24,17 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
    def init(self):
        self.batch_size = envs.get_global_env("batch_size", None,
                                              "evaluate.reader")

        self.input = []
        self.length = None

    def base_read(self, files):
        res = []
        for f in files:
            with open(f, "r") as fin:
                for line in fin:
                    line = line.strip().split('\t')
                    res.append(tuple([map(int, line[0].split(',')),
                                      int(line[1])]))
        return res

    def make_data(self, cur_batch, batch_size):

@@ -122,10 +122,11 @@ class EvaluateReader(Reader):
                else:
                    # Due to fixed batch_size, discard the remaining ins
                    return
                    # cur_batch = remain_data[i:]
                    # yield self.make_data(cur_batch, group_remain % batch_size)

        return _reader

    def generate_batch_from_trainfiles(self, files):
        self.input = self.base_read(files)
        self.length = len(self.input)

@@ -134,4 +135,5 @@ class EvaluateReader(Reader):
    def generate_sample(self, line):
        def data_iter():
            yield []

        return data_iter
models/recall/gnn/model.py

@@ -26,19 +26,19 @@ class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)
        self.init_config()

    def init_config(self):
        self._fetch_interval = 1
        self.items_num, self.ins_num = self.config_read(
            envs.get_global_env("hyper_parameters.config_path", None,
                                self._namespace))
        self.train_batch_size = envs.get_global_env("batch_size", None,
                                                    "train.reader")
        self.evaluate_batch_size = envs.get_global_env("batch_size", None,
                                                       "evaluate.reader")
        self.hidden_size = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim", None, self._namespace)
        self.step = envs.get_global_env(
            "hyper_parameters.gnn_propogation_steps", None, self._namespace)

    def config_read(self, config_path=None):
        if config_path is None:
            raise ValueError(
                "please set train.model.hyper_parameters.config_path at first")
        with open(config_path, "r") as fin:
            item_nums = int(fin.readline().strip())
            ins_nums = int(fin.readline().strip())

@@ -48,49 +48,49 @@ class Model(ModelBase):
        self.items = fluid.data(
            name="items", shape=[bs, -1],
            dtype="int64")  # [batch_size, uniq_max]
        self.seq_index = fluid.data(
            name="seq_index", shape=[bs, -1, 2],
            dtype="int32")  # [batch_size, seq_max, 2]
        self.last_index = fluid.data(
            name="last_index", shape=[bs, 2],
            dtype="int32")  # [batch_size, 2]
        self.adj_in = fluid.data(
            name="adj_in", shape=[bs, -1, -1],
            dtype="float32")  # [batch_size, seq_max, seq_max]
        self.adj_out = fluid.data(
            name="adj_out", shape=[bs, -1, -1],
            dtype="float32")  # [batch_size, seq_max, seq_max]
        self.mask = fluid.data(
            name="mask", shape=[bs, -1, 1],
            dtype="float32")  # [batch_size, seq_max, 1]
        self.label = fluid.data(
            name="label", shape=[bs, 1],
            dtype="int64")  # [batch_size, 1]

        res = [
            self.items, self.seq_index, self.last_index, self.adj_in,
            self.adj_out, self.mask, self.label
        ]
        return res

    def train_input(self):
        res = self.input(self.train_batch_size)
        self._data_var = res

        use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
                                             False, self._namespace)

        if self._platform != "LINUX" or use_dataloader:
            self._data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._data_var, capacity=256,
                use_double_buffer=False, iterable=False)

    def net(self, items_num, hidden_size, step, bs):
        stdv = 1.0 / math.sqrt(hidden_size)

        def embedding_layer(input, table_name, emb_dim,
                            initializer_instance=None):
            emb = fluid.embedding(
                input=input,
                size=[items_num, emb_dim],

@@ -98,10 +98,10 @@ class Model(ModelBase):
                    name=table_name, initializer=initializer_instance),
            )
            return emb

        sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
        items_emb = embedding_layer(self.items, "emb", hidden_size,
                                    sparse_initializer)
        pre_state = items_emb
        for i in range(step):
            pre_state = layers.reshape(
                x=pre_state, shape=[bs, -1, hidden_size])

@@ -114,7 +114,7 @@ class Model(ModelBase):
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
            state_out = layers.fc(
                input=pre_state,
                name="state_out",

@@ -124,13 +124,13 @@ class Model(ModelBase):
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

            state_adj_in = layers.matmul(
                self.adj_in, state_in)  # [batch_size, uniq_max, h]
            state_adj_out = layers.matmul(
                self.adj_out, state_out)  # [batch_size, uniq_max, h]

            gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

            gru_input = layers.reshape(
                x=gru_input, shape=[-1, hidden_size * 2])
            gru_fc = layers.fc(
                input=gru_input,

@@ -141,11 +141,11 @@ class Model(ModelBase):
                input=gru_fc,
                hidden=layers.reshape(
                    x=pre_state, shape=[-1, hidden_size]),
                size=3 * hidden_size)

        final_state = layers.reshape(
            pre_state, shape=[bs, -1, hidden_size])
        seq = layers.gather_nd(final_state, self.seq_index)
        last = layers.gather_nd(final_state, self.last_index)

        seq_fc = layers.fc(
            input=seq,
            name="seq_fc",

@@ -155,7 +155,7 @@ class Model(ModelBase):
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
        last_fc = layers.fc(
            input=last,
            name="last_fc",

@@ -165,22 +165,22 @@ class Model(ModelBase):
            num_flatten_dims=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, h]

        seq_fc_t = layers.transpose(
            seq_fc, perm=[1, 0, 2])  # [seq_max, batch_size, h]
        add = layers.elementwise_add(
            seq_fc_t, last_fc)  # [seq_max, batch_size, h]
        b = layers.create_parameter(
            shape=[hidden_size],
            dtype='float32',
            default_initializer=fluid.initializer.Constant(value=0.0))  # [h]

        add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]
        add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
        add_sigmoid = layers.transpose(
            add_sigmoid, perm=[1, 0, 2])  # [batch_size, seq_max, h]

        weight = layers.fc(
            input=add_sigmoid,
            name="weight_fc",

@@ -190,13 +190,13 @@ class Model(ModelBase):
            bias_attr=False,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
        weight *= self.mask
        weight_mask = layers.elementwise_mul(
            seq, weight, axis=0)  # [batch_size, seq_max, h]
        global_attention = layers.reduce_sum(
            weight_mask, dim=1)  # [batch_size, h]

        final_attention = layers.concat(
            [global_attention, last], axis=1)  # [batch_size, 2*h]
        final_attention_fc = layers.fc(
            input=final_attention,
            name="final_attention_fc",

@@ -204,14 +204,14 @@ class Model(ModelBase):
            bias_attr=False,
            act=None,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, h]

        # all_vocab = layers.create_global_var(
        #     shape=[items_num - 1],
        #     value=0,
        #     dtype="int64",
        #     persistable=True,
        #     name="all_vocab")
        all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
        all_vocab = fluid.layers.cast(
            x=fluid.layers.assign(all_vocab), dtype='int64')

@@ -221,13 +221,13 @@ class Model(ModelBase):
                name="emb",
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)),
            size=[items_num, hidden_size])  # [all_vocab, h]

        logits = layers.matmul(
            x=final_attention_fc, y=all_emb,
            transpose_y=True)  # [batch_size, all_vocab]
        softmax = layers.softmax_with_cross_entropy(
            logits=logits, label=self.label)  # [batch_size, 1]
        self.loss = layers.reduce_mean(softmax)  # [1]
        self.acc = layers.accuracy(input=logits, label=self.label, k=20)

@@ -250,7 +250,7 @@ class Model(ModelBase):
        decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
                                          self._namespace)
        decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
                                         self._namespace)
        l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
        optimizer = fluid.optimizer.Adam(
            learning_rate=fluid.layers.exponential_decay(
                learning_rate=learning_rate,
                decay_steps=decay_steps * step_per_epoch,

@@ -258,18 +258,18 @@ class Model(ModelBase):
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=l2))
        return optimizer

    def infer_input(self):
        self._reader_namespace = "evaluate.reader"
        res = self.input(self.evaluate_batch_size)
        self._infer_data_var = res

        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var, capacity=64,
            use_double_buffer=False, iterable=False)

    def infer_net(self):
        self.infer_input()
        self.net(self.items_num, self.hidden_size, self.step,
                 self.evaluate_batch_size)
        self._infer_results['acc'] = self.acc
        self._infer_results['loss'] = self.loss
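The readout above scores each timestep with sigmoid(seq_fc + last_fc + b), masks padded steps, and pools the sequence into a global vector concatenated with the last state. A toy NumPy recap with fabricated shapes, collapsing the learned seq_fc/last_fc projections into the raw states for brevity:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

seq_max, h = 5, 4
seq = np.random.rand(seq_max, h)        # per-step hidden states of one session
last = seq[-1]                          # state of the last click
w = np.random.rand(h)                   # stand-in for the weight_fc projection
score = sigmoid(seq + last) @ w         # one attention score per step
score *= np.array([1, 1, 1, 0, 0])      # mask: two padded steps contribute nothing
global_attention = (score[:, None] * seq).sum(axis=0)   # (h,)
final = np.concatenate([global_attention, last])        # (2*h,) readout vector
print(final.shape)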
models/recall/gnn/reader.py

@@ -24,17 +24,17 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
    def init(self):
        self.batch_size = envs.get_global_env("batch_size", None,
                                              "train.reader")

        self.input = []
        self.length = None

    def base_read(self, files):
        res = []
        for f in files:
            with open(f, "r") as fin:
                for line in fin:
                    line = line.strip().split('\t')
                    res.append(tuple([map(int, line[0].split(',')),
                                      int(line[1])]))
        return res

    def make_data(self, cur_batch, batch_size):

@@ -122,10 +122,11 @@ class TrainReader(Reader):
                else:
                    # Due to fixed batch_size, discard the remaining ins
                    return
                    # cur_batch = remain_data[i:]
                    # yield self.make_data(cur_batch, group_remain % batch_size)

        return _reader

    def generate_batch_from_trainfiles(self, files):
        self.input = self.base_read(files)
        self.length = len(self.input)

@@ -134,4 +135,5 @@ class TrainReader(Reader):
    def generate_sample(self, line):
        def data_iter():
            yield []

        return data_iter
models/recall/gru4rec/model.py

@@ -86,10 +86,8 @@ class Model(ModelBase):
        self._metrics["cost"] = avg_cost
        self._metrics["acc"] = acc

    def train_net(self):
        self.all_vocab_network()

    def infer_net(self):
        self.all_vocab_network(is_infer=True)
models/recall/ssr/model.py

@@ -51,6 +51,7 @@ class GrnnEncoder(object):
            bias_attr=self.param_name + ".bias")
        return fluid.layers.sequence_pool(input=gru_h, pool_type='max')


class PairwiseHingeLoss(object):
    def __init__(self, margin=0.8):
        self.margin = margin

@@ -67,6 +68,7 @@ class PairwiseHingeLoss(object):
                                loss_part2)
        return loss_part3


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

@@ -77,7 +79,6 @@ class Model(ModelBase):
        return correct

    def train(self):
        vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
                                         self._namespace)
        emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None,
                                      self._namespace)
        hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None,
                                          self._namespace)

@@ -121,16 +122,14 @@ class Model(ModelBase):
        hinge_loss = self.pairwise_hinge_loss.forward(cos_pos, cos_neg)
        avg_cost = fluid.layers.mean(hinge_loss)
        correct = self.get_correct(cos_neg, cos_pos)

        self._cost = avg_cost
        self._metrics["correct"] = correct
        self._metrics["hinge_loss"] = hinge_loss

    def train_net(self):
        self.train()

    def infer(self):
        vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
                                         self._namespace)
        emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None,
                                      self._namespace)

@@ -143,7 +142,7 @@ class Model(ModelBase):
        pos_label = fluid.data(name="pos_label", shape=[None, 1], dtype="int64")
        self._infer_data_var = [user_data, all_item_data, pos_label]
        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var, capacity=64,
            use_double_buffer=False, iterable=False)

        user_emb = fluid.embedding(
            input=user_data, size=[vocab_size, emb_dim], param_attr="emb.item")

@@ -170,6 +169,5 @@ class Model(ModelBase):
        self._infer_results['recall20'] = acc

    def infer_net(self):
        self.infer()
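PairwiseHingeLoss pushes the positive item's cosine above the sampled negative's by at least the margin; a toy rendering with invented scores:

import numpy as np

def pairwise_hinge(cos_pos, cos_neg, margin=0.8):
    # zero loss once cos_pos >= cos_neg + margin, linear penalty otherwise
    return np.maximum(0.0, margin + cos_neg - cos_pos)

cos_pos = np.array([0.9, 0.4])   # similarity to the clicked next item
cos_neg = np.array([0.1, 0.5])   # similarity to a sampled negative
print(pairwise_hinge(cos_pos, cos_neg))   # [0.  0.9]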
models/recall/ssr/ssr_infer_reader.py

@@ -20,12 +20,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs


class EvaluateReader(Reader):
    def init(self):
        self.vocab_size = envs.get_global_env("vocab_size", 10,
                                              "train.model.hyper_parameters")

    def generate_sample(self, line):
        """
        Read the data line by line and process it as a dictionary

@@ -41,6 +39,6 @@ class EvaluateReader(Reader):
            src = conv_ids[:boundary]
            pos_tgt = [conv_ids[boundary]]
            feature_name = ["user", "all_item", "p_item"]
            yield zip(feature_name, [src] +
                      [np.arange(self.vocab_size).astype("int64").tolist()] +
                      [pos_tgt])

        return reader
models/recall/ssr/ssr_reader.py

@@ -19,7 +19,6 @@ import random
from paddlerec.core.reader import Reader


class TrainReader(Reader):
    def init(self):
        pass

@@ -27,7 +26,6 @@ class TrainReader(Reader):
    def sample_neg_from_seq(self, seq):
        return seq[random.randint(0, len(seq) - 1)]

    def generate_sample(self, line):
        """
        Read the data line by line and process it as a dictionary
models/recall/word2vec/preprocess.py

@@ -20,11 +20,8 @@ import random
import re
import six
import argparse

prog = re.compile("[^a-z ]", flags=0)

@@ -78,7 +75,7 @@ def parse_args():
def text_strip(text):
    # English Preprocess Rule
    return prog.sub("", text.lower())

@@ -120,7 +117,7 @@ def filter_corpus(args):
    word_all_count = 0
    id_counts = []
    word_id = 0

    # read dict
    with io.open(args.dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            word, count = line.split()[0], int(line.split()[1])

@@ -130,13 +127,13 @@ def filter_corpus(args):
            id_counts.append(count)
            word_all_count += count

    # write word2id file
    print("write word2id file to : " + args.dict_path + "_word_to_id_")
    with io.open(args.dict_path + "_word_to_id_", 'w+',
                 encoding='utf-8') as fid:
        for k, v in word_to_id_.items():
            fid.write(k + " " + str(v) + '\n')

    # filter corpus and convert id
    if not os.path.exists(args.output_corpus_dir):
        os.makedirs(args.output_corpus_dir)
    for file in os.listdir(args.input_corpus_dir):

@@ -157,9 +154,9 @@ def filter_corpus(args):
                    count_w = id_counts[idx]
                    corpus_size = word_all_count
                    keep_prob = (
                        math.sqrt(count_w /
                                  (args.downsample * corpus_size)) + 1
                    ) * (args.downsample * corpus_size) / count_w
                    r_value = random.random()
                    if r_value > keep_prob:
                        continue

@@ -205,7 +202,7 @@ def build_dict(args):
    for item in item_to_remove:
        unk_sum += word_count[item]
        del word_count[item]

    # sort by count
    word_count[native_to_unicode('<UNK>')] = unk_sum
    word_count = sorted(
        word_count.items(), key=lambda word_count: -word_count[1])

@@ -227,17 +224,18 @@ def data_split(args):
    for file_ in files:
        with open(os.path.join(raw_data_dir, file_), 'r') as f:
            contents.extend(f.readlines())

    num = int(args.file_nums)
    lines_per_file = len(contents) / num
    print("contents: ", str(len(contents)))
    print("lines_per_file: ", str(lines_per_file))

    for i in range(1, num + 1):
        with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
            data = contents[(i - 1) * lines_per_file:min(
                i * lines_per_file, len(contents))]
            for line in data:
                fout.write(line)


if __name__ == "__main__":
    args = parse_args()
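filter_corpus applies Mikolov-style subsampling: a word with count count_w in a corpus of word_all_count tokens is kept with probability (sqrt(f/t) + 1) * t / f, where f = count_w / corpus_size and t = args.downsample. A worked sketch with made-up counts:

import math
import random

def keep_prob(count_w, corpus_size, downsample=1e-3):
    t_times_corpus = downsample * corpus_size
    return (math.sqrt(count_w / t_times_corpus) + 1) * t_times_corpus / count_w

corpus_size = 1000000
print(keep_prob(50000, corpus_size))   # frequent word -> ~0.16, mostly dropped
print(keep_prob(500, corpus_size))     # rare word -> > 1.0, always kept
print(random.random() > keep_prob(50000, corpus_size))  # True means "skip this token"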
models/recall/word2vec/w2v_evaluate_reader.py

@@ -22,7 +22,7 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
    def init(self):
        dict_path = envs.get_global_env("word_id_dict_path", None,
                                        "evaluate.reader")
        self.word_to_id = dict()
        self.id_to_word = dict()
        with io.open(dict_path, 'r', encoding='utf-8') as f:

@@ -48,19 +48,16 @@ class EvaluateReader(Reader):
        if isinstance(s, str):
            return True
        return False

    def _to_unicode(self, s, ignore_errors=False):
        if self._is_unicode(s):
            return s
        error_mode = "ignore" if ignore_errors else "strict"
        return s.decode("utf-8", errors=error_mode)

    def strip_lines(self, line, vocab):
        return self._replace_oov(vocab, self.native_to_unicode(line))

    def _replace_oov(self, original_vocab, line):
        """Replace out-of-vocab words with "<UNK>".
        This maintains compatibility with published results.

@@ -78,5 +75,7 @@ class EvaluateReader(Reader):
        def reader():
            features = self.strip_lines(line.lower(), self.word_to_id)
            features = features.split()
            yield [('analogy_a', [self.word_to_id[features[0]]]),
                   ('analogy_b', [self.word_to_id[features[1]]]),
                   ('analogy_c', [self.word_to_id[features[2]]]),
                   ('analogy_d', [self.word_to_id[features[3]]])]

        return reader
models/recall/word2vec/w2v_reader.py

@@ -40,7 +40,7 @@ class NumpyRandomInt(object):
class TrainReader(Reader):
    def init(self):
        dict_path = envs.get_global_env("word_count_dict_path", None,
                                        "train.reader")
        self.window_size = envs.get_global_env("hyper_parameters.window_size",
                                               None, "train.model")
        self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
                                           "train.model")
        self.with_shuffle_batch = envs.get_global_env(
            "hyper_parameters.with_shuffle_batch", None, "train.model")

@@ -75,7 +75,7 @@ class TrainReader(Reader):
            start_point = 0
        end_point = idx + target_window
        targets = words[start_point:idx] + words[idx + 1:end_point + 1]
        return targets

    def generate_sample(self, line):
        def reader():

@@ -87,7 +87,7 @@ class TrainReader(Reader):
                output = [('input_word', [int(target_id)]),
                          ('true_label', [int(context_id)])]
                if not self.with_shuffle_batch:
                    neg_array = self.cs.searchsorted(
                        np.random.sample(self.neg_num))
                    output += [('neg_label',
                                [int(str(i)) for i in neg_array])]
                yield output

        return reader
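The neg_label sampling above draws from the unigram distribution by inverting its CDF: self.cs holds cumulative probabilities, and searchsorted maps uniform samples to word ids. A sketch with made-up counts:

import numpy as np

counts = np.array([5, 1, 3, 1], dtype="float64")   # hypothetical word counts
cs = (counts / counts.sum()).cumsum()              # [0.5, 0.6, 0.9, 1.0]
neg_num = 5
neg_array = cs.searchsorted(np.random.sample(neg_num))
print(neg_array)   # ids drawn proportionally to frequency; id 0 on ~50% of draws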
models/treebased/tdm/model.py

@@ -134,7 +134,7 @@ class Model(ModelBase):
        sample_nodes_emb = [
            fluid.layers.reshape(sample_nodes_emb[i], [
                -1, self.neg_sampling_list[i] + self.output_positive,
                self.node_emb_size
            ]) for i in range(self.max_layers)
        ]

@@ -229,7 +229,7 @@ class Model(ModelBase):
                act=self.act,
                param_attr=fluid.ParamAttr(
                    name="trans.layer_fc.weight." + str(i)),
                bias_attr=fluid.ParamAttr(
                    name="trans.layer_fc.bias." + str(i)),
            ) for i in range(self.max_layers)
        ]

@@ -268,8 +268,8 @@ class Model(ModelBase):
                num_flatten_dims=2,
                act=self.act,
                param_attr=fluid.ParamAttr(
                    name="cls.concat_fc.weight." + str(i)),
                bias_attr=fluid.ParamAttr(
                    name="cls.concat_fc.bias." + str(i))
            ) for i in range(self.max_layers)
        ]

@@ -348,7 +348,7 @@ class Model(ModelBase):
            current_layer_node_num = self.first_layer_node.shape[1]
        else:
            current_layer_node_num = current_layer_node.shape[1] * \
                current_layer_node.shape[2]
        current_layer_node = fluid.layers.reshape(
            current_layer_node, [-1, current_layer_node_num])

@@ -458,7 +458,7 @@ class Model(ModelBase):
            param_attr=fluid.ParamAttr(
                name="trans.layer_fc.weight." + str(layer_idx)),
            bias_attr=fluid.ParamAttr(
                name="trans.layer_fc.bias." + str(layer_idx)),
        )
        return input_layer_fc_out

@@ -479,6 +479,6 @@ class Model(ModelBase):
            num_flatten_dims=2,
            act=self.act,
            param_attr=fluid.ParamAttr(
                name="cls.concat_fc.weight." + str(layer_idx)),
            bias_attr=fluid.ParamAttr(
                name="cls.concat_fc.bias." + str(layer_idx)))
        return hidden_states_fc
models/treebased/tdm/tdm_evaluate_reader.py

@@ -28,6 +28,7 @@ class EvaluateReader(Reader):
        """
        Read the data line by line and process it as a dictionary
        """

        def reader():
            """
            This function needs to be implemented by the user, based on data format
models/treebased/tdm/tdm_reader.py

@@ -28,6 +28,7 @@ class TrainReader(Reader):
        """
        Read the data line by line and process it as a dictionary
        """

        def reader():
            """
            This function needs to be implemented by the user, based on data format
setup.py

@@ -36,7 +36,7 @@ about["__author__"] = "paddle-dev"
about["__author_email__"] = "paddle-dev@baidu.com"
about["__url__"] = "https://github.com/PaddlePaddle/PaddleRec"

readme = ""


def run_cmd(command):