BaiXuePrincess / PaddleRec (forked from PaddlePaddle / PaddleRec, in sync with the fork source)
Commit 9b3afd7c (unverified), authored Jun 02, 2020 by wuzhihua, committed by GitHub on Jun 02, 2020

Merge pull request #18 from 123malin/modify_yaml

fix match/dssm

Parents: 730495a7, 13bafd53
Showing 24 changed files with 603 additions and 698 deletions (+603 -698):

- core/model.py (+7 -5)
- core/trainers/single_infer.py (+1 -0)
- core/trainers/single_trainer.py (+1 -7)
- models/match/dssm/config.yaml (+58 -36)
- models/match/dssm/model.py (+39 -86)
- models/match/dssm/synthetic_evaluate_reader.py (+1 -1)
- models/match/multiview-simnet/config.yaml (+65 -41)
- models/match/multiview-simnet/model.py (+63 -173)
- models/match/readme.md (+14 -2)
- models/recall/gnn/config.yaml (+63 -38)
- models/recall/gnn/data/convert_data.py (+0 -0, moved)
- models/recall/gnn/data/download.py (+0 -0, moved)
- models/recall/gnn/data/preprocess.py (+0 -0, moved)
- models/recall/gnn/data_prepare.sh (+7 -5, renamed)
- models/recall/gnn/evaluate_reader.py (+3 -3)
- models/recall/gnn/model.py (+75 -114)
- models/recall/gnn/reader.py (+2 -3)
- models/recall/readme.md (+36 -2)
- models/recall/word2vec/config.yaml (+62 -43)
- models/recall/word2vec/data_prepare.sh (+8 -7, renamed)
- models/recall/word2vec/model.py (+87 -121)
- models/recall/word2vec/preprocess.py (+1 -1)
- models/recall/word2vec/w2v_evaluate_reader.py (+5 -3)
- models/recall/word2vec/w2v_reader.py (+5 -7)
core/model.py

```diff
@@ -149,11 +149,13 @@ class Model(object):
         return optimizer_i

     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
-                                        self._namespace)
-        return self._build_optimizer(optimizer, learning_rate)
+        opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
+        opt_lr = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        opt_strategy = envs.get_global_env(
+            "hyper_parameters.optimizer.strategy")
+        return self._build_optimizer(opt_name, opt_lr, opt_strategy)

+    def input_data(self, is_infer=False, **kwargs):
+        name = "dataset." + kwargs.get("dataset_name") + "."
```
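The optimizer settings move from two flat keys to a nested `hyper_parameters.optimizer` mapping. A minimal sketch of how the refactored lookups line up with the new YAML layout (key values taken from the models/match/dssm/config.yaml diff below; the standalone-script framing is illustrative only):

```python
# Sketch only: how the refactored Model.optimizer() resolves the nested
# YAML introduced by this commit, e.g. in models/match/dssm/config.yaml:
#
#   hyper_parameters:
#     optimizer:
#       class: sgd
#       learning_rate: 0.01
#       strategy: async
from paddlerec.core.utils import envs

opt_name = envs.get_global_env("hyper_parameters.optimizer.class")         # "sgd"
opt_lr = envs.get_global_env("hyper_parameters.optimizer.learning_rate")   # 0.01
opt_strategy = envs.get_global_env("hyper_parameters.optimizer.strategy")  # "async"
```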
core/trainers/single_infer.py

```diff
@@ -167,6 +167,7 @@ class SingleInfer(TranspileTrainer):
         model = envs.lazy_instance_by_fliename(model_path,
                                                "Model")(self._env)
+        model._infer_data_var = model.input_data(is_infer=True, dataset_name=model_dict["dataset_name"])
         if envs.get_global_env("dataset." + dataset_name +
                                ".type") == "DataLoader":
```
core/trainers/single_trainer.py

```diff
@@ -147,11 +147,6 @@ class SingleTrainer(TranspileTrainer):
         startup_program = fluid.Program()
         scope = fluid.Scope()
         dataset_name = model_dict["dataset_name"]
-        opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
-        opt_lr = envs.get_global_env(
-            "hyper_parameters.optimizer.learning_rate")
-        opt_strategy = envs.get_global_env(
-            "hyper_parameters.optimizer.strategy")
         with fluid.program_guard(train_program, startup_program):
             with fluid.unique_name.guard():
                 with fluid.scope_guard(scope):
@@ -168,8 +163,7 @@ class SingleTrainer(TranspileTrainer):
                         self._get_dataloader(dataset_name,
                                              model._data_loader)
                     model.net(model._data_var, False)
-                    optimizer = model._build_optimizer(opt_name, opt_lr,
-                                                       opt_strategy)
+                    optimizer = model.optimizer()
                     optimizer.minimize(model._cost)
         self._model[model_dict["name"]][0] = train_program
         self._model[model_dict["name"]][1] = startup_program
```
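With this change the trainer no longer reads optimizer hyperparameters itself; it delegates to the model, which resolves the nested config shown above. A condensed sketch of the resulting build flow (names as in the diffs; the surrounding program and scope guards are elided):

```python
# Sketch only: the train-program build after this commit, scaffolding elided.
model.net(model._data_var, False)  # build the forward graph; sets model._cost
optimizer = model.optimizer()      # model resolves hyper_parameters.optimizer.*
optimizer.minimize(model._cost)    # attach backward and update ops
```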
models/match/dssm/config.yaml

```diff
@@ -11,44 +11,66 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-evaluate:
-  reader:
-    batch_size: 1
-    class: "{workspace}/synthetic_evaluate_reader.py"
-    test_data_path: "{workspace}/data/train"
-
-train:
-  trainer:
-    # for cluster training
-    strategy: "async"
-  epochs: 4
-  workspace: "paddlerec.models.match.dssm"
-  reader:
-    batch_size: 4
-    class: "{workspace}/synthetic_reader.py"
-    train_data_path: "{workspace}/data/train"
-  model:
-    models: "{workspace}/model.py"
-    hyper_parameters:
-      TRIGRAM_D: 1000
-      NEG: 4
-      fc_sizes: [300, 300, 128]
-      fc_acts: ['tanh', 'tanh', 'tanh']
-      learning_rate: 0.01
-      optimizer: sgd
-  save:
-    increment:
-      dirname: "increment"
-      epoch_interval: 2
-      save_last: True
-    inference:
-      dirname: "inference"
-      epoch_interval: 4
-      feed_varnames: ["query", "doc_pos"]
-      fetch_varnames: ["cos_sim_0.tmp_0"]
-      save_last: True
+workspace: "paddlerec.models.match.dssm"
+
+dataset:
+- name: dataset_train
+  batch_size: 4
+  type: QueueDataset
+  data_path: "{workspace}/data/train"
+  data_converter: "{workspace}/synthetic_reader.py"
+- name: dataset_infer
+  batch_size: 1
+  type: QueueDataset
+  data_path: "{workspace}/data/train"
+  data_converter: "{workspace}/synthetic_evaluate_reader.py"
+
+hyper_parameters:
+  optimizer:
+    class: sgd
+    learning_rate: 0.01
+    strategy: async
+  trigram_d: 1000
+  neg_num: 4
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['tanh', 'tanh', 'tanh']
+
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+  class: single_train
+  # num of epochs
+  epochs: 4
+  # device to run training or infer
+  device: cpu
+  save_checkpoint_interval: 2 # save model interval of epochs
+  save_inference_interval: 4 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  save_inference_feed_varnames: ["query", "doc_pos"] # feed vars of save inference
+  save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
+  init_model_path: "" # load model path
+  fetch_period: 2
+- name: infer_runner
+  class: single_infer
+  # num of epochs
+  epochs: 1
+  # device to run training or infer
+  device: cpu
+  fetch_period: 1
+  init_model_path: "increment/2" # load model path
+
+# runner will run all the phase in each epoch
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 1
+#- name: phase2
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  thread_num: 1
```
models/match/dssm/model.py

```diff
@@ -22,45 +22,39 @@ class Model(ModelBase):
     def __init__(self, config):
         ModelBase.__init__(self, config)

-    def input(self):
-        TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
-                                        self._namespace)
-        Neg = envs.get_global_env("hyper_parameters.NEG", None,
-                                  self._namespace)
-        self.query = fluid.data(
-            name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
-        self.doc_pos = fluid.data(
+    def _init_hyper_parameters(self):
+        self.trigram_d = envs.get_global_env("hyper_parameters.trigram_d")
+        self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
+        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
+        self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.learning_rate")
+
+    def input_data(self, is_infer=False, **kwargs):
+        query = fluid.data(
+            name="query",
+            shape=[-1, self.trigram_d],
+            dtype='float32',
+            lod_level=0)
+        doc_pos = fluid.data(
             name="doc_pos",
-            shape=[-1, TRIGRAM_D],
+            shape=[-1, self.trigram_d],
             dtype='float32',
             lod_level=0)
-        self.doc_negs = [
+
+        if is_infer:
+            return [query, doc_pos]
+
+        doc_negs = [
             fluid.data(
                 name="doc_neg_" + str(i),
-                shape=[-1, TRIGRAM_D],
+                shape=[-1, self.trigram_d],
                 dtype="float32",
-                lod_level=0) for i in range(Neg)
+                lod_level=0) for i in range(self.neg_num)
         ]
-        self._data_var.append(self.query)
-        self._data_var.append(self.doc_pos)
-        for input in self.doc_negs:
-            self._data_var.append(input)
-
-        if self._platform != "LINUX":
-            self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var,
-                capacity=64,
-                use_double_buffer=False,
-                iterable=False)
-
-    def net(self, is_infer=False):
-        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
-                                            self._namespace)
-        hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None,
-                                          self._namespace)
+        return [query, doc_pos] + doc_negs

+    def net(self, inputs, is_infer=False):
         def fc(data, hidden_layers, hidden_acts, names):
             fc_inputs = [data]
             for i in range(len(hidden_layers)):
@@ -77,71 +71,30 @@ class Model(ModelBase):
                 fc_inputs.append(out)
             return fc_inputs[-1]

-        query_fc = fc(self.query, hidden_layers, hidden_acts,
+        query_fc = fc(inputs[0], self.hidden_layers, self.hidden_acts,
                       ['query_l1', 'query_l2', 'query_l3'])
-        doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
+        doc_pos_fc = fc(inputs[1], self.hidden_layers, self.hidden_acts,
                         ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
-        self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
+        R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
+
+        if is_infer:
+            self._infer_results["query_doc_sim"] = R_Q_D_p
+            return

         R_Q_D_ns = []
-        for i, doc_neg in enumerate(self.doc_negs):
-            doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
-                'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
-                'doc_neg_l3_' + str(i)
-            ])
+        for i in range(len(inputs) - 2):
+            doc_neg_fc_i = fc(inputs[i + 2], self.hidden_layers,
+                              self.hidden_acts, [
+                                  'doc_neg_l1_' + str(i),
+                                  'doc_neg_l2_' + str(i),
+                                  'doc_neg_l3_' + str(i)
+                              ])
             R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
-        concat_Rs = fluid.layers.concat(
-            input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
+        concat_Rs = fluid.layers.concat(input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
         prob = fluid.layers.softmax(concat_Rs, axis=1)

         hit_prob = fluid.layers.slice(
             prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
         loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
-        self.avg_cost = fluid.layers.mean(x=loss)
-
-    def infer_results(self):
-        self._infer_results['query_doc_sim'] = self.R_Q_D_p
-
-    def avg_loss(self):
-        self._cost = self.avg_cost
-
-    def metrics(self):
-        self._metrics["LOSS"] = self.avg_cost
-
-    def train_net(self):
-        self.input()
-        self.net(is_infer=False)
-        self.avg_loss()
-        self.metrics()
-
-    def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        optimizer = fluid.optimizer.SGD(learning_rate)
-        return optimizer
-
-    def infer_input(self):
-        TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
-                                        self._namespace)
-        self.query = fluid.data(
-            name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
-        self.doc_pos = fluid.data(
-            name="doc_pos",
-            shape=[-1, TRIGRAM_D],
-            dtype='float32',
-            lod_level=0)
-        self._infer_data_var = [self.query, self.doc_pos]
-
-        self._infer_data_loader = fluid.io.DataLoader.from_generator(
-            feed_list=self._infer_data_var,
-            capacity=64,
-            use_double_buffer=False,
-            iterable=False)
-
-    def infer_net(self):
-        self.infer_input()
-        self.net(is_infer=True)
-        self.infer_results()
+        avg_cost = fluid.layers.mean(x=loss)
+        self._cost = avg_cost
+        self._metrics["LOSS"] = avg_cost
```
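After the rewrite, DSSM follows the new contract: `input_data()` returns a plain list of variables and `net(inputs, ...)` indexes into it positionally. The ordering implied by the diff, as a reading aid (the comment block below is not part of the commit):

```python
# inputs as returned by Model.input_data(), per the diff above:
#   inputs[0]  -> query      (shape [-1, trigram_d])
#   inputs[1]  -> doc_pos    (shape [-1, trigram_d])
#   inputs[2:] -> doc_neg_0 .. doc_neg_{neg_num - 1}   (train only)
# In infer mode input_data() returns just [query, doc_pos], and net()
# returns early after writing cos_sim(query_fc, doc_pos_fc) to
# self._infer_results["query_doc_sim"].
```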
models/match/dssm/synthetic_evaluate_reader.py

```diff
@@ -16,7 +16,7 @@ from __future__ import print_function
 from paddlerec.core.reader import Reader


-class EvaluateReader(Reader):
+class TrainReader(Reader):
     def init(self):
         pass
```
models/match/multiview-simnet/config.yaml

```diff
@@ -11,49 +11,73 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-evaluate:
-  workspace: "paddlerec.models.match.multiview-simnet"
-  reader:
-    batch_size: 2
-    class: "{workspace}/evaluate_reader.py"
-    test_data_path: "{workspace}/data/test"
-
-train:
-  trainer:
-    # for cluster training
-    strategy: "async"
-  epochs: 2
-  workspace: "paddlerec.models.match.multiview-simnet"
-  reader:
-    batch_size: 2
-    class: "{workspace}/reader.py"
-    train_data_path: "{workspace}/data/train"
-    dataset_class: "DataLoader"
-  model:
-    models: "{workspace}/model.py"
-    hyper_parameters:
-      use_DataLoader: True
-      query_encoder: "bow"
-      title_encoder: "bow"
-      query_encode_dim: 128
-      title_encode_dim: 128
-      query_slots: 1
-      title_slots: 1
-      sparse_feature_dim: 1000001
-      embedding_dim: 128
-      hidden_size: 128
-      learning_rate: 0.0001
-      optimizer: adam
-  save:
-    increment:
-      dirname: "increment"
-      epoch_interval: 1
-      save_last: True
-    inference:
-      dirname: "inference"
-      epoch_interval: 1
-      save_last: True
+# workspace
+workspace: "paddlerec.models.match.multiview-simnet"
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 2
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/train"
+  sparse_slots: "1 2 3"
+- name: dataset_infer # name
+  batch_size: 2
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/test"
+  sparse_slots: "1 2"
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  optimizer:
+    class: Adam
+    learning_rate: 0.0001
+    strategy: async
+  query_encoder: "bow"
+  title_encoder: "bow"
+  query_encode_dim: 128
+  title_encode_dim: 128
+  sparse_feature_dim: 1000001
+  embedding_dim: 128
+  hidden_size: 128
+  margin: 0.1
+
+# select runner by name
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+  class: single_train
+  # num of epochs
+  epochs: 2
+  # device to run training or infer
+  device: cpu
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  save_inference_feed_varnames: [] # feed vars of save inference
+  save_inference_fetch_varnames: [] # fetch vars of save inference
+  init_model_path: "" # load model path
+  fetch_period: 1
+- name: infer_runner
+  class: single_infer
+  # num of epochs
+  epochs: 1
+  # device to run training or infer
+  device: cpu
+  fetch_period: 1
+  init_model_path: "increment/0" # load model path
+
+# runner will run all the phase in each epoch
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 1
+#- name: phase2
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  thread_num: 1
```
models/match/multiview-simnet/model.py

```diff
@@ -99,143 +99,89 @@ class SimpleEncoderFactory(object):
 class Model(ModelBase):
     def __init__(self, config):
         ModelBase.__init__(self, config)
-        self.init_config()
-
-    def init_config(self):
-        self._fetch_interval = 1
-        query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
-                                            None, self._namespace)
-        title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
-                                            None, self._namespace)
-        query_encode_dim = envs.get_global_env(
-            "hyper_parameters.query_encode_dim", None, self._namespace)
-        title_encode_dim = envs.get_global_env(
-            "hyper_parameters.title_encode_dim", None, self._namespace)
-        query_slots = envs.get_global_env("hyper_parameters.query_slots",
-                                          None, self._namespace)
-        title_slots = envs.get_global_env("hyper_parameters.title_slots",
-                                          None, self._namespace)
-        factory = SimpleEncoderFactory()
-        self.query_encoders = [
-            factory.create(query_encoder, query_encode_dim)
-            for i in range(query_slots)
-        ]
-        self.title_encoders = [
-            factory.create(title_encoder, title_encode_dim)
-            for i in range(title_slots)
-        ]
+
+    def _init_hyper_parameters(self):
+        self.query_encoder = envs.get_global_env(
+            "hyper_parameters.query_encoder")
+        self.title_encoder = envs.get_global_env(
+            "hyper_parameters.title_encoder")
+        self.query_encode_dim = envs.get_global_env(
+            "hyper_parameters.query_encode_dim")
+        self.title_encode_dim = envs.get_global_env(
+            "hyper_parameters.title_encode_dim")

         self.emb_size = envs.get_global_env(
-            "hyper_parameters.sparse_feature_dim", None, self._namespace)
+            "hyper_parameters.sparse_feature_dim")
         self.emb_dim = envs.get_global_env(
-            "hyper_parameters.embedding_dim", None, self._namespace)
+            "hyper_parameters.embedding_dim")
         self.emb_shape = [self.emb_size, self.emb_dim]
         self.hidden_size = envs.get_global_env(
-            "hyper_parameters.hidden_size", None, self._namespace)
-        self.margin = 0.1
-
-    def input(self, is_train=True):
-        self.q_slots = [
-            fluid.data(
-                name="%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
-            for i in range(len(self.query_encoders))
-        ]
-        self.pt_slots = [
-            fluid.data(
-                name="%d" % (i + len(self.query_encoders)),
-                shape=[None, 1],
-                lod_level=1,
-                dtype='int64') for i in range(len(self.title_encoders))
-        ]
-
-        if is_train == False:
-            return self.q_slots + self.pt_slots
-
-        self.nt_slots = [
-            fluid.data(
-                name="%d" %
-                (i + len(self.query_encoders) + len(self.title_encoders)),
-                shape=[None, 1],
-                lod_level=1,
-                dtype='int64') for i in range(len(self.title_encoders))
-        ]
-
-        return self.q_slots + self.pt_slots + self.nt_slots
-
-    def train_input(self):
-        res = self.input()
-        self._data_var = res
-
-        use_dataloader = envs.get_global_env(
-            "hyper_parameters.use_DataLoader", False, self._namespace)
-
-        if self._platform != "LINUX" or use_dataloader:
-            self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var,
-                capacity=256,
-                use_double_buffer=False,
-                iterable=False)
-
-    def get_acc(self, x, y):
-        less = tensor.cast(cf.less_than(x, y), dtype='float32')
-        label_ones = fluid.layers.fill_constant_batch_size_like(
-            input=x, dtype='float32', shape=[-1, 1], value=1.0)
-        correct = fluid.layers.reduce_sum(less)
-        total = fluid.layers.reduce_sum(label_ones)
-        acc = fluid.layers.elementwise_div(correct, total)
-        return acc
-
-    def net(self):
+            "hyper_parameters.hidden_size")
+        self.margin = envs.get_global_env("hyper_parameters.margin")
+
+    def net(self, input, is_infer=False):
+        factory = SimpleEncoderFactory()
+        self.q_slots = self._sparse_data_var[0:1]
+        self.query_encoders = [
+            factory.create(self.query_encoder, self.query_encode_dim)
+            for _ in self.q_slots
+        ]
         q_embs = [
             fluid.embedding(
                 input=query, size=self.emb_shape, param_attr="emb")
             for query in self.q_slots
         ]
-        pt_embs = [
-            fluid.embedding(
-                input=title, size=self.emb_shape, param_attr="emb")
-            for title in self.pt_slots
-        ]
-        nt_embs = [
-            fluid.embedding(
-                input=title, size=self.emb_shape, param_attr="emb")
-            for title in self.nt_slots
-        ]
         # encode each embedding field with encoder
         q_encodes = [
             self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
         ]
-        pt_encodes = [
-            self.title_encoders[i].forward(emb)
-            for i, emb in enumerate(pt_embs)
-        ]
-        nt_encodes = [
-            self.title_encoders[i].forward(emb)
-            for i, emb in enumerate(nt_embs)
-        ]
         # concat multi view for query, pos_title, neg_title
         q_concat = fluid.layers.concat(q_encodes)
-        pt_concat = fluid.layers.concat(pt_encodes)
-        nt_concat = fluid.layers.concat(nt_encodes)
         # projection of hidden layer
         q_hid = fluid.layers.fc(q_concat,
                                 size=self.hidden_size,
                                 param_attr='q_fc.w',
                                 bias_attr='q_fc.b')
+
+        self.pt_slots = self._sparse_data_var[1:2]
+        self.title_encoders = [
+            factory.create(self.title_encoder, self.title_encode_dim)
+        ]
+        pt_embs = [
+            fluid.embedding(
+                input=title, size=self.emb_shape, param_attr="emb")
+            for title in self.pt_slots
+        ]
+        pt_encodes = [
+            self.title_encoders[i].forward(emb)
+            for i, emb in enumerate(pt_embs)
+        ]
+        pt_concat = fluid.layers.concat(pt_encodes)
         pt_hid = fluid.layers.fc(pt_concat,
                                  size=self.hidden_size,
                                  param_attr='t_fc.w',
                                  bias_attr='t_fc.b')
+        # cosine of hidden layers
+        cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
+
+        if is_infer:
+            self._infer_results['query_pt_sim'] = cos_pos
+            return
+
+        self.nt_slots = self._sparse_data_var[2:3]
+        nt_embs = [
+            fluid.embedding(
+                input=title, size=self.emb_shape, param_attr="emb")
+            for title in self.nt_slots
+        ]
+        nt_encodes = [
+            self.title_encoders[i].forward(emb)
+            for i, emb in enumerate(nt_embs)
+        ]
+        nt_concat = fluid.layers.concat(nt_encodes)
         nt_hid = fluid.layers.fc(nt_concat,
                                  size=self.hidden_size,
                                  param_attr='t_fc.w',
                                  bias_attr='t_fc.b')
-        # cosine of hidden layers
-        cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
         cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)

         # pairwise hinge_loss
@@ -254,72 +200,16 @@ class Model(ModelBase):
                 input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
             loss_part2)

-        self.avg_cost = fluid.layers.mean(loss_part3)
+        self._cost = fluid.layers.mean(loss_part3)
         self.acc = self.get_acc(cos_neg, cos_pos)
-
-    def avg_loss(self):
-        self._cost = self.avg_cost
-
-    def metrics(self):
-        self._metrics["loss"] = self.avg_cost
+        self._metrics["loss"] = self._cost
         self._metrics["acc"] = self.acc

-    def train_net(self):
-        self.train_input()
-        self.net()
-        self.avg_loss()
-        self.metrics()
-
-    def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
-        return optimizer
-
-    def infer_input(self):
-        res = self.input(is_train=False)
-        self._infer_data_var = res
-
-        self._infer_data_loader = fluid.io.DataLoader.from_generator(
-            feed_list=self._infer_data_var,
-            capacity=64,
-            use_double_buffer=False,
-            iterable=False)
-
-    def infer_net(self):
-        self.infer_input()
-        # lookup embedding for each slot
-        q_embs = [
-            fluid.embedding(
-                input=query, size=self.emb_shape, param_attr="emb")
-            for query in self.q_slots
-        ]
-        pt_embs = [
-            fluid.embedding(
-                input=title, size=self.emb_shape, param_attr="emb")
-            for title in self.pt_slots
-        ]
-        # encode each embedding field with encoder
-        q_encodes = [
-            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
-        ]
-        pt_encodes = [
-            self.title_encoders[i].forward(emb)
-            for i, emb in enumerate(pt_embs)
-        ]
-        # concat multi view for query, pos_title, neg_title
-        q_concat = fluid.layers.concat(q_encodes)
-        pt_concat = fluid.layers.concat(pt_encodes)
-        # projection of hidden layer
-        q_hid = fluid.layers.fc(q_concat,
-                                size=self.hidden_size,
-                                param_attr='q_fc.w',
-                                bias_attr='q_fc.b')
-        pt_hid = fluid.layers.fc(pt_concat,
-                                 size=self.hidden_size,
-                                 param_attr='t_fc.w',
-                                 bias_attr='t_fc.b')
-        # cosine of hidden layers
-        cos = fluid.layers.cos_sim(q_hid, pt_hid)
-        self._infer_results['query_pt_sim'] = cos
+    def get_acc(self, x, y):
+        less = tensor.cast(cf.less_than(x, y), dtype='float32')
+        label_ones = fluid.layers.fill_constant_batch_size_like(
+            input=x, dtype='float32', shape=[-1, 1], value=1.0)
+        correct = fluid.layers.reduce_sum(less)
+        total = fluid.layers.reduce_sum(label_ones)
+        acc = fluid.layers.elementwise_div(correct, total)
+        return acc
```
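The multiview-simnet model drops its hand-built `fluid.data` inputs entirely: slots now arrive through `self._sparse_data_var`, which the framework populates from the dataset's `sparse_slots` setting. A reading aid for the new slot layout (not part of the commit):

```python
# Slot layout after this commit, per the config.yaml and model.py diffs:
#   self._sparse_data_var[0:1] -> query slot      (slot "1" in sparse_slots)
#   self._sparse_data_var[1:2] -> positive title  (slot "2")
#   self._sparse_data_var[2:3] -> negative title  (slot "3", train only)
# The variables come from the dataset config
#   sparse_slots: "1 2 3"   # train
#   sparse_slots: "1 2"     # infer
# so net() now builds its encoders lazily instead of in __init__.
```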
models/match/readme.md

````diff
@@ -31,9 +31,21 @@
 <img align="center" src="../../doc/imgs/multiview-simnet.png">
 <p>

-## Tutorial
-### Training & Prediction
+## Tutorial (Quick Start)
+### Training
+```shell
+python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
+python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
+```
+### Prediction
+```shell
+# In the model's config.yaml, set workspace to the absolute path of the current directory
+# In the model's config.yaml, set mode to infer_runner
+# Example: mode: train_runner -> mode: infer_runner
+# In infer_runner, set class to: single_infer
+# Switch the phase section to the infer configuration; see the comments in config
+# After editing config.yaml, run:
+python -m paddlerec.run -m ./config.yaml # dssm as an example
+```
````
models/recall/gnn/config.yaml

```diff
@@ -11,46 +11,71 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-evaluate:
-  workspace: "paddlerec.models.recall.gnn"
-  reader:
-    batch_size: 50
-    class: "{workspace}/evaluate_reader.py"
-    test_data_path: "{workspace}/data/test"
-
-train:
-  trainer:
-    # for cluster training
-    strategy: "async"
-  epochs: 2
-  workspace: "paddlerec.models.recall.gnn"
-  reader:
-    batch_size: 100
-    class: "{workspace}/reader.py"
-    train_data_path: "{workspace}/data/train"
-    dataset_class: "DataLoader"
-  model:
-    models: "{workspace}/model.py"
-    hyper_parameters:
-      use_DataLoader: True
-      config_path: "{workspace}/data/config.txt"
-      sparse_feature_dim: 100
-      gnn_propogation_steps: 1
-      learning_rate: 0.001
-      l2: 0.00001
-      decay_steps: 3
-      decay_rate: 0.1
-      optimizer: adam
-  save:
-    increment:
-      dirname: "increment"
-      epoch_interval: 1
-      save_last: True
-    inference:
-      dirname: "inference"
-      epoch_interval: 1
-      save_last: True
+# workspace
+workspace: "paddlerec.models.recall.gnn"
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 100
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/train"
+  data_converter: "{workspace}/reader.py"
+- name: dataset_infer # name
+  batch_size: 50
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/test"
+  data_converter: "{workspace}/evaluate_reader.py"
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    decay_steps: 3
+    decay_rate: 0.1
+    l2: 0.00001
+  sparse_feature_number: 43098
+  sparse_feature_dim: 100
+  corpus_size: 719470
+  gnn_propogation_steps: 1
+
+# select runner by name
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+  class: single_train
+  # num of epochs
+  epochs: 2
+  # device to run training or infer
+  device: cpu
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  save_inference_feed_varnames: [] # feed vars of save inference
+  save_inference_fetch_varnames: [] # fetch vars of save inference
+  init_model_path: "" # load model path
+  fetch_period: 10
+- name: infer_runner
+  class: single_infer
+  # num of epochs
+  epochs: 1
+  # device to run training or infer
+  device: cpu
+  fetch_period: 1
+  init_model_path: "increment/0" # load model path
+
+# runner will run all the phase in each epoch
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 1
+#- name: phase2
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  thread_num: 1
```
models/recall/gnn/raw_data/convert_data.py → models/recall/gnn/data/convert_data.py (file moved)

models/recall/gnn/raw_data/download.py → models/recall/gnn/data/download.py (file moved)

models/recall/gnn/raw_data/preprocess.py → models/recall/gnn/data/preprocess.py (file moved)
models/recall/gnn/data_process.sh → models/recall/gnn/data_prepare.sh

```diff
@@ -17,7 +17,7 @@
 set -e
 echo "begin to download data"
-cd raw_data && python download.py
+cd data && python download.py
 mkdir diginetica
 python preprocess.py --dataset diginetica
@@ -26,8 +26,10 @@ python convert_data.py --data_dir diginetica
 cat diginetica/train.txt | wc -l >> diginetica/config.txt
-mkdir train_data
-mv diginetica/train.txt train_data
+rm -rf train && mkdir train
+mv diginetica/train.txt train
-mkdir test_data
-mv diginetica/test.txt test_data
+rm -rf test && mkdir test
+mv diginetica/test.txt test
 mv diginetica/config.txt ./config.txt
```
models/recall/gnn/evaluate_reader.py

```diff
@@ -21,10 +21,10 @@ from paddlerec.core.reader import Reader
 from paddlerec.core.utils import envs


-class EvaluateReader(Reader):
+class TrainReader(Reader):
     def init(self):
-        self.batch_size = envs.get_global_env("batch_size", None,
-                                              "evaluate.reader")
+        self.batch_size = envs.get_global_env(
+            "dataset.dataset_infer.batch_size")

         self.input = []
         self.length = None
```
models/recall/gnn/model.py

```diff
@@ -25,74 +25,65 @@ from paddlerec.core.model import Model as ModelBase
 class Model(ModelBase):
     def __init__(self, config):
         ModelBase.__init__(self, config)
-        self.init_config()
-
-    def init_config(self):
-        self._fetch_interval = 1
-        self.items_num, self.ins_num = self.config_read(
-            envs.get_global_env("hyper_parameters.config_path", None,
-                                self._namespace))
-        self.train_batch_size = envs.get_global_env("batch_size", None,
-                                                    "train.reader")
-        self.evaluate_batch_size = envs.get_global_env("batch_size", None,
-                                                       "evaluate.reader")
-        self.hidden_size = envs.get_global_env(
-            "hyper_parameters.sparse_feature_dim", None, self._namespace)
-        self.step = envs.get_global_env(
-            "hyper_parameters.gnn_propogation_steps", None, self._namespace)
-
-    def config_read(self, config_path=None):
-        if config_path is None:
-            raise ValueError(
-                "please set train.model.hyper_parameters.config_path at first")
-        with open(config_path, "r") as fin:
-            item_nums = int(fin.readline().strip())
-            ins_nums = int(fin.readline().strip())
-        return item_nums, ins_nums
-
-    def input(self, bs):
-        self.items = fluid.data(
+
+    def _init_hyper_parameters(self):
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.decay_steps = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_steps")
+        self.decay_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_rate")
+        self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
+
+        self.dict_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.corpus_size = envs.get_global_env("hyper_parameters.corpus_size")
+
+        self.train_batch_size = envs.get_global_env(
+            "dataset.dataset_train.batch_size")
+        self.evaluate_batch_size = envs.get_global_env(
+            "dataset.dataset_infer.batch_size")
+
+        self.hidden_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.step = envs.get_global_env(
+            "hyper_parameters.gnn_propogation_steps")
+
+    def input_data(self, is_infer=False, **kwargs):
+        if is_infer:
+            bs = self.evaluate_batch_size
+        else:
+            bs = self.train_batch_size
+        items = fluid.data(
             name="items", shape=[bs, -1],
             dtype="int64")  # [batch_size, uniq_max]
-        self.seq_index = fluid.data(
+        seq_index = fluid.data(
             name="seq_index", shape=[bs, -1, 2],
             dtype="int32")  # [batch_size, seq_max, 2]
-        self.last_index = fluid.data(
+        last_index = fluid.data(
             name="last_index", shape=[bs, 2], dtype="int32")  # [batch_size, 2]
-        self.adj_in = fluid.data(
+        adj_in = fluid.data(
             name="adj_in", shape=[bs, -1, -1],
             dtype="float32")  # [batch_size, seq_max, seq_max]
-        self.adj_out = fluid.data(
+        adj_out = fluid.data(
             name="adj_out", shape=[bs, -1, -1],
             dtype="float32")  # [batch_size, seq_max, seq_max]
-        self.mask = fluid.data(
+        mask = fluid.data(
             name="mask", shape=[bs, -1, 1],
             dtype="float32")  # [batch_size, seq_max, 1]
-        self.label = fluid.data(
+        label = fluid.data(
             name="label", shape=[bs, 1], dtype="int64")  # [batch_size, 1]
-        res = [
-            self.items, self.seq_index, self.last_index, self.adj_in,
-            self.adj_out, self.mask, self.label
-        ]
+        res = [items, seq_index, last_index, adj_in, adj_out, mask, label]
         return res

-    def train_input(self):
-        res = self.input(self.train_batch_size)
-        self._data_var = res
-
-        use_dataloader = envs.get_global_env(
-            "hyper_parameters.use_DataLoader", False, self._namespace)
-
-        if self._platform != "LINUX" or use_dataloader:
-            self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var,
-                capacity=256,
-                use_double_buffer=False,
-                iterable=False)
-
-    def net(self, items_num, hidden_size, step, bs):
-        stdv = 1.0 / math.sqrt(hidden_size)
+    def net(self, inputs, is_infer=False):
+        if is_infer:
+            bs = self.evaluate_batch_size
+        else:
+            bs = self.train_batch_size
+
+        stdv = 1.0 / math.sqrt(self.hidden_size)

         def embedding_layer(input,
                             table_name,
@@ -100,22 +91,22 @@ class Model(ModelBase):
                             initializer_instance=None):
             emb = fluid.embedding(
                 input=input,
-                size=[items_num, emb_dim],
+                size=[self.dict_size, emb_dim],
                 param_attr=fluid.ParamAttr(
-                    name=table_name, initializer=initializer_instance), )
+                    name=table_name, initializer=initializer_instance))
             return emb

         sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
-        items_emb = embedding_layer(self.items, "emb", hidden_size,
+        items_emb = embedding_layer(inputs[0], "emb", self.hidden_size,
                                     sparse_initializer)
         pre_state = items_emb
-        for i in range(step):
+        for i in range(self.step):
             pre_state = layers.reshape(
-                x=pre_state, shape=[bs, -1, hidden_size])
+                x=pre_state, shape=[bs, -1, self.hidden_size])
             state_in = layers.fc(
                 input=pre_state,
                 name="state_in",
-                size=hidden_size,
+                size=self.hidden_size,
                 act=None,
                 num_flatten_dims=2,
                 param_attr=fluid.ParamAttr(
@@ -127,7 +118,7 @@ class Model(ModelBase):
             state_out = layers.fc(
                 input=pre_state,
                 name="state_out",
-                size=hidden_size,
+                size=self.hidden_size,
                 act=None,
                 num_flatten_dims=2,
                 param_attr=fluid.ParamAttr(
@@ -137,33 +128,34 @@ class Model(ModelBase):
                     initializer=fluid.initializer.Uniform(
                         low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

-            state_adj_in = layers.matmul(self.adj_in,
+            state_adj_in = layers.matmul(inputs[3],
                                          state_in)  # [batch_size, uniq_max, h]
-            state_adj_out = layers.matmul(
-                self.adj_out, state_out)  # [batch_size, uniq_max, h]
+            state_adj_out = layers.matmul(
+                inputs[4], state_out)  # [batch_size, uniq_max, h]

             gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

             gru_input = layers.reshape(
-                x=gru_input, shape=[-1, hidden_size * 2])
+                x=gru_input, shape=[-1, self.hidden_size * 2])
             gru_fc = layers.fc(input=gru_input,
                                name="gru_fc",
-                               size=3 * hidden_size,
+                               size=3 * self.hidden_size,
                                bias_attr=False)
             pre_state, _, _ = fluid.layers.gru_unit(
                 input=gru_fc,
                 hidden=layers.reshape(
-                    x=pre_state, shape=[-1, hidden_size]),
-                size=3 * hidden_size)
+                    x=pre_state, shape=[-1, self.hidden_size]),
+                size=3 * self.hidden_size)

-        final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
-        seq = layers.gather_nd(final_state, self.seq_index)
-        last = layers.gather_nd(final_state, self.last_index)
+        final_state = layers.reshape(
+            pre_state, shape=[bs, -1, self.hidden_size])
+        seq = layers.gather_nd(final_state, inputs[1])
+        last = layers.gather_nd(final_state, inputs[2])

         seq_fc = layers.fc(
             input=seq,
             name="seq_fc",
-            size=hidden_size,
+            size=self.hidden_size,
             bias_attr=False,
             act=None,
             num_flatten_dims=2,
@@ -171,7 +163,7 @@ class Model(ModelBase):
                 low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
         last_fc = layers.fc(
             input=last,
             name="last_fc",
-            size=hidden_size,
+            size=self.hidden_size,
             bias_attr=False,
             act=None,
             num_flatten_dims=1,
@@ -184,7 +176,7 @@ class Model(ModelBase):
         add = layers.elementwise_add(seq_fc_t,
                                      last_fc)  # [seq_max, batch_size, h]
         b = layers.create_parameter(
-            shape=[hidden_size],
+            shape=[self.hidden_size],
             dtype='float32',
             default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
         add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]
@@ -202,7 +194,7 @@ class Model(ModelBase):
             bias_attr=False,
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Uniform(
                     low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
-        weight *= self.mask
+        weight *= inputs[5]
         weight_mask = layers.elementwise_mul(
             seq, weight, axis=0)  # [batch_size, seq_max, h]
         global_attention = layers.reduce_sum(
@@ -213,7 +205,7 @@ class Model(ModelBase):
         final_attention_fc = layers.fc(
             input=final_attention,
             name="final_attention_fc",
-            size=hidden_size,
+            size=self.hidden_size,
             bias_attr=False,
             act=None,
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Uniform(
@@ -225,7 +217,7 @@ class Model(ModelBase):
         #     dtype="int64",
         #     persistable=True,
         #     name="all_vocab")
-        all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
+        all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32')
         all_vocab = fluid.layers.cast(
             x=fluid.layers.assign(all_vocab), dtype='int64')
@@ -235,63 +227,32 @@ class Model(ModelBase):
                 name="emb",
                 initializer=fluid.initializer.Uniform(
                     low=-stdv, high=stdv)),
-            size=[items_num, hidden_size])  # [all_vocab, h]
+            size=[self.dict_size, self.hidden_size])  # [all_vocab, h]

         logits = layers.matmul(
             x=final_attention_fc, y=all_emb,
             transpose_y=True)  # [batch_size, all_vocab]
         softmax = layers.softmax_with_cross_entropy(
-            logits=logits, label=self.label)  # [batch_size, 1]
+            logits=logits, label=inputs[6])  # [batch_size, 1]
         self.loss = layers.reduce_mean(softmax)  # [1]
-        self.acc = layers.accuracy(input=logits, label=self.label, k=20)
+        self.acc = layers.accuracy(input=logits, label=inputs[6], k=20)

-    def avg_loss(self):
-        self._cost = self.loss
+        self._cost = self.loss
+        if is_infer:
+            self._infer_results['acc'] = self.acc
+            self._infer_results['loss'] = self.loss
+            return

-    def metrics(self):
         self._metrics["LOSS"] = self.loss
         self._metrics["train_acc"] = self.acc

-    def train_net(self):
-        self.train_input()
-        self.net(self.items_num, self.hidden_size, self.step,
-                 self.train_batch_size)
-        self.avg_loss()
-        self.metrics()
-
     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        step_per_epoch = self.ins_num // self.train_batch_size
-        decay_steps = envs.get_global_env("hyper_parameters.decay_steps",
-                                          None, self._namespace)
-        decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
-                                         self._namespace)
-        l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
+        step_per_epoch = self.corpus_size // self.train_batch_size
         optimizer = fluid.optimizer.Adam(
             learning_rate=fluid.layers.exponential_decay(
-                learning_rate=learning_rate,
-                decay_steps=decay_steps * step_per_epoch,
-                decay_rate=decay_rate),
+                learning_rate=self.learning_rate,
+                decay_steps=self.decay_steps * step_per_epoch,
+                decay_rate=self.decay_rate),
             regularization=fluid.regularizer.L2DecayRegularizer(
-                regularization_coeff=l2))
+                regularization_coeff=self.l2))
         return optimizer
-
-    def infer_input(self):
-        self._reader_namespace = "evaluate.reader"
-        res = self.input(self.evaluate_batch_size)
-        self._infer_data_var = res
-
-        self._infer_data_loader = fluid.io.DataLoader.from_generator(
-            feed_list=self._infer_data_var,
-            capacity=64,
-            use_double_buffer=False,
-            iterable=False)
-
-    def infer_net(self):
-        self.infer_input()
-        self.net(self.items_num, self.hidden_size, self.step,
-                 self.evaluate_batch_size)
-        self._infer_results['acc'] = self.acc
-        self._infer_results['loss'] = self.loss
```
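For GNN the positional contract between `input_data()` and `net(inputs, ...)` is the longest in this commit; a mapping distilled from the diff as a reading aid (not part of the commit):

```python
# inputs as returned by Model.input_data(), per the diff above:
#   inputs[0] -> items       [bs, uniq_max]          int64
#   inputs[1] -> seq_index   [bs, seq_max, 2]        int32
#   inputs[2] -> last_index  [bs, 2]                 int32
#   inputs[3] -> adj_in      [bs, seq_max, seq_max]  float32
#   inputs[4] -> adj_out     [bs, seq_max, seq_max]  float32
#   inputs[5] -> mask        [bs, seq_max, 1]        float32
#   inputs[6] -> label       [bs, 1]                 int64
```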
models/recall/gnn/reader.py

```diff
@@ -23,9 +23,8 @@ from paddlerec.core.utils import envs

 class TrainReader(Reader):
     def init(self):
-        self.batch_size = envs.get_global_env("batch_size", None,
-                                              "train.reader")
+        self.batch_size = envs.get_global_env(
+            "dataset.dataset_train.batch_size")

         self.input = []
         self.length = None
```
models/recall/readme.md

````diff
@@ -57,8 +57,8 @@
 <img align="center" src="../../doc/imgs/gnn.png">
 <p>

-## Tutorial
-### Training & Prediction
+## Tutorial (Quick Start)
+###
 ```shell
 python -m paddlerec.run -m paddlerec.models.recall.word2vec # word2vec
 python -m paddlerec.run -m paddlerec.models.recall.ssr # ssr
@@ -67,6 +67,40 @@ python -m paddlerec.run -m paddlerec.models.recall.gnn # gnn
 python -m paddlerec.run -m paddlerec.models.recall.ncf # ncf
 python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
 ```
+
+## Tutorial (reproducing the paper results)
+To let users run every model quickly, each model directory ships sample data, and hyperparameters such as batch_size were tuned so the training & test logs read well on that sample data. To reproduce the results in this readme, adjust batch_size and the other hyperparameters per the table below, and use the provided scripts to download and preprocess the corresponding dataset.
+
+| Model | batch_size | thread_num | epoch_num |
+| :---: | :---: | :---: | :---: |
+| Word2Vec | 100 | 5 | 5 |
+| GNN | 100 | 1 | 30 |
+| GRU4REC | 500 | 1 | 10 |
+
+### Data preprocessing
+See the data download & preprocessing script in each model's directory.
+```bash
+sh data_prepare.sh
+```
+
+### Training
+```bash
+cd modles/recall/gnn # enter the chosen recall model's directory, gnn as an example
+python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, point at the config file to use the custom configuration
+```
+
+### Prediction
+```
+# In the model's config.yaml, set workspace to the absolute path of the current directory
+# In the model's config.yaml, set mode to infer_runner
+# Example: mode: train_runner -> mode: infer_runner
+# In infer_runner, set class to: single_infer
+# Switch the phase section to the infer configuration; see the comments in config
+# After editing config.yaml, run:
+python -m paddlerec.run -m ./config.yaml # gnn as an example
+```

## Benchmarks
### Model results
````
models/recall/word2vec/config.yaml

```diff
@@ -11,51 +11,70 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-evaluate:
-  workspace: "paddlerec.models.recall.word2vec"
-  reader:
-    batch_size: 50
-    class: "{workspace}/w2v_evaluate_reader.py"
-    test_data_path: "{workspace}/data/test"
-    word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
-
-train:
-  trainer:
-    # for cluster training
-    strategy: "async"
-  epochs: 2
-  workspace: "paddlerec.models.recall.word2vec"
-  reader:
-    batch_size: 100
-    class: "{workspace}/w2v_reader.py"
-    train_data_path: "{workspace}/data/train"
-    word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
-  model:
-    models: "{workspace}/model.py"
-    hyper_parameters:
-      sparse_feature_number: 85
-      sparse_feature_dim: 300
-      with_shuffle_batch: False
-      neg_num: 5
-      window_size: 5
-      learning_rate: 1.0
-      decay_steps: 100000
-      decay_rate: 0.999
-      optimizer: sgd
-  save:
-    increment:
-      dirname: "increment"
-      epoch_interval: 1
-      save_last: True
-    inference:
-      dirname: "inference"
-      epoch_interval: 1
-      save_last: True
+workspace: "paddlerec.models.recall.word2vec"
+
+evaluate_only: False
+evaluate_model_path: ""
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 100
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/train"
+  word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
+  data_converter: "{workspace}/w2v_reader.py"
+- name: dataset_infer # name
+  batch_size: 50
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/test"
+  word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
+  data_converter: "{workspace}/w2v_evaluate_reader.py"
+
+hyper_parameters:
+  optimizer:
+    learning_rate: 1.0
+    decay_steps: 100000
+    decay_rate: 0.999
+    class: sgd
+    strategy: async
+  sparse_feature_number: 354051
+  sparse_feature_dim: 300
+  with_shuffle_batch: False
+  neg_num: 5
+  window_size: 5
+
+# select runner by name
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+  class: single_train
+  # num of epochs
+  epochs: 2
+  # device to run training or infer
+  device: cpu
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  save_inference_feed_varnames: [] # feed vars of save inference
+  save_inference_fetch_varnames: [] # fetch vars of save inference
+  init_model_path: "" # load model path
+  fetch_period: 10
+- name: infer_runner
+  class: single_infer
+  # num of epochs
+  epochs: 1
+  # device to run training or infer
+  device: cpu
+  init_model_path: "increment/0" # load model path
+
+# runner will run all the phase in each epoch
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 1
+#- name: phase2
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  thread_num: 1
```
models/recall/word2vec/prepare_data.sh → models/recall/word2vec/data_prepare.sh

```diff
@@ -22,16 +22,17 @@ tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
 mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/

 # preprocess data
-python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/test_build_dict
-python preprocess.py --filter_corpus --dict_path raw_data/test_build_dict --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
-mkdir thirdparty
-mv raw_data/test_build_dict thirdparty/
-mv raw_data/test_build_dict_word_to_id_ thirdparty/
+python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt
+python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
+mv raw_data/word_count_dict.txt data/dict/
+mv raw_data/word_id_dict.txt data/dict/

-python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=train_data
+rm -rf data/train/*
+rm -rf data/test/*
+python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train

 # download test data
 wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
 tar xzvf test_dir.tar -C raw_data
-mv raw_data/data/test_dir test_data/
+mv raw_data/data/test_dir/* data/test/
 rm -rf raw_data
```
models/recall/word2vec/model.py

```diff
@@ -23,45 +23,50 @@ class Model(ModelBase):
     def __init__(self, config):
         ModelBase.__init__(self, config)

-    def input(self):
-        neg_num = int(
-            envs.get_global_env("hyper_parameters.neg_num", None,
-                                self._namespace))
-        self.input_word = fluid.data(
+    def _init_hyper_parameters(self):
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
+        self.with_shuffle_batch = envs.get_global_env(
+            "hyper_parameters.with_shuffle_batch")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.decay_steps = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_steps")
+        self.decay_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_rate")
+
+    def input_data(self, is_infer=False, **kwargs):
+        if is_infer:
+            analogy_a = fluid.data(
+                name="analogy_a", shape=[None], dtype='int64')
+            analogy_b = fluid.data(
+                name="analogy_b", shape=[None], dtype='int64')
+            analogy_c = fluid.data(
+                name="analogy_c", shape=[None], dtype='int64')
+            analogy_d = fluid.data(
+                name="analogy_d", shape=[None], dtype='int64')
+            return [analogy_a, analogy_b, analogy_c, analogy_d]
+
+        input_word = fluid.data(
             name="input_word", shape=[None, 1], dtype='int64')
-        self.true_word = fluid.data(
+        true_word = fluid.data(
             name='true_label', shape=[None, 1], dtype='int64')
-        self._data_var.append(self.input_word)
-        self._data_var.append(self.true_word)
-        with_shuffle_batch = bool(
-            int(
-                envs.get_global_env("hyper_parameters.with_shuffle_batch",
-                                    None, self._namespace)))
-        if not with_shuffle_batch:
-            self.neg_word = fluid.data(
-                name="neg_label", shape=[None, neg_num], dtype='int64')
-            self._data_var.append(self.neg_word)
+        if self.with_shuffle_batch:
+            return [input_word, true_word]

-        if self._platform != "LINUX":
-            self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var,
-                capacity=64,
-                use_double_buffer=False,
-                iterable=False)
+        neg_word = fluid.data(
+            name="neg_label", shape=[None, self.neg_num], dtype='int64')
+        return [input_word, true_word, neg_word]

-    def net(self):
-        is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
-        neg_num = int(
-            envs.get_global_env("hyper_parameters.neg_num", None,
-                                self._namespace))
-        sparse_feature_number = envs.get_global_env(
-            "hyper_parameters.sparse_feature_number", None, self._namespace)
-        sparse_feature_dim = envs.get_global_env(
-            "hyper_parameters.sparse_feature_dim", None, self._namespace)
-        with_shuffle_batch = bool(
-            int(
-                envs.get_global_env("hyper_parameters.with_shuffle_batch",
-                                    None, self._namespace)))
+    def net(self, inputs, is_infer=False):
+        if is_infer:
+            self.infer_net(inputs)
+            return

         def embedding_layer(input,
                             table_name,
@@ -71,8 +76,8 @@ class Model(ModelBase):
             emb = fluid.embedding(
                 input=input,
                 is_sparse=True,
-                is_distributed=is_distributed,
-                size=[sparse_feature_number, emb_dim],
+                is_distributed=self.is_distributed,
+                size=[self.sparse_feature_number, emb_dim],
                 param_attr=fluid.ParamAttr(
                     name=table_name, initializer=initializer_instance), )
             if squeeze:
@@ -80,44 +85,44 @@ class Model(ModelBase):
             else:
                 return emb

-        init_width = 0.5 / sparse_feature_dim
+        init_width = 0.5 / self.sparse_feature_dim
         emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
         emb_w_initializer = fluid.initializer.Constant(value=0.0)

-        input_emb = embedding_layer(self.input_word, "emb",
-                                    sparse_feature_dim, emb_initializer, True)
-        true_emb_w = embedding_layer(self.true_word, "emb_w",
-                                     sparse_feature_dim, emb_w_initializer,
-                                     True)
-        true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
-                                     emb_w_initializer, True)
+        input_emb = embedding_layer(inputs[0], "emb",
+                                    self.sparse_feature_dim, emb_initializer,
+                                    True)
+        true_emb_w = embedding_layer(inputs[1], "emb_w",
+                                     self.sparse_feature_dim,
+                                     emb_w_initializer, True)
+        true_emb_b = embedding_layer(inputs[1], "emb_b", 1,
+                                     emb_w_initializer, True)

-        if with_shuffle_batch:
+        if self.with_shuffle_batch:
             neg_emb_w_list = []
-            for i in range(neg_num):
+            for i in range(self.neg_num):
                 neg_emb_w_list.append(
                     fluid.contrib.layers.shuffle_batch(
                         true_emb_w))  # shuffle true_word
             neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
             neg_emb_w = fluid.layers.reshape(
-                neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
+                neg_emb_w_concat,
+                shape=[-1, self.neg_num, self.sparse_feature_dim])
             neg_emb_b_list = []
-            for i in range(neg_num):
+            for i in range(self.neg_num):
                 neg_emb_b_list.append(
                     fluid.contrib.layers.shuffle_batch(
                         true_emb_b))  # shuffle true_word
             neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
             neg_emb_b_vec = fluid.layers.reshape(
-                neg_emb_b, shape=[-1, neg_num])
+                neg_emb_b, shape=[-1, self.neg_num])
         else:
-            neg_emb_w = embedding_layer(self.neg_word, "emb_w",
-                                        sparse_feature_dim,
-                                        emb_w_initializer)
-            neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
-                                        emb_w_initializer)
+            neg_emb_w = embedding_layer(inputs[2], "emb_w",
+                                        self.sparse_feature_dim,
+                                        emb_w_initializer)
+            neg_emb_b = embedding_layer(inputs[2], "emb_b", 1,
+                                        emb_w_initializer)
             neg_emb_b_vec = fluid.layers.reshape(
-                neg_emb_b, shape=[-1, neg_num])
+                neg_emb_b, shape=[-1, self.neg_num])

         true_logits = fluid.layers.elementwise_add(
             fluid.layers.reduce_sum(
@@ -127,18 +132,22 @@ class Model(ModelBase):
             true_emb_b)

         input_emb_re = fluid.layers.reshape(
-            input_emb, shape=[-1, 1, sparse_feature_dim])
+            input_emb, shape=[-1, 1, self.sparse_feature_dim])
         neg_matmul = fluid.layers.matmul(
             input_emb_re, neg_emb_w, transpose_y=True)
-        neg_logits = fluid.layers.elementwise_add(
-            fluid.layers.reshape(
-                neg_matmul, shape=[-1, neg_num]),
-            neg_emb_b_vec)
-
-        label_ones = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, 1], value=1.0, dtype='float32')
-        label_zeros = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
+        neg_matmul_re = fluid.layers.reshape(
+            neg_matmul, shape=[-1, self.neg_num])
+        neg_logits = fluid.layers.elementwise_add(neg_matmul_re,
+                                                  neg_emb_b_vec)
+        # nce loss
+        label_ones = fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(true_logits)[0], 1],
+            value=1.0,
+            dtype='float32')
+        label_zeros = fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(true_logits)[0], self.neg_num],
+            value=0.0,
+            dtype='float32')

         true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
                                                                    label_ones)
@@ -149,7 +158,9 @@ class Model(ModelBase):
                 true_xent, dim=1),
             fluid.layers.reduce_sum(
                 neg_xent, dim=1))
-        self.avg_cost = fluid.layers.reduce_mean(cost)
+        avg_cost = fluid.layers.reduce_mean(cost)
+        self._cost = avg_cost
+
         global_right_cnt = fluid.layers.create_global_var(
             name="global_right_cnt",
             persistable=True,
@@ -164,77 +175,33 @@ class Model(ModelBase):
             value=0)
         global_right_cnt.stop_gradient = True
         global_total_cnt.stop_gradient = True

-    def avg_loss(self):
-        self._cost = self.avg_cost
-
-    def metrics(self):
-        self._metrics["LOSS"] = self.avg_cost
-
-    def train_net(self):
-        self.input()
-        self.net()
-        self.avg_loss()
-        self.metrics()
+        self._metrics["LOSS"] = avg_cost

     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        decay_steps = envs.get_global_env("hyper_parameters.decay_steps",
-                                          None, self._namespace)
-        decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
-                                         self._namespace)
         optimizer = fluid.optimizer.SGD(
             learning_rate=fluid.layers.exponential_decay(
-                learning_rate=learning_rate,
-                decay_steps=decay_steps,
-                decay_rate=decay_rate,
+                learning_rate=self.learning_rate,
+                decay_steps=self.decay_steps,
+                decay_rate=self.decay_rate,
                 staircase=True))
         return optimizer

-    def analogy_input(self):
-        sparse_feature_number = envs.get_global_env(
-            "hyper_parameters.sparse_feature_number", None, self._namespace)
-        self.analogy_a = fluid.data(
-            name="analogy_a", shape=[None], dtype='int64')
-        self.analogy_b = fluid.data(
-            name="analogy_b", shape=[None], dtype='int64')
-        self.analogy_c = fluid.data(
-            name="analogy_c", shape=[None], dtype='int64')
-        self.analogy_d = fluid.data(
-            name="analogy_d", shape=[None], dtype='int64')
-        self._infer_data_var = [
-            self.analogy_a, self.analogy_b, self.analogy_c, self.analogy_d
-        ]
-
-        self._infer_data_loader = fluid.io.DataLoader.from_generator(
-            feed_list=self._infer_data_var,
-            capacity=64,
-            use_double_buffer=False,
-            iterable=False)
-
-    def infer_net(self):
-        sparse_feature_dim = envs.get_global_env(
-            "hyper_parameters.sparse_feature_dim", None, self._namespace)
-        sparse_feature_number = envs.get_global_env(
-            "hyper_parameters.sparse_feature_number", None, self._namespace)
-
+    def infer_net(self, inputs):
         def embedding_layer(input, table_name, initializer_instance=None):
             emb = fluid.embedding(
                 input=input,
-                size=[sparse_feature_number, sparse_feature_dim],
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
                 param_attr=table_name)
             return emb

-        self.analogy_input()
-        all_label = np.arange(sparse_feature_number).reshape(
-            sparse_feature_number).astype('int32')
+        all_label = np.arange(self.sparse_feature_number).reshape(
+            self.sparse_feature_number).astype('int32')
         self.all_label = fluid.layers.cast(
             x=fluid.layers.assign(all_label), dtype='int64')
         emb_all_label = embedding_layer(self.all_label, "emb")
-        emb_a = embedding_layer(self.analogy_a, "emb")
-        emb_b = embedding_layer(self.analogy_b, "emb")
-        emb_c = embedding_layer(self.analogy_c, "emb")
+        emb_a = embedding_layer(inputs[0], "emb")
+        emb_b = embedding_layer(inputs[1], "emb")
+        emb_c = embedding_layer(inputs[2], "emb")

         target = fluid.layers.elementwise_add(
             fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
@@ -245,8 +212,7 @@ class Model(ModelBase):
         values, pred_idx = fluid.layers.topk(input=dist, k=4)
         label = fluid.layers.expand(
             fluid.layers.unsqueeze(
-                self.analogy_d, axes=[1]),
-            expand_times=[1, 4])
+                inputs[3], axes=[1]), expand_times=[1, 4])
         label_ones = fluid.layers.fill_constant_batch_size_like(
             label, shape=[-1, 1], value=1.0, dtype='float32')
         right_cnt = fluid.layers.reduce_sum(
             input=fluid.layers.cast(
```
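word2vec's `input_data()` now returns different lists for train and infer, and `net()` dispatches on `is_infer`. The resulting positional layout, distilled from the diff as a reading aid (not part of the commit):

```python
# Train (is_infer=False):
#   inputs[0] -> input_word  [None, 1]        int64
#   inputs[1] -> true_word   [None, 1]        int64
#   inputs[2] -> neg_word    [None, neg_num]  int64  (omitted when
#                with_shuffle_batch is True; negatives are then drawn
#                via fluid.contrib.layers.shuffle_batch instead)
# Infer (is_infer=True):
#   inputs[0..3] -> analogy_a, analogy_b, analogy_c, analogy_d  [None] int64
```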
models/recall/word2vec/preprocess.py

```diff
@@ -162,7 +162,7 @@ def filter_corpus(args):
             if r_value > keep_prob:
                 continue
             write_line += str(idx)
-            write_line += ","
+            write_line += " "
             signal = True
         if signal:
             write_line = write_line[:-1] + "\n"
```
models/recall/word2vec/w2v_evaluate_reader.py

```diff
@@ -20,10 +20,10 @@ from paddlerec.core.reader import Reader
 from paddlerec.core.utils import envs


-class EvaluateReader(Reader):
+class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("word_id_dict_path", None,
-                                        "evaluate.reader")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_infer.word_id_dict_path")
         self.word_to_id = dict()
         self.id_to_word = dict()
         with io.open(dict_path, 'r', encoding='utf-8') as f:
@@ -75,6 +75,8 @@ class EvaluateReader(Reader):
     def generate_sample(self, line):
         def reader():
+            if ':' in line:
+                pass
             features = self.strip_lines(line.lower(), self.word_to_id)
             features = features.split()
             yield [('analogy_a', [self.word_to_id[features[0]]]),
```
models/recall/word2vec/w2v_reader.py

```diff
@@ -40,14 +40,12 @@ class NumpyRandomInt(object):
 class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("word_count_dict_path", None,
-                                        "train.reader")
-        self.window_size = envs.get_global_env("hyper_parameters.window_size",
-                                               None, "train.model")
-        self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
-                                           "train.model")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_train.word_count_dict_path")
+        self.window_size = envs.get_global_env("hyper_parameters.window_size")
+        self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
         self.with_shuffle_batch = envs.get_global_env(
-            "hyper_parameters.with_shuffle_batch", None, "train.model")
+            "hyper_parameters.with_shuffle_batch")
         self.random_generator = NumpyRandomInt(1, self.window_size + 1)

         self.cs = None
```