Commit ebb16616 authored by: D dongdaxiang

Merge branch 'multiview-simnet' into ssr

repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v1.2.3
    hooks:
    -   id: trailing-whitespace
\ No newline at end of file
@@ -13,7 +13,7 @@
The command line options of the training tool can be listed with `python train.py -h`; see its output for details.
```bash
python train.py
```
## Future work
- Multiple pairwise loss functions will be added to this project. For features from different views, the user-item matching relationship can be jointly optimized with different loss functions. The whole model will be validated on real-world data.
# Multi-view Simnet for Personalized Recommendation
## Introduction
In personalized recommendation scenarios, a user is often presented with several items selected by a personalized interest-matching model. In real-world applications, a user may have multiple views of features, such as user id, age, click history of items, and search queries. An item, e.g. a news article, may also have multiple views of features, such as the news title, the news category, and the images in the news. Multi-view Simnet is a matching model that combines users' and items' multiple views of features into one unified model. The model can be used in many industrial products, such as Baidu's feed news. The model is adapted from the paper "A Multi-View Deep Learning (MV-DNN) Approach for Cross Domain User Modeling in Recommendation Systems", WWW 2015. The difference between our model and MV-DNN is that we also consider multiple feature views of users.
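The core idea can be summarized in a few lines. Below is a minimal, illustrative numpy sketch (bag-of-words view encoders, random data, hypothetical dimensions), not the actual implementation; the real Paddle model is in nets.py in this commit:
```python
import numpy as np

rng = np.random.default_rng(0)

def encode_view(ids, emb_table):
    # bag-of-words encoder: sum the embeddings of one view's ids
    return emb_table[ids].sum(axis=0)

emb_table = rng.normal(size=(1000, 16))   # shared embedding table (1000 ids x 16 dims)
proj_user = rng.normal(size=(2 * 16, 8))  # project concatenated user views to 8 dims
proj_item = rng.normal(size=(2 * 16, 8))  # project concatenated item views to 8 dims

user_views = [rng.integers(0, 1000, size=5) for _ in range(2)]  # e.g. click history, queries
item_views = [rng.integers(0, 1000, size=5) for _ in range(2)]  # e.g. title, category

u = np.concatenate([encode_view(v, emb_table) for v in user_views]) @ proj_user
t = np.concatenate([encode_view(v, emb_table) for v in item_views]) @ proj_item

# cosine similarity of the two projections is the matching score
score = u @ t / (np.linalg.norm(u) * np.linalg.norm(t))
print(score)
```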
## Dataset
Currently, a synthetic dataset is provided for proof of concept, and we aim to add more real-world datasets to this project in the future.
@@ -12,7 +12,7 @@ This project aims to provide practical usage of Paddle in personalized matching
## Train
The command line options for training can be listed by `python train.py -h`.
```bash
python train.py
```
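For example, a run with the main options made explicit (the values shown are simply the defaults from `parse_args` in train.py below; adjust as needed):
```bash
python train.py --epochs 10 --batch_size 128 \
    --query_encoder bow --title_encoder bow \
    --query_slots 1 --title_slots 1 \
    --sparse_feature_dim 1000001 --embedding_dim 128 --hidden_size 128
```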
## Future work
@@ -20,4 +20,3 @@ python train.py
- Inference will be added
- Parallel Executor will be added in this project
- Distributed Training will be added
@@ -13,34 +13,26 @@
# limitations under the License.
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io

class BowEncoder(object):
    """ bow-encoder """

    def __init__(self):
        self.param_name = ""

    def forward(self, emb):
        return nn.sequence_pool(input=emb, pool_type='sum')

class CNNEncoder(object):
    """ cnn-encoder """

    def __init__(self,
                 param_name="cnn.w",
                 win_size=3,
                 ksize=128,
@@ -51,7 +43,7 @@ class CNNEncoder(object):
        self.ksize = ksize
        self.act = act
        self.pool_type = pool_type

    def forward(self, emb):
        return fluid.nets.sequence_conv_pool(
            input=emb,
@@ -61,29 +53,34 @@ class CNNEncoder(object):
            pool_type=self.pool_type,
            param_attr=self.param_name)

class GrnnEncoder(object):
    """ grnn-encoder """

    def __init__(self, param_name="grnn.w", hidden_size=128):
        self.param_name = param_name
        self.hidden_size = hidden_size

    def forward(self, emb):
        # dynamic_gru expects an input of size 3 * hidden_size, so project first
        fc0 = nn.fc(input=emb, size=self.hidden_size * 3)
        gru_h = nn.dynamic_gru(
            input=fc0,
            size=self.hidden_size,
            is_reverse=False,
            param_attr=self.param_name)
        return nn.sequence_pool(input=gru_h, pool_type='max')

'''this is a very simple Encoder factory;
most default argument values are used'''


class SimpleEncoderFactory(object):
    def __init__(self):
        pass

    ''' create an encoder through the create function '''

    def create(self, enc_type, enc_hid_size):
        if enc_type == "bow":
            bow_encode = BowEncoder()
@@ -95,16 +92,14 @@ class SimpleEncoderFactory(object):
            rnn_encode = GrnnEncoder(hidden_size=enc_hid_size)
            return rnn_encode
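# Example usage (the encoder type strings other than "bow" are in the elided
# part of this hunk; "gru" is assumed here for illustration):
#   factory = SimpleEncoderFactory()
#   bow_enc = factory.create("bow", 128)   # BowEncoder
#   rnn_enc = factory.create("gru", 128)   # GrnnEncoder(hidden_size=128)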

class MultiviewSimnet(object):
    """ multi-view simnet """

    def __init__(self, embedding_size, embedding_dim, hidden_size):
        self.embedding_size = embedding_size
        self.embedding_dim = embedding_dim
        self.emb_shape = [self.embedding_size, self.embedding_dim]
        self.hidden_size = hidden_size
        self.margin = 0.1
@@ -115,95 +110,126 @@ class MultiviewSimnet(object):
        self.title_encoders = encoders

    def get_correct(self, x, y):
        # count how many pairs satisfy x < y
        less = tensor.cast(cf.less_than(x, y), dtype='float32')
        correct = nn.reduce_sum(less)
        return correct

    def train_net(self):
        # input fields for query, pos_title, neg_title
        q_slots = [
            io.data(
                name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
            for i in range(len(self.query_encoders))
        ]
        pt_slots = [
            io.data(
                name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
            for i in range(len(self.title_encoders))
        ]
        nt_slots = [
            io.data(
                name="nt%d" % i, shape=[1], lod_level=1, dtype='int64')
            for i in range(len(self.title_encoders))
        ]

        # lookup embedding for each slot; all slots share one table "emb.w"
        q_embs = [
            nn.embedding(
                input=query, size=self.emb_shape, param_attr="emb.w")
            for query in q_slots
        ]
        pt_embs = [
            nn.embedding(
                input=title, size=self.emb_shape, param_attr="emb.w")
            for title in pt_slots
        ]
        nt_embs = [
            nn.embedding(
                input=title, size=self.emb_shape, param_attr="emb.w")
            for title in nt_slots
        ]

        # encode each embedding field with its encoder
        q_encodes = [
            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
        ]
        pt_encodes = [
            self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
        ]
        nt_encodes = [
            self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs)
        ]

        # concat multi view for query, pos_title, neg_title
        q_concat = nn.concat(q_encodes)
        pt_concat = nn.concat(pt_encodes)
        nt_concat = nn.concat(nt_encodes)

        # projection of hidden layer; pos/neg titles share "t_fc.w"
        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
        nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w')

        # cosine of hidden layers
        cos_pos = nn.cos_sim(q_hid, pt_hid)
        cos_neg = nn.cos_sim(q_hid, nt_hid)

        # pairwise hinge loss: max(0, margin - cos_pos + cos_neg)
        loss_part1 = nn.elementwise_sub(
            tensor.fill_constant_batch_size_like(
                input=cos_pos,
                shape=[-1, 1],
                value=self.margin,
                dtype='float32'),
            cos_pos)

        loss_part2 = nn.elementwise_add(loss_part1, cos_neg)

        loss_part3 = nn.elementwise_max(
            tensor.fill_constant_batch_size_like(
                input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
            loss_part2)

        avg_cost = nn.mean(loss_part3)
        # a pair is predicted correctly when cos_neg < cos_pos
        correct = self.get_correct(cos_neg, cos_pos)

        return q_slots + pt_slots + nt_slots, avg_cost, correct

    def pred_net(self, query_fields, pos_title_fields, neg_title_fields):
        q_slots = [
            io.data(
                name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
            for i in range(len(self.query_encoders))
        ]
        pt_slots = [
            io.data(
                name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
            for i in range(len(self.title_encoders))
        ]

        # lookup embedding for each slot
        q_embs = [
            nn.embedding(
                input=query, size=self.emb_shape, param_attr="emb.w")
            for query in q_slots
        ]
        pt_embs = [
            nn.embedding(
                input=title, size=self.emb_shape, param_attr="emb.w")
            for title in pt_slots
        ]

        # encode each embedding field with its encoder
        q_encodes = [
            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
        ]
        pt_encodes = [
            self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
        ]

        # concat multi view for query and pos_title
        q_concat = nn.concat(q_encodes)
        pt_concat = nn.concat(pt_encodes)

        # projection of hidden layer
        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')

        # cosine of hidden layers
        cos = nn.cos_sim(q_hid, pt_hid)
        return cos
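For orientation, wiring these pieces together follows the pattern in train.py below; a condensed sketch (the encoder type, slot counts, and dimensions shown are just the defaults from parse_args):
```python
# Condensed usage sketch, mirroring start_train() in train.py below.
from nets import MultiviewSimnet, SimpleEncoderFactory

factory = SimpleEncoderFactory()
query_encoders = [factory.create("bow", 128) for _ in range(1)]
title_encoders = [factory.create("bow", 128) for _ in range(1)]

model = MultiviewSimnet(
    embedding_size=1000001,  # sparse feature hashing space
    embedding_dim=128,
    hidden_size=128)
model.set_query_encoder(query_encoders)
model.set_title_encoder(title_encoders)

# all_slots feeds the data layers; avg_cost / correct are fetch targets
all_slots, avg_cost, correct = model.train_net()
```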
@@ -14,14 +14,14 @@
import random


class Dataset:
    def __init__(self):
        pass


class SyntheticDataset(Dataset):
    def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num):
        # ids are randomly generated
        self.ids_per_slot = 10
        self.sparse_feature_dim = sparse_feature_dim
@@ -39,14 +39,17 @@ class SyntheticDataset(Dataset):
            pos_title_slots = []
            neg_title_slots = []
            for i in range(self.query_slot_num):
                qslot = generate_ids(self.ids_per_slot,
                                     self.sparse_feature_dim)
                query_slots.append(qslot)
            for i in range(self.title_slot_num):
                pt_slot = generate_ids(self.ids_per_slot,
                                       self.sparse_feature_dim)
                pos_title_slots.append(pt_slot)
            if is_train:
                for i in range(self.title_slot_num):
                    nt_slot = generate_ids(self.ids_per_slot,
                                           self.sparse_feature_dim)
                    neg_title_slots.append(nt_slot)
                yield query_slots + pos_title_slots + neg_title_slots
            else:
@@ -62,4 +65,3 @@ class SyntheticDataset(Dataset):
    def test(self):
        return self._reader_creator(False)
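The `generate_ids` helper called above is defined in an elided part of this file; a minimal version consistent with its call sites (number of ids per slot, id space) would look like the following sketch, not necessarily the exact upstream code:
```python
import random

def generate_ids(num, space):
    # draw `num` random sparse-feature ids from [0, space)
    return [random.randint(0, space - 1) for _ in range(num)]
```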
@@ -20,96 +20,88 @@ import numpy as np
import math
import argparse
import logging

os.environ["CUDA_VISIBLE_DEVICES"] = ""

import paddle.fluid as fluid
import paddle
import time
import reader as reader
from nets import MultiviewSimnet, SimpleEncoderFactory

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)

def parse_args():
    parser = argparse.ArgumentParser("multi-view simnet")
    parser.add_argument("--train_file", type=str, help="Training file")
    parser.add_argument("--valid_file", type=str, help="Validation file")
    parser.add_argument(
        "--epochs", type=int, default=10, help="Number of epochs for training")
    parser.add_argument(
        "--model_output_dir",
        type=str,
        default='model_output',
        help="Model output folder")
    parser.add_argument(
        "--query_slots", type=int, default=1, help="Number of query slots")
    parser.add_argument(
        "--title_slots", type=int, default=1, help="Number of title slots")
    parser.add_argument(
        "--query_encoder",
        type=str,
        default="bow",
        help="Encoder module for slot encoding")
    parser.add_argument(
        "--title_encoder",
        type=str,
        default="bow",
        help="Encoder module for slot encoding")
    parser.add_argument(
        "--query_encode_dim",
        type=int,
        default=128,
        help="Dimension of query encoder output")
    parser.add_argument(
        "--title_encode_dim",
        type=int,
        default=128,
        help="Dimension of title encoder output")
    parser.add_argument(
        "--batch_size", type=int, default=128, help="Batch size for training")
    parser.add_argument(
        "--embedding_dim",
        type=int,
        default=128,
        help="Default dimension of embedding")
    parser.add_argument(
        "--sparse_feature_dim",
        type=int,
        default=1000001,
        help="Sparse feature hashing space for index processing")
    parser.add_argument(
        "--hidden_size", type=int, default=128, help="Hidden dim")
    return parser.parse_args()

def start_train(args):
    dataset = reader.SyntheticDataset(args.sparse_feature_dim, args.query_slots,
                                      args.title_slots)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.train(), buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
    place = fluid.CPUPlace()
    factory = SimpleEncoderFactory()
    query_encoders = [
        factory.create(args.query_encoder, args.query_encode_dim)
        for i in range(args.query_slots)
    ]
    title_encoders = [
        factory.create(args.title_encoder, args.title_encode_dim)
        for i in range(args.title_slots)
    ]
    m_simnet = MultiviewSimnet(args.sparse_feature_dim, args.embedding_dim,
                               args.hidden_size)
    m_simnet.set_query_encoder(query_encoders)
    m_simnet.set_title_encoder(title_encoders)
@@ -125,20 +117,21 @@ def start_train(args):
    for pass_id in range(args.epochs):
        for batch_id, data in enumerate(train_reader()):
            loss_val, correct_val = exe.run(loop_program,
                                            feed=feeder.feed(data),
                                            fetch_list=[avg_cost, correct])
            logger.info("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
                        .format(pass_id, batch_id, loss_val,
                                float(correct_val) / args.batch_size))
    fluid.io.save_inference_model(args.model_output_dir,
                                  [var.name for var in all_slots],
                                  [avg_cost, correct], exe)

def main():
    args = parse_args()
    start_train(args)


if __name__ == "__main__":
    main()