提交 329750de 编写于 作者: D dongdaxiang

add multiview simnet

上级 8997bb63
# Multi-view Simnet for Personalized recommendation
## Introduction
In a personalized recommendation scenario, a user is often presented with several items selected by a personalized interest-matching model. In real-world applications, a user may have multiple views of features, such as user id, age, and click history of items. An item, e.g. a news article, may also have multiple views of features such as news title, news category, and so on. Multi-view SimNet is a matching model that combines users' and items' multiple views of features into one unified model. The model can be used in many industrial products, such as Baidu's news feed.
## Dataset
Currently, synthetic dataset is provided for proof of concept and we aim to add more real world dataset in this project in the future.
## Model
This project aims to provide a practical example of using Paddle in a personalized matching scenario. The model provides several encoder modules for different views of features. Currently, Bag-of-Embedding, Temporal-Convolutional, and Gated-Recurrent-Unit encoders are provided. We will add more practical encoders for the sparse features commonly used in recommender systems. The training algorithm used in this model is pairwise ranking, in which a negative item with multiple views is sampled given a positive user-item pair.
## Train
The command line options for training can be listed by `python train.py -h`
```bash
python train.py
```
## Infer
The command line options for inference can be listed by `python infer.py -h`
## Future work
- Multiple types of pairwise loss will be added in this project. For different views of features between a user and an item, multiple losses will be supported. The model will be verified on real-world datasets.
- Inference support will be added.
- Parallel Executor support will be added in this project.
- Distributed training will be added.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid

# Short aliases for the paddle.fluid ops used throughout this file.
# Keeping them at module level makes the network-construction code below
# read closer to pseudo-code.
Embedding=fluid.layers.embedding            # sparse-id embedding lookup
FC=fluid.layers.fc                          # fully-connected projection
Cast=fluid.layers.cast                      # dtype conversion
ReduceSum=fluid.layers.reduce_sum           # sum over tensor elements
Concat=fluid.layers.concat                  # concatenate tensors
Cosine=fluid.layers.cos_sim                 # cosine similarity
ElemSub=fluid.layers.elementwise_sub        # elementwise subtraction
ElemDiv=fluid.layers.elementwise_div        # elementwise division
ElemMax=fluid.layers.elementwise_max        # elementwise maximum
ElemAdd=fluid.layers.elementwise_add        # elementwise addition
LessThan=fluid.layers.less_than             # elementwise x < y
FillConst=fluid.layers.fill_constant        # constant-filled tensor
FillConstBatch=fluid.layers.fill_constant_batch_size_like
Mean=fluid.layers.mean                      # mean over all elements
Data=fluid.layers.data                      # input data layer
class BowEncoder(object):
    """Bag-of-words encoder: sum-pools a sequence of embeddings."""

    def __init__(self):
        # BoW pooling has no learnable parameters of its own.
        self.param_name = ""

    def forward(self, emb):
        """Sum-pool the input sequence embedding into one vector."""
        pooled = fluid.layers.sequence_pool(input=emb, pool_type='sum')
        return pooled
class CNNEncoder(object):
    """Temporal-convolution encoder over a sequence of embeddings."""

    def __init__(self,
                 param_name="cnn.w",
                 win_size=3,
                 ksize=128,
                 act='tanh',
                 pool_type='max'):
        # Convolution window, filter count, activation and pooling type.
        self.param_name = param_name
        self.win_size = win_size
        self.ksize = ksize
        self.act = act
        self.pool_type = pool_type

    def forward(self, emb):
        """Convolve over the sequence, then pool to a fixed-size vector."""
        conv_pooled = fluid.nets.sequence_conv_pool(
            input=emb,
            num_filters=self.ksize,
            filter_size=self.win_size,
            act=self.act,
            pool_type=self.pool_type,
            attr=self.param_name)
        return conv_pooled
class GrnnEncoder(object):
    """GRU (gated recurrent unit) encoder over a sequence of embeddings.

    Args:
        param_name: parameter attribute name for the GRU weights.
        hidden_size: size of the GRU hidden state.
    """

    def __init__(self,
                 param_name="grnn.w",
                 hidden_size=128):
        # Bug fix: the original assigned the undefined name `args` here,
        # which raised a NameError as soon as the encoder was created.
        self.param_name = param_name
        self.hidden_size = hidden_size

    def forward(self, emb):
        """Run a forward GRU over `emb` and max-pool its hidden states."""
        gru_h = fluid.layers.dynamic_gru(input=emb,
                                         size=self.hidden_size,
                                         is_reverse=False,
                                         attr=self.param_name)
        return fluid.layers.sequence_pool(input=gru_h,
                                          pool_type='max')
class SimpleEncoderFactory(object):
    """Factory that builds an encoder instance from a type name."""

    def __init__(self):
        pass

    def create(self, enc_type, enc_hid_size):
        """Create an encoder of the requested type.

        Args:
            enc_type: one of "bow", "cnn", "gru".
            enc_hid_size: hidden/output dimension for the encoder.

        Returns:
            A BowEncoder, CNNEncoder, or GrnnEncoder instance.

        Raises:
            ValueError: for an unknown `enc_type`. The original silently
                fell through and returned None, which only surfaced much
                later as an AttributeError at forward() time.
        """
        if enc_type == "bow":
            return BowEncoder()
        elif enc_type == "cnn":
            return CNNEncoder(ksize=enc_hid_size)
        elif enc_type == "gru":
            return GrnnEncoder(hidden_size=enc_hid_size)
        raise ValueError("unknown encoder type: %s" % enc_type)
class MultiviewSimnet(object):
    """Multi-view SimNet: a pairwise-ranking matching model.

    Encodes multiple feature views ("slots") of a query (user) and a
    title (item) with per-slot encoders, projects each side into a
    shared hidden space with an FC layer, and trains with a pairwise
    hinge loss on the cosine similarities of (query, pos_title) versus
    (query, neg_title).

    Args:
        embedding_size: vocabulary size of the sparse id space.
        embedding_dim: dimensionality of each embedding vector.
        hidden_size: output size of the projection (FC) layer.
    """

    def __init__(self,
                 embedding_size,
                 embedding_dim,
                 hidden_size):
        self.embedding_size = embedding_size
        self.embedding_dim = embedding_dim
        self.emb_shape = [self.embedding_size,
                          self.embedding_dim]
        self.hidden_size = hidden_size
        # Margin of the pairwise hinge loss.
        self.margin = 0.1

    def set_query_encoder(self, encoders):
        # One encoder per query slot, in slot order.
        self.query_encoders = encoders

    def set_title_encoder(self, encoders):
        # One encoder per title slot, in slot order.
        self.title_encoders = encoders

    def get_correct(self, x, y):
        """Count rows where x < y (here: cos_neg < cos_pos, i.e. the
        positive title out-scored the negative one)."""
        less = Cast(LessThan(x, y), dtype='float32')
        correct = ReduceSum(less)
        return correct

    def train_net(self):
        """Build the training network.

        Returns:
            (input_vars, avg_cost, correct): the data layers to feed,
            the mean hinge loss, and the correct-pair counter.
        """
        # Input fields for query, positive title and negative title.
        q_slots = [Data(name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
                   for i in range(len(self.query_encoders))]
        pt_slots = [Data(name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
                    for i in range(len(self.title_encoders))]
        nt_slots = [Data(name="nt%d" % i, shape=[1], lod_level=1, dtype='int64')
                    for i in range(len(self.title_encoders))]
        # Embedding lookup for every slot; all slots share one table "emb.w".
        q_embs = [Embedding(input=query, size=self.emb_shape,
                            param_attr="emb.w") for query in q_slots]
        pt_embs = [Embedding(input=title, size=self.emb_shape,
                             param_attr="emb.w") for title in pt_slots]
        nt_embs = [Embedding(input=title, size=self.emb_shape,
                             param_attr="emb.w") for title in nt_slots]
        # Encode each embedded field with its per-slot encoder.
        q_encodes = [self.query_encoders[i].forward(emb)
                     for i, emb in enumerate(q_embs)]
        pt_encodes = [self.title_encoders[i].forward(emb)
                      for i, emb in enumerate(pt_embs)]
        nt_encodes = [self.title_encoders[i].forward(emb)
                      for i, emb in enumerate(nt_embs)]
        # Concatenate the multiple views of each side.
        q_concat = Concat(q_encodes)
        pt_concat = Concat(pt_encodes)
        nt_concat = Concat(nt_encodes)
        # Project to the shared hidden space; both titles share "t_fc.w".
        q_hid = FC(q_concat, size=self.hidden_size, param_attr='q_fc.w')
        pt_hid = FC(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
        nt_hid = FC(nt_concat, size=self.hidden_size, param_attr='t_fc.w')
        # Cosine similarities of query vs. positive / negative title.
        cos_pos = Cosine(q_hid, pt_hid)
        cos_neg = Cosine(q_hid, nt_hid)
        # Pairwise hinge loss: mean(max(0, margin - cos_pos + cos_neg)).
        loss_part1 = ElemSub(FillConstBatch(
            input=cos_pos,
            shape=[-1, 1],
            value=self.margin,
            dtype='float32'), cos_pos)
        loss_part2 = ElemAdd(loss_part1, cos_neg)
        loss_part3 = ElemMax(FillConstBatch(
            input=loss_part2,
            shape=[-1, 1],
            value=0.0,
            dtype='float32'), loss_part2)
        avg_cost = Mean(loss_part3)
        correct = self.get_correct(cos_pos, cos_neg)
        return q_slots + pt_slots + nt_slots, avg_cost, correct

    def pred_net(self,
                 query_fields,
                 pos_title_fields,
                 neg_title_fields):
        """Build the inference network.

        Returns the cosine similarity between the encoded query and the
        encoded title.

        NOTE(review): the three *_fields arguments are unused here; they
        are kept to preserve the existing call signature.
        """
        q_slots = [Data(name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
                   for i in range(len(self.query_encoders))]
        pt_slots = [Data(name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
                    for i in range(len(self.title_encoders))]
        # Embedding lookup, sharing the training-time table "emb.w".
        q_embs = [Embedding(input=query, size=self.emb_shape,
                            param_attr="emb.w") for query in q_slots]
        pt_embs = [Embedding(input=title, size=self.emb_shape,
                             param_attr="emb.w") for title in pt_slots]
        # Bug fix: the original referenced the undefined attribute
        # `self.query_encoder` (missing the trailing "s"), raising an
        # AttributeError when building the inference network.
        q_encodes = [self.query_encoders[i].forward(emb)
                     for i, emb in enumerate(q_embs)]
        pt_encodes = [self.title_encoders[i].forward(emb)
                      for i, emb in enumerate(pt_embs)]
        # Concatenate views, project, and compare.
        q_concat = Concat(q_encodes)
        pt_concat = Concat(pt_encodes)
        q_hid = FC(q_concat, size=self.hidden_size, param_attr='q_fc.w')
        pt_hid = FC(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
        cos = Cosine(q_hid, pt_hid)
        return cos
import random
class Dataset:
    """Abstract base class for datasets used by this example."""

    def __init__(self):
        pass
class SyntheticDataset(Dataset):
    """Synthetic dataset of random sparse ids, for proof of concept.

    Args:
        sparse_feature_dim: size of the sparse id space; ids are drawn
            uniformly from [0, sparse_feature_dim).
        query_slot_num: number of query feature slots per sample.
        title_slot_num: number of title feature slots per sample.
        dataset_size: number of samples yielded per pass. Generalized
            from the original hard-coded 10000; the default preserves
            the old behavior.
    """

    def __init__(self, sparse_feature_dim,
                 query_slot_num,
                 title_slot_num,
                 dataset_size=10000):
        # Each slot holds this many randomly generated ids.
        self.ids_per_slot = 10
        self.sparse_feature_dim = sparse_feature_dim
        self.query_slot_num = query_slot_num
        self.title_slot_num = title_slot_num
        self.dataset_size = dataset_size

    def _reader_creator(self, is_train):
        """Return a generator-style reader.

        A training sample is query slots + positive-title slots +
        negative-title slots; a test sample omits the negatives.
        """
        def generate_ids(num, space):
            return [random.randint(0, space - 1) for _ in range(num)]

        def reader():
            # The original shadowed the outer loop variable `i` in the
            # per-slot loops; use distinct throwaway names instead.
            for _ in range(self.dataset_size):
                query_slots = [
                    generate_ids(self.ids_per_slot, self.sparse_feature_dim)
                    for _ in range(self.query_slot_num)
                ]
                pos_title_slots = [
                    generate_ids(self.ids_per_slot, self.sparse_feature_dim)
                    for _ in range(self.title_slot_num)
                ]
                if is_train:
                    neg_title_slots = [
                        generate_ids(self.ids_per_slot,
                                     self.sparse_feature_dim)
                        for _ in range(self.title_slot_num)
                    ]
                    yield query_slots + pos_title_slots + neg_title_slots
                else:
                    yield query_slots + pos_title_slots

        return reader

    def train(self):
        return self._reader_creator(True)

    def valid(self):
        # NOTE(review): validation deliberately uses train-mode samples
        # (with negatives) so the pairwise loss can be evaluated on it.
        return self._reader_creator(True)

    def test(self):
        return self._reader_creator(False)
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import logging
# Force CPU execution for this example; no GPUs are used.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle.fluid as fluid
import paddle
import time
import reader as reader
from nets import MultiviewSimnet, SimpleEncoderFactory

# Module-level logger for training-progress messages.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
    """Define and parse the command-line options for multi-view SimNet."""
    parser = argparse.ArgumentParser("multi-view simnet")
    add = parser.add_argument
    add("--train_file", type=str, help="Training file")
    add("--valid_file", type=str, help="Validation file")
    add("--epochs", type=int, default=10,
        help="Number of epochs for training")
    add("--model_output_dir", type=str, default='model_output',
        help="Model output folder")
    add("--query_slots", type=int, default=1,
        help="Number of query slots")
    add("--title_slots", type=int, default=1,
        help="Number of title slots")
    add("--query_encoder", type=str, default="bow",
        help="Encoder module for slot encoding")
    add("--title_encoder", type=str, default="bow",
        help="Encoder module for slot encoding")
    add("--query_encode_dim", type=int, default=128,
        help="Dimension of query encoder output")
    add("--title_encode_dim", type=int, default=128,
        help="Dimension of title encoder output")
    add("--batch_size", type=int, default=128,
        help="Batch size for training")
    add("--embedding_dim", type=int, default=128,
        help="Default Dimension of Embedding")
    add("--sparse_feature_dim", type=int, default=1000001,
        help="Sparse feature hashing space"
        "for index processing")
    add("--hidden_size", type=int, default=128,
        help="Hidden dim")
    return parser.parse_args()
def start_train(args):
    """Build the model, train it on the synthetic dataset, and save it.

    Args:
        args: parsed command-line namespace from parse_args().
    """
    dataset = reader.SyntheticDataset(args.sparse_feature_dim,
                                      args.query_slots,
                                      args.title_slots)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.train(),
            buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
    place = fluid.CPUPlace()
    # One encoder per slot, built from the CLI-selected encoder type.
    factory = SimpleEncoderFactory()
    query_encoders = [factory.create(args.query_encoder,
                                     args.query_encode_dim)
                      for i in range(args.query_slots)]
    title_encoders = [factory.create(args.title_encoder,
                                     args.title_encode_dim)
                      for i in range(args.title_slots)]
    m_simnet = MultiviewSimnet(args.sparse_feature_dim,
                               args.embedding_dim,
                               args.hidden_size)
    m_simnet.set_query_encoder(query_encoders)
    m_simnet.set_title_encoder(title_encoders)
    all_slots, avg_cost, correct = m_simnet.train_net()
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(avg_cost)
    startup_program = fluid.default_startup_program()
    loop_program = fluid.default_main_program()
    feeder = fluid.DataFeeder(feed_list=all_slots, place=place)
    exe = fluid.Executor(place)
    exe.run(startup_program)
    for pass_id in range(args.epochs):
        for batch_id, data in enumerate(train_reader()):
            loss_val, correct_val = exe.run(
                loop_program, feed=feeder.feed(data),
                fetch_list=[avg_cost, correct])
            logger.info("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
                        .format(pass_id, batch_id, loss_val,
                                float(correct_val) / args.batch_size))
        # Save a checkpoint after every pass.
        # Bug fix: the original comprehension iterated `val` but read
        # `var` ([var.name for val in all_slots]), a NameError.
        fluid.io.save_inference_model(args.model_output_dir,
                                      [var.name for var in all_slots],
                                      [avg_cost, correct],
                                      exe)
def main():
    """Entry point: parse CLI arguments and launch training."""
    start_train(parse_args())


if __name__ == "__main__":
    main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册