From 329750de936a1c518a758af669efa408bcf9edab Mon Sep 17 00:00:00 2001
From: dongdaxiang
Date: Sat, 27 Oct 2018 16:04:17 +0800
Subject: [PATCH] add multiview simnet

---
 fluid/PaddleRec/multiview-simnet/README.md |  26 +++
 fluid/PaddleRec/multiview-simnet/nets.py   | 205 +++++++++++++++++++++
 fluid/PaddleRec/multiview-simnet/reader.py |  51 +++++
 fluid/PaddleRec/multiview-simnet/train.py  | 130 +++++++++++++
 4 files changed, 412 insertions(+)
 create mode 100644 fluid/PaddleRec/multiview-simnet/README.md
 create mode 100644 fluid/PaddleRec/multiview-simnet/nets.py
 create mode 100644 fluid/PaddleRec/multiview-simnet/reader.py
 create mode 100644 fluid/PaddleRec/multiview-simnet/train.py

diff --git a/fluid/PaddleRec/multiview-simnet/README.md b/fluid/PaddleRec/multiview-simnet/README.md
new file mode 100644
index 00000000..fc38a59c
--- /dev/null
+++ b/fluid/PaddleRec/multiview-simnet/README.md
@@ -0,0 +1,26 @@
+# Multi-view Simnet for Personalized Recommendation
+
+## Introduction
+In a personalized recommendation scenario, a user is often provided with several items by a personalized interest matching model. In real-world applications, a user may have multiple views of features, such as user id, age, and click history of items. An item, e.g. a news article, may also have multiple views of features such as the news title, the news category, and so on. Multi-view Simnet is a matching model that combines users' and items' multiple views of features into one unified model. The model can be used in many industrial products, such as Baidu's news feed.
+
+## Dataset
+Currently, a synthetic dataset is provided as a proof of concept, and we aim to add more real-world datasets to this project in the future.
+
+## Model
+This project aims to provide a practical example of using Paddle in a personalized matching scenario. The model provides several encoder modules for different views of features. Currently, a Bag-of-Embedding encoder, a Temporal-Convolutional encoder, and a Gated-Recurrent-Unit encoder are provided.
We will add more practical encoders for sparse features commonly used in recommender systems. The training algorithm used in this model is pairwise ranking, in which a negative item with multiple views is sampled for each positive user-item pair.
+
+## Train
+The command line options for training can be listed by `python train.py -h`
+```bash
+python train.py
+```
+
+## Infer
+The command line options for inference can be listed by `python infer.py -h`
+
+## Future work
+- Multiple types of pairwise loss will be added in this project. For different views of features between a user and an item, multiple losses will be supported. The model will be verified on real-world datasets.
+- An inference script will be added.
+- Parallel Executor support will be added to this project.
+- Distributed training will be added.

diff --git a/fluid/PaddleRec/multiview-simnet/nets.py b/fluid/PaddleRec/multiview-simnet/nets.py
new file mode 100644
index 00000000..f5700e47
--- /dev/null
+++ b/fluid/PaddleRec/multiview-simnet/nets.py
@@ -0,0 +1,205 @@
+#Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid +Embedding=fluid.layers.embedding +FC=fluid.layers.fc +Cast=fluid.layers.cast +ReduceSum=fluid.layers.reduce_sum +Concat=fluid.layers.concat +Cosine=fluid.layers.cos_sim +ElemSub=fluid.layers.elementwise_sub +ElemDiv=fluid.layers.elementwise_div +ElemMax=fluid.layers.elementwise_max +ElemAdd=fluid.layers.elementwise_add +LessThan=fluid.layers.less_than +FillConst=fluid.layers.fill_constant +FillConstBatch=fluid.layers.fill_constant_batch_size_like +Mean=fluid.layers.mean +Data=fluid.layers.data + +class BowEncoder(object): + """ bow-encoder """ + def __init__(self): + self.param_name = "" + + def forward(self, emb): + return fluid.layers.sequence_pool(input=emb, + pool_type='sum') + +class CNNEncoder(object): + """ cnn-encoder""" + def __init__(self, + param_name="cnn.w", + win_size=3, + ksize=128, + act='tanh', + pool_type='max'): + self.param_name = param_name + self.win_size = win_size + self.ksize = ksize + self.act = act + self.pool_type = pool_type + + def forward(self, emb): + return fluid.nets.sequence_conv_pool( + input=emb, + num_filters=self.ksize, + filter_size=self.win_size, + act=self.act, + pool_type=self.pool_type, + attr=self.param_name) + +class GrnnEncoder(object): + """ grnn-encoder """ + def __init__(self, + param_name="grnn.w", + hidden_size=128): + self.param_name = args + self.hidden_size = hidden_size + + def forward(self, emb): + gru_h = fluid.layers.dynamic_gru(input=emb, + size=self.hidden_size, + is_reverse=False, + attr=self.param_name) + return fluid.layers.sequence_pool(input=gru_h, + pool_type='max') + +class SimpleEncoderFactory(object): + def __init__(self): + pass + + def create(self, enc_type, enc_hid_size): + if enc_type == "bow": + bow_encode = BowEncoder() + return bow_encode + elif enc_type == "cnn": + cnn_encode = CNNEncoder(ksize=enc_hid_size) + return cnn_encode + elif enc_type == "gru": + rnn_encode = GrnnEncoder(hidden_size=enc_hid_size) + return rnn_encode + +class MultiviewSimnet(object): 
+ """ multi-view simnet """ + def __init__(self, + embedding_size, + embedding_dim, + hidden_size): + self.embedding_size = embedding_size + self.embedding_dim = embedding_dim + self.emb_shape = [self.embedding_size, + self.embedding_dim] + self.hidden_size = hidden_size + self.margin = 0.1 + + def set_query_encoder(self, encoders): + self.query_encoders = encoders + + def set_title_encoder(self, encoders): + self.title_encoders = encoders + + def get_correct(self, x, y): + less = Cast(LessThan(x, y), dtype='float32') + correct = ReduceSum(less) + return correct + + def train_net(self): + # input fields for query, pos_title, neg_title + q_slots = [Data(name="q%d" % i, shape=[1], lod_level=1, dtype='int64') + for i in range(len(self.query_encoders))] + pt_slots = [Data(name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') + for i in range(len(self.title_encoders))] + nt_slots = [Data(name="nt%d" % i, shape=[1], lod_level=1, dtype='int64') + for i in range(len(self.title_encoders))] + + # lookup embedding for each slot + q_embs = [Embedding(input=query, size=self.emb_shape, + param_attr="emb.w") for query in q_slots] + pt_embs = [Embedding(input=title, size=self.emb_shape, + param_attr="emb.w") for title in pt_slots] + nt_embs = [Embedding(input=title, size=self.emb_shape, + param_attr="emb.w") for title in nt_slots] + + # encode each embedding field with encoder + q_encodes = [self.query_encoders[i].forward(emb) + for i, emb in enumerate(q_embs)] + pt_encodes = [self.title_encoders[i].forward(emb) + for i, emb in enumerate(pt_embs)] + nt_encodes = [self.title_encoders[i].forward(emb) + for i, emb in enumerate(nt_embs)] + + # concat multi view for query, pos_title, neg_title + q_concat = Concat(q_encodes) + pt_concat = Concat(pt_encodes) + nt_concat = Concat(nt_encodes) + + # projection of hidden layer + q_hid = FC(q_concat, size=self.hidden_size, param_attr='q_fc.w') + pt_hid = FC(pt_concat, size=self.hidden_size, param_attr='t_fc.w') + nt_hid = FC(nt_concat, 
size=self.hidden_size, param_attr='t_fc.w') + + # cosine of hidden layers + cos_pos = Cosine(q_hid, pt_hid) + cos_neg = Cosine(q_hid, nt_hid) + + # pairwise hinge_loss + loss_part1 = ElemSub(FillConstBatch( + input=cos_pos, + shape=[-1, 1], + value=self.margin, + dtype='float32'), cos_pos) + + loss_part2 = ElemAdd(loss_part1, cos_neg) + + loss_part3 = ElemMax(FillConstBatch( + input=loss_part2, + shape=[-1, 1], + value=0.0, + dtype='float32'), loss_part2) + + avg_cost = Mean(loss_part3) + correct = self.get_correct(cos_pos, cos_neg) + + return q_slots + pt_slots + nt_slots, avg_cost, correct + + def pred_net(self, + query_fields, + pos_title_fields, + neg_title_fields): + q_slots = [Data(name="q%d" % i, shape=[1], lod_level=1, dtype='int64') + for i in range(len(self.query_encoders))] + pt_slots = [Data(name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') + for i in range(len(self.title_encoders))] + # lookup embedding for each slot + q_embs = [Embedding(input=query, size=self.emb_shape, + param_attr="emb.w") for query in q_slots] + pt_embs = [Embedding(input=title, size=self.emb_shape, + param_attr="emb.w") for title in pt_slots] + # encode each embedding field with encoder + q_encodes = [self.query_encoder[i].forward(emb) + for i, emb in enumerate(q_embs)] + pt_encodes = [self.title_encoders[i].forward(emb) + for i, emb in enumerate(pt_embs)] + # concat multi view for query, pos_title, neg_title + q_concat = Concat(q_encodes) + pt_concat = Concat(pt_encodes) + # projection of hidden layer + q_hid = FC(q_concat, size=self.hidden_size, param_attr='q_fc.w') + pt_hid = FC(pt_concat, size=self.hidden_size, param_attr='t_fc.w') + # cosine of hidden layers + cos = Cosine(q_hid, pt_hid) + return cos + + diff --git a/fluid/PaddleRec/multiview-simnet/reader.py b/fluid/PaddleRec/multiview-simnet/reader.py new file mode 100644 index 00000000..9e4b7633 --- /dev/null +++ b/fluid/PaddleRec/multiview-simnet/reader.py @@ -0,0 +1,51 @@ +import random + +class Dataset: + def 
__init__(self): + pass + +class SyntheticDataset(Dataset): + def __init__(self, sparse_feature_dim, + query_slot_num, + title_slot_num): + # ids are randomly generated + self.ids_per_slot = 10 + self.sparse_feature_dim = sparse_feature_dim + self.query_slot_num = query_slot_num + self.title_slot_num = title_slot_num + self.dataset_size = 10000 + + def _reader_creator(self, is_train): + def generate_ids(num, space): + return [random.randint(0, space - 1) for i in range(num)] + + def reader(): + for i in range(self.dataset_size): + query_slots = [] + pos_title_slots = [] + neg_title_slots = [] + for i in range(self.query_slot_num): + qslot = generate_ids(self.ids_per_slot, self.sparse_feature_dim) + query_slots.append(qslot) + for i in range(self.title_slot_num): + pt_slot = generate_ids(self.ids_per_slot, self.sparse_feature_dim) + pos_title_slots.append(pt_slot) + if is_train: + for i in range(self.title_slot_num): + nt_slot = generate_ids(self.ids_per_slot, self.sparse_feature_dim) + neg_title_slots.append(nt_slot) + yield query_slots + pos_title_slots + neg_title_slots + else: + yield query_slots + pos_title_slots + + return reader + + def train(self): + return self._reader_creator(True) + + def valid(self): + return self._reader_creator(True) + + def test(self): + return self._reader_creator(False) + diff --git a/fluid/PaddleRec/multiview-simnet/train.py b/fluid/PaddleRec/multiview-simnet/train.py new file mode 100644 index 00000000..cecf6425 --- /dev/null +++ b/fluid/PaddleRec/multiview-simnet/train.py @@ -0,0 +1,130 @@ +import os +import sys +import time +import six +import numpy as np +import math +import argparse +import logging +os.environ["CUDA_VISIBLE_DEVICES"] = "" +import paddle.fluid as fluid +import paddle +import time +import reader as reader +from nets import MultiviewSimnet, SimpleEncoderFactory + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) + +def 
def parse_args():
    """Parse command-line arguments for the multi-view simnet trainer."""
    parser = argparse.ArgumentParser("multi-view simnet")
    parser.add_argument("--train_file", type=str, help="Training file")
    parser.add_argument("--valid_file", type=str, help="Validation file")
    parser.add_argument("--epochs",
                        type=int,
                        default=10,
                        help="Number of epochs for training")
    parser.add_argument("--model_output_dir",
                        type=str,
                        default='model_output',
                        help="Model output folder")
    parser.add_argument("--query_slots",
                        type=int,
                        default=1,
                        help="Number of query slots")
    parser.add_argument("--title_slots",
                        type=int,
                        default=1,
                        help="Number of title slots")
    parser.add_argument("--query_encoder",
                        type=str,
                        default="bow",
                        help="Encoder module for slot encoding")
    parser.add_argument("--title_encoder",
                        type=str,
                        default="bow",
                        help="Encoder module for slot encoding")
    parser.add_argument("--query_encode_dim",
                        type=int,
                        default=128,
                        help="Dimension of query encoder output")
    parser.add_argument("--title_encode_dim",
                        type=int,
                        default=128,
                        help="Dimension of title encoder output")
    parser.add_argument("--batch_size",
                        type=int,
                        default=128,
                        help="Batch size for training")
    parser.add_argument("--embedding_dim",
                        type=int,
                        default=128,
                        help="Default Dimension of Embedding")
    parser.add_argument("--sparse_feature_dim",
                        type=int,
                        default=1000001,
                        help="Sparse feature hashing space"
                        "for index processing")
    parser.add_argument("--hidden_size",
                        type=int,
                        default=128,
                        help="Hidden dim")
    return parser.parse_args()


def start_train(args):
    """Build the model, train on synthetic data, and save after each pass."""
    dataset = reader.SyntheticDataset(args.sparse_feature_dim,
                                      args.query_slots,
                                      args.title_slots)
    train_reader = paddle.batch(
        paddle.reader.shuffle(dataset.train(),
                              buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
    place = fluid.CPUPlace()

    # One encoder per slot for queries and titles.
    factory = SimpleEncoderFactory()
    query_encoders = [
        factory.create(args.query_encoder, args.query_encode_dim)
        for _ in range(args.query_slots)
    ]
    title_encoders = [
        factory.create(args.title_encoder, args.title_encode_dim)
        for _ in range(args.title_slots)
    ]
    m_simnet = MultiviewSimnet(args.sparse_feature_dim,
                               args.embedding_dim,
                               args.hidden_size)
    m_simnet.set_query_encoder(query_encoders)
    m_simnet.set_title_encoder(title_encoders)
    all_slots, avg_cost, correct = m_simnet.train_net()

    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(avg_cost)
    startup_program = fluid.default_startup_program()
    loop_program = fluid.default_main_program()

    feeder = fluid.DataFeeder(feed_list=all_slots, place=place)
    exe = fluid.Executor(place)
    exe.run(startup_program)

    for pass_id in range(args.epochs):
        for batch_id, data in enumerate(train_reader()):
            loss_val, correct_val = exe.run(
                loop_program,
                feed=feeder.feed(data),
                fetch_list=[avg_cost, correct])
            logger.info("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
                        .format(pass_id, batch_id, loss_val,
                                float(correct_val) / args.batch_size))
        # Save an inference model after every pass.
        # Fix: the original comprehension iterated `val` but read `var`
        # ([var.name for val in all_slots]), raising a NameError.
        fluid.io.save_inference_model(args.model_output_dir,
                                      [var.name for var in all_slots],
                                      [avg_cost, correct],
                                      exe)


def main():
    args = parse_args()
    start_train(args)


if __name__ == "__main__":
    main()