diff --git a/fluid/PaddleRec/multiview-simnet/.pre-commit-config.yaml b/fluid/PaddleRec/multiview-simnet/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e2c69d4954cb52a07c249797f573798dccef2992
--- /dev/null
+++ b/fluid/PaddleRec/multiview-simnet/.pre-commit-config.yaml
@@ -0,0 +1,5 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v1.2.3
+  hooks:
+  - id: trailing-whitespace
\ No newline at end of file
diff --git a/fluid/PaddleRec/multiview-simnet/README.cn.md b/fluid/PaddleRec/multiview-simnet/README.cn.md
index e6a0c58483e5097e33dea82af59ffcc273ba2953..3724ede23c4044658ee9b12a55bf7948c03ccaa3 100644
--- a/fluid/PaddleRec/multiview-simnet/README.cn.md
+++ b/fluid/PaddleRec/multiview-simnet/README.cn.md
@@ -13,7 +13,7 @@
 The command line below runs the training tool; for the specific options, refer to the output of `python train.py -h`.
 ```bash
-python train.py 
+python train.py
 ```
 ## Future work
 - Multiple pairwise loss functions will be added to this project. For features from different views, the user-item matching relation can be jointly optimized with different loss functions. The whole model will be validated on real-world data.
diff --git a/fluid/PaddleRec/multiview-simnet/README.md b/fluid/PaddleRec/multiview-simnet/README.md
index 53d7c013ac33f075e4505565de9163f58d8e1eb6..253f4cbae76dfcfabd1dcf6690fe9d6a43dbda35 100644
--- a/fluid/PaddleRec/multiview-simnet/README.md
+++ b/fluid/PaddleRec/multiview-simnet/README.md
@@ -1,7 +1,7 @@
 # Multi-view Simnet for Personalized recommendation
 
 ## Introduction
-In personalized recommendation scenario, a user often is provided with several items from personalized interest matching model. In real world application, a user may have multiple views of features, say user-id, age, click-history of items. A item, e.g. news, may also have multiple views of features like news title, news category and so on. Multi-view Simnet is matching a model that combine users' and items' multiple views of features into one unified model. The model can be used in many industrial product like Baidu's feed news.
+In a personalized recommendation scenario, a user is often presented with several items selected by a personalized interest matching model. In real-world applications, a user may have multiple views of features, such as user-id, age, click history of items, and search queries. An item, e.g. a news article, may also have multiple views of features, such as its title, its category, and the images it contains. Multi-view Simnet is a matching model that combines users' and items' multiple views of features in one unified model. The model can be used in many industrial products, such as Baidu's news feed.
 The model is adapted from the paper A Multi-View Deep Learning (MV-DNN) Approach for Cross Domain User Modeling in Recommendation Systems, WWW 2015. The difference between our model and MV-DNN is that we also consider multiple feature views of users.
 
 ## Dataset
 Currently, a synthetic dataset is provided as a proof of concept, and we aim to add more real-world datasets to this project in the future.
@@ -12,7 +12,7 @@ This project aims to provide practical usage of Paddle in personalized matching
 ## Train
 The command line options for training can be listed by `python train.py -h`
 ```bash
-python train.py 
+python train.py
 ```
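+
+For example, to override a few of the defaults (the flags below are defined
+in parse_args in train.py; the `gru` encoder name is an assumption based on
+SimpleEncoderFactory in nets.py):
+```bash
+python train.py --query_encoder gru --title_encoder gru --epochs 2
+```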
 
 ## Future work
@@ -20,4 +20,3 @@ python train.py
 - infer will be added
 - Parallel Executor will be added in this project
 - Distributed Training will be added
-
diff --git a/fluid/PaddleRec/multiview-simnet/nets.py b/fluid/PaddleRec/multiview-simnet/nets.py
index 9cb2e791c90198547b136ecc1382352a5cd16686..919345b51ac5ce6fda35e8da396a4848a14c5999 100644
--- a/fluid/PaddleRec/multiview-simnet/nets.py
+++ b/fluid/PaddleRec/multiview-simnet/nets.py
@@ -13,34 +13,26 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
-Embedding=fluid.layers.embedding
-FC=fluid.layers.fc
-Cast=fluid.layers.cast
-ReduceSum=fluid.layers.reduce_sum
-Concat=fluid.layers.concat
-Cosine=fluid.layers.cos_sim
-ElemSub=fluid.layers.elementwise_sub
-ElemDiv=fluid.layers.elementwise_div
-ElemMax=fluid.layers.elementwise_max
-ElemAdd=fluid.layers.elementwise_add
-LessThan=fluid.layers.less_than
-FillConst=fluid.layers.fill_constant
-FillConstBatch=fluid.layers.fill_constant_batch_size_like
-Mean=fluid.layers.mean
-Data=fluid.layers.data
+import paddle.fluid.layers.nn as nn
+import paddle.fluid.layers.tensor as tensor
+import paddle.fluid.layers.control_flow as cf
+import paddle.fluid.layers.io as io
+
 
 class BowEncoder(object):
     """ bow-encoder """
+
     def __init__(self):
         self.param_name = ""
 
     def forward(self, emb):
-        return fluid.layers.sequence_pool(input=emb,
-                                          pool_type='sum')
+        return nn.sequence_pool(input=emb, pool_type='sum')
+
 
 class CNNEncoder(object):
     """ cnn-encoder"""
-    def __init__(self,
+
+    def __init__(self,
                  param_name="cnn.w",
                  win_size=3,
                  ksize=128,
@@ -51,7 +43,7 @@ class CNNEncoder(object):
         self.ksize = ksize
         self.act = act
         self.pool_type = pool_type
-    
+
     def forward(self, emb):
         return fluid.nets.sequence_conv_pool(
             input=emb,
@@ -61,29 +53,34 @@
             pool_type=self.pool_type,
             param_attr=self.param_name)
 
+
 class GrnnEncoder(object):
     """ grnn-encoder """
-    def __init__(self,
-                 param_name="grnn.w",
-                 hidden_size=128):
+
+    def __init__(self, param_name="grnn.w", hidden_size=128):
         self.param_name = param_name
         self.hidden_size = hidden_size
-    
+
     def forward(self, emb):
-        gru_h = fluid.layers.dynamic_gru(input=emb,
-                                         size=self.hidden_size,
-                                         is_reverse=False,
-                                         attr=self.param_name)
-        return fluid.layers.sequence_pool(input=gru_h,
-                                          pool_type='max')
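+        # dynamic_gru expects input of width 3 * hidden_size (the update
+        # gate, reset gate and candidate projections), so the embedding is
+        # first projected up with a fully-connected layer.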
+        fc0 = nn.fc(input=emb, size=self.hidden_size * 3)
+        gru_h = nn.dynamic_gru(
+            input=fc0,
+            size=self.hidden_size,
+            is_reverse=False,
+            param_attr=self.param_name)
+        return nn.sequence_pool(input=gru_h, pool_type='max')
+
 
 '''this is a very simple Encoder factory
 most default argument values are used'''
+
+
 class SimpleEncoderFactory(object):
     def __init__(self):
         pass
 
     ''' create an encoder through create function '''
+
     def create(self, enc_type, enc_hid_size):
         if enc_type == "bow":
             bow_encode = BowEncoder()
@@ -95,16 +92,14 @@
             rnn_encode = GrnnEncoder(hidden_size=enc_hid_size)
             return rnn_encode
 
+
 class MultiviewSimnet(object):
     """ multi-view simnet """
-    def __init__(self,
-                 embedding_size,
-                 embedding_dim,
-                 hidden_size):
+
+    def __init__(self, embedding_size, embedding_dim, hidden_size):
         self.embedding_size = embedding_size
         self.embedding_dim = embedding_dim
-        self.emb_shape = [self.embedding_size,
-                          self.embedding_dim]
+        self.emb_shape = [self.embedding_size, self.embedding_dim]
         self.hidden_size = hidden_size
         self.margin = 0.1
@@ -115,95 +110,126 @@ class MultiviewSimnet(object):
         self.title_encoders = encoders
 
     def get_correct(self, x, y):
-        less = Cast(LessThan(x, y), dtype='float32')
-        correct = ReduceSum(less)
+        less = tensor.cast(cf.less_than(x, y), dtype='float32')
+        correct = nn.reduce_sum(less)
         return correct
 
     def train_net(self):
         # input fields for query, pos_title, neg_title
-        q_slots = [Data(name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
-                   for i in range(len(self.query_encoders))]
-        pt_slots = [Data(name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
-                    for i in range(len(self.title_encoders))]
-        nt_slots = [Data(name="nt%d" % i, shape=[1], lod_level=1, dtype='int64')
-                    for i in range(len(self.title_encoders))]
+        q_slots = [
+            io.data(
+                name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
+            for i in range(len(self.query_encoders))
+        ]
+        pt_slots = [
+            io.data(
+                name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
+            for i in range(len(self.title_encoders))
+        ]
+        nt_slots = [
+            io.data(
+                name="nt%d" % i, shape=[1], lod_level=1, dtype='int64')
+            for i in range(len(self.title_encoders))
+        ]
 
         # lookup embedding for each slot
-        q_embs = [Embedding(input=query, size=self.emb_shape,
-                            param_attr="emb.w") for query in q_slots]
-        pt_embs = [Embedding(input=title, size=self.emb_shape,
-                             param_attr="emb.w") for title in pt_slots]
-        nt_embs = [Embedding(input=title, size=self.emb_shape,
-                             param_attr="emb.w") for title in nt_slots]
-
+        q_embs = [
+            nn.embedding(
+                input=query, size=self.emb_shape, param_attr="emb.w")
+            for query in q_slots
+        ]
+        pt_embs = [
+            nn.embedding(
+                input=title, size=self.emb_shape, param_attr="emb.w")
+            for title in pt_slots
+        ]
+        nt_embs = [
+            nn.embedding(
+                input=title, size=self.emb_shape, param_attr="emb.w")
+            for title in nt_slots
+        ]
+
         # encode each embedding field with encoder
-        q_encodes = [self.query_encoders[i].forward(emb)
-                     for i, emb in enumerate(q_embs)]
-        pt_encodes = [self.title_encoders[i].forward(emb)
-                      for i, emb in enumerate(pt_embs)]
-        nt_encodes = [self.title_encoders[i].forward(emb)
-                      for i, emb in enumerate(nt_embs)]
+        q_encodes = [
+            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
+        ]
+        pt_encodes = [
+            self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
+        ]
+        nt_encodes = [
+            self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs)
+        ]
 
         # concat multi view for query, pos_title, neg_title
-        q_concat = Concat(q_encodes)
-        pt_concat = Concat(pt_encodes)
-        nt_concat = Concat(nt_encodes)
+        q_concat = nn.concat(q_encodes)
+        pt_concat = nn.concat(pt_encodes)
+        nt_concat = nn.concat(nt_encodes)
 
         # projection of hidden layer
-        q_hid = FC(q_concat, size=self.hidden_size, param_attr='q_fc.w')
-        pt_hid = FC(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
-        nt_hid = FC(nt_concat, size=self.hidden_size, param_attr='t_fc.w')
+        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
+        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
+        nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w')
 
         # cosine of hidden layers
-        cos_pos = Cosine(q_hid, pt_hid)
-        cos_neg = Cosine(q_hid, nt_hid)
-
+        cos_pos = nn.cos_sim(q_hid, pt_hid)
+        cos_neg = nn.cos_sim(q_hid, nt_hid)
+
         # pairwise hinge_loss
-        loss_part1 = ElemSub(FillConstBatch(
-            input=cos_pos,
-            shape=[-1, 1],
-            value=self.margin,
-            dtype='float32'), cos_pos)
-
-        loss_part2 = ElemAdd(loss_part1, cos_neg)
-
-        loss_part3 = ElemMax(FillConstBatch(
-            input=loss_part2,
-            shape=[-1, 1],
-            value=0.0,
-            dtype='float32'), loss_part2)
-
-        avg_cost = Mean(loss_part3)
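+        # the block below evaluates, per example,
+        #   max(0, margin - cos_pos + cos_neg)
+        # and averages it over the batch; fill_constant_batch_size_like
+        # materializes the scalar constants with the runtime batch size.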
+        loss_part1 = nn.elementwise_sub(
+            tensor.fill_constant_batch_size_like(
+                input=cos_pos,
+                shape=[-1, 1],
+                value=self.margin,
+                dtype='float32'),
+            cos_pos)
+
+        loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
+
+        loss_part3 = nn.elementwise_max(
+            tensor.fill_constant_batch_size_like(
+                input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
+            loss_part2)
+
+        avg_cost = nn.mean(loss_part3)
         correct = self.get_correct(cos_pos, cos_neg)
 
         return q_slots + pt_slots + nt_slots, avg_cost, correct
-
-    def pred_net(self,
-                 query_fields,
-                 pos_title_fields,
-                 neg_title_fields):
-        q_slots = [Data(name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
-                   for i in range(len(self.query_encoders))]
-        pt_slots = [Data(name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
-                    for i in range(len(self.title_encoders))]
+
+    def pred_net(self, query_fields, pos_title_fields, neg_title_fields):
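+        # inference graph: the same towers and shared parameters
+        # (emb.w, q_fc.w, t_fc.w) as train_net, but with a single title
+        # branch; returns the cosine score of a (query, title) pair.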
+        q_slots = [
+            io.data(
+                name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
+            for i in range(len(self.query_encoders))
+        ]
+        pt_slots = [
+            io.data(
+                name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
+            for i in range(len(self.title_encoders))
+        ]
         # lookup embedding for each slot
-        q_embs = [Embedding(input=query, size=self.emb_shape,
-                            param_attr="emb.w") for query in q_slots]
-        pt_embs = [Embedding(input=title, size=self.emb_shape,
-                             param_attr="emb.w") for title in pt_slots]
+        q_embs = [
+            nn.embedding(
+                input=query, size=self.emb_shape, param_attr="emb.w")
+            for query in q_slots
+        ]
+        pt_embs = [
+            nn.embedding(
+                input=title, size=self.emb_shape, param_attr="emb.w")
+            for title in pt_slots
+        ]
         # encode each embedding field with encoder
-        q_encodes = [self.query_encoder[i].forward(emb)
-                     for i, emb in enumerate(q_embs)]
-        pt_encodes = [self.title_encoders[i].forward(emb)
-                      for i, emb in enumerate(pt_embs)]
+        q_encodes = [
+            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
+        ]
+        pt_encodes = [
+            self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
+        ]
         # concat multi view for query, pos_title, neg_title
-        q_concat = Concat(q_encodes)
-        pt_concat = Concat(pt_encodes)
+        q_concat = nn.concat(q_encodes)
+        pt_concat = nn.concat(pt_encodes)
         # projection of hidden layer
-        q_hid = FC(q_concat, size=self.hidden_size, param_attr='q_fc.w')
-        pt_hid = FC(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
+        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
+        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
         # cosine of hidden layers
-        cos = Cosine(q_hid, pt_hid)
+        cos = nn.cos_sim(q_hid, pt_hid)
         return cos
-
-
diff --git a/fluid/PaddleRec/multiview-simnet/reader.py b/fluid/PaddleRec/multiview-simnet/reader.py
index b80d0947ebeb2f5e4e8130963bb75b8568af1bab..10b5c4391a57316e526b1f7d3f974d8a5f327d6b 100644
--- a/fluid/PaddleRec/multiview-simnet/reader.py
+++ b/fluid/PaddleRec/multiview-simnet/reader.py
@@ -14,14 +14,14 @@
 
 import random
 
+
 class Dataset:
     def __init__(self):
         pass
 
+
 class SyntheticDataset(Dataset):
-    def __init__(self, sparse_feature_dim,
-                 query_slot_num,
-                 title_slot_num):
+    def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num):
         # ids are randomly generated
         self.ids_per_slot = 10
         self.sparse_feature_dim = sparse_feature_dim
@@ -39,14 +39,17 @@ class SyntheticDataset(Dataset):
         pos_title_slots = []
         neg_title_slots = []
         for i in range(self.query_slot_num):
-            qslot = generate_ids(self.ids_per_slot, self.sparse_feature_dim)
+            qslot = generate_ids(self.ids_per_slot,
+                                 self.sparse_feature_dim)
             query_slots.append(qslot)
         for i in range(self.title_slot_num):
-            pt_slot = generate_ids(self.ids_per_slot, self.sparse_feature_dim)
+            pt_slot = generate_ids(self.ids_per_slot,
+                                   self.sparse_feature_dim)
             pos_title_slots.append(pt_slot)
         if is_train:
             for i in range(self.title_slot_num):
-                nt_slot = generate_ids(self.ids_per_slot, self.sparse_feature_dim)
+                nt_slot = generate_ids(self.ids_per_slot,
+                                       self.sparse_feature_dim)
                 neg_title_slots.append(nt_slot)
             yield query_slots + pos_title_slots + neg_title_slots
         else:
@@ -62,4 +65,3 @@ class SyntheticDataset(Dataset):
 
     def test(self):
         return self._reader_creator(False)
-
diff --git a/fluid/PaddleRec/multiview-simnet/train.py b/fluid/PaddleRec/multiview-simnet/train.py
index fb2f291a1e5aef70e6b826cabdcf412a249df163..70aec67c9ba5dbc8a9048fcac9cfe7f724e0b812 100644
--- a/fluid/PaddleRec/multiview-simnet/train.py
+++ b/fluid/PaddleRec/multiview-simnet/train.py
@@ -20,96 +20,88 @@
 import numpy as np
 import math
 import argparse
 import logging
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
 import paddle.fluid as fluid
 import paddle
 import time
 import reader as reader
 from nets import MultiviewSimnet, SimpleEncoderFactory
 
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(message)s")
+logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)
 
+
 def parse_args():
     parser = argparse.ArgumentParser("multi-view simnet")
-    parser.add_argument("--train_file",
-                        type=str,
-                        help="Training file")
-    parser.add_argument("--valid_file",
-                        type=str,
-                        help="Validation file")
-    parser.add_argument("--epochs",
-                        type=int,
-                        default=10,
-                        help="Number of epochs for training")
-    parser.add_argument("--model_output_dir",
-                        type=str,
-                        default='model_output',
-                        help="Model output folder")
-    parser.add_argument("--query_slots",
-                        type=int,
-                        default=1,
-                        help="Number of query slots")
-    parser.add_argument("--title_slots",
-                        type=int,
-                        default=1,
-                        help="Number of title slots")
-    parser.add_argument("--query_encoder",
-                        type=str,
-                        default="bow",
-                        help="Encoder module for slot encoding")
-    parser.add_argument("--title_encoder",
-                        type=str,
-                        default="bow",
-                        help="Encoder module for slot encoding")
-    parser.add_argument("--query_encode_dim",
-                        type=int,
-                        default=128,
-                        help="Dimension of query encoder output")
-    parser.add_argument("--title_encode_dim",
-                        type=int,
-                        default=128,
-                        help="Dimension of title encoder output")
-    parser.add_argument("--batch_size",
-                        type=int,
-                        default=128,
-                        help="Batch size for training")
-    parser.add_argument("--embedding_dim",
-                        type=int,
-                        default=128,
-                        help="Default Dimension of Embedding")
-    parser.add_argument("--sparse_feature_dim",
-                        type=int,
-                        default=1000001,
-                        help="Sparse feature hashing space"
-                        "for index processing")
-    parser.add_argument("--hidden_size",
-                        type=int,
-                        default=128,
-                        help="Hidden dim")
+    parser.add_argument("--train_file", type=str, help="Training file")
+    parser.add_argument("--valid_file", type=str, help="Validation file")
+    parser.add_argument(
+        "--epochs", type=int, default=10, help="Number of epochs for training")
+    parser.add_argument(
+        "--model_output_dir",
+        type=str,
+        default='model_output',
+        help="Model output folder")
+    parser.add_argument(
+        "--query_slots", type=int, default=1, help="Number of query slots")
+    parser.add_argument(
+        "--title_slots", type=int, default=1, help="Number of title slots")
+    parser.add_argument(
+        "--query_encoder",
+        type=str,
+        default="bow",
+        help="Encoder module for slot encoding")
+    parser.add_argument(
+        "--title_encoder",
+        type=str,
+        default="bow",
+        help="Encoder module for slot encoding")
+    parser.add_argument(
+        "--query_encode_dim",
+        type=int,
+        default=128,
+        help="Dimension of query encoder output")
+    parser.add_argument(
+        "--title_encode_dim",
+        type=int,
+        default=128,
+        help="Dimension of title encoder output")
+    parser.add_argument(
+        "--batch_size", type=int, default=128, help="Batch size for training")
+    parser.add_argument(
+        "--embedding_dim",
+        type=int,
+        default=128,
+        help="Default Dimension of Embedding")
+    parser.add_argument(
+        "--sparse_feature_dim",
+        type=int,
+        default=1000001,
+        help="Sparse feature hashing space "
+        "for index processing")
+    parser.add_argument(
+        "--hidden_size", type=int, default=128, help="Hidden dim")
     return parser.parse_args()
 
+
 def start_train(args):
-    dataset = reader.SyntheticDataset(args.sparse_feature_dim,
-                                      args.query_slots,
+    dataset = reader.SyntheticDataset(args.sparse_feature_dim, args.query_slots,
                                       args.title_slots)
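+    # paddle.reader.shuffle keeps a buffer of batch_size * 100 samples and
+    # yields them in random order; paddle.batch then groups the shuffled
+    # stream into mini-batches of batch_size.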
     train_reader = paddle.batch(
         paddle.reader.shuffle(
-            dataset.train(),
-            buf_size=args.batch_size * 100),
+            dataset.train(), buf_size=args.batch_size * 100),
         batch_size=args.batch_size)
     place = fluid.CPUPlace()
     factory = SimpleEncoderFactory()
-    query_encoders = [factory.create(args.query_encoder,
-                                     args.query_encode_dim)
-                      for i in range(args.query_slots)]
-    title_encoders = [factory.create(args.title_encoder,
-                                     args.title_encode_dim)
-                      for i in range(args.title_slots)]
-    m_simnet = MultiviewSimnet(args.sparse_feature_dim,
-                               args.embedding_dim,
+    query_encoders = [
+        factory.create(args.query_encoder, args.query_encode_dim)
+        for i in range(args.query_slots)
+    ]
+    title_encoders = [
+        factory.create(args.title_encoder, args.title_encode_dim)
+        for i in range(args.title_slots)
+    ]
+    m_simnet = MultiviewSimnet(args.sparse_feature_dim, args.embedding_dim,
                                args.hidden_size)
     m_simnet.set_query_encoder(query_encoders)
    m_simnet.set_title_encoder(title_encoders)
@@ -125,20 +117,21 @@ def start_train(args):
 
     for pass_id in range(args.epochs):
         for batch_id, data in enumerate(train_reader()):
-            loss_val, correct_val = exe.run(
-                loop_program, feed=feeder.feed(data),
-                fetch_list=[avg_cost, correct])
+            loss_val, correct_val = exe.run(loop_program,
+                                            feed=feeder.feed(data),
+                                            fetch_list=[avg_cost, correct])
             logger.info("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
-                    .format(pass_id, batch_id, loss_val,
+                        .format(pass_id, batch_id, loss_val,
                             float(correct_val) / args.batch_size))
-    fluid.io.save_inference_model(args.model_output_dir, 
+    fluid.io.save_inference_model(args.model_output_dir,
                                   [var.name for var in all_slots],
-                                  [avg_cost, correct],
-                                  exe)
+                                  [avg_cost, correct], exe)
+
 
 def main():
     args = parse_args()
     start_train(args)
 
+
 if __name__ == "__main__":
     main()
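+
+# A minimal sketch (an assumption, not exercised by this change) of how the
+# saved model could be reloaded for inference:
+#
+#     exe = fluid.Executor(fluid.CPUPlace())
+#     program, feed_names, fetch_targets = fluid.io.load_inference_model(
+#         'model_output', exe)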