From c1d55974c15a4c25b200aabd91cd9b60a261b9f5 Mon Sep 17 00:00:00 2001 From: zhang wenhui Date: Tue, 15 Oct 2019 11:00:18 +0800 Subject: [PATCH] update api in PaddleRec, test=release/1.6 (#3573) --- PaddleRec/gru4rec/README.md | 4 ++ PaddleRec/gru4rec/infer.py | 1 + PaddleRec/gru4rec/infer_sample_neg.py | 1 + PaddleRec/gru4rec/net.py | 59 +++++++++++++++------------ PaddleRec/gru4rec/train.py | 1 + PaddleRec/gru4rec/train_sample_neg.py | 1 + PaddleRec/gru4rec/utils.py | 16 +++++++- PaddleRec/multiview_simnet/README.md | 3 ++ PaddleRec/multiview_simnet/infer.py | 18 ++++++++ PaddleRec/multiview_simnet/nets.py | 30 +++++++------- PaddleRec/multiview_simnet/train.py | 16 ++++++++ PaddleRec/ssr/README.md | 4 ++ PaddleRec/ssr/infer.py | 1 + PaddleRec/ssr/nets.py | 16 ++++---- PaddleRec/ssr/train.py | 1 + PaddleRec/ssr/utils.py | 18 +++++++- PaddleRec/tagspace/README.md | 1 + PaddleRec/tagspace/infer.py | 1 + PaddleRec/tagspace/net.py | 17 ++++---- PaddleRec/tagspace/train.py | 1 + PaddleRec/tagspace/utils.py | 15 +++++++ PaddleRec/word2vec/README.md | 9 ++-- PaddleRec/word2vec/infer.py | 17 ++++---- PaddleRec/word2vec/net.py | 54 ++++++++++++------------ PaddleRec/word2vec/train.py | 1 + PaddleRec/word2vec/utils.py | 16 +++++++- 26 files changed, 221 insertions(+), 101 deletions(-) diff --git a/PaddleRec/gru4rec/README.md b/PaddleRec/gru4rec/README.md index 35378156..afff1f06 100644 --- a/PaddleRec/gru4rec/README.md +++ b/PaddleRec/gru4rec/README.md @@ -44,6 +44,10 @@ session-based推荐应用场景非常广泛,比如用户的商品浏览、新 运行样例程序可跳过'RSC15 数据下载及预处理'部分 + + +**要求使用PaddlePaddle 1.6及以上版本或适当的develop版本。** + ## RSC15 数据下载及预处理 运行命令 下载RSC15官网数据集 diff --git a/PaddleRec/gru4rec/infer.py b/PaddleRec/gru4rec/infer.py index bc459c28..032205cf 100644 --- a/PaddleRec/gru4rec/infer.py +++ b/PaddleRec/gru4rec/infer.py @@ -71,6 +71,7 @@ def infer(test_reader, use_cuda, model_path): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/gru4rec/infer_sample_neg.py b/PaddleRec/gru4rec/infer_sample_neg.py index 48458e82..b77f3685 100644 --- a/PaddleRec/gru4rec/infer_sample_neg.py +++ b/PaddleRec/gru4rec/infer_sample_neg.py @@ -84,6 +84,7 @@ def infer(args, vocab_size, test_reader, use_cuda): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/gru4rec/net.py b/PaddleRec/gru4rec/net.py index 6a715443..f0496434 100644 --- a/PaddleRec/gru4rec/net.py +++ b/PaddleRec/gru4rec/net.py @@ -10,12 +10,12 @@ def all_vocab_network(vocab_size, gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src_wordseq = fluid.layers.data( - name="src_wordseq", shape=[1], dtype="int64", lod_level=1) - dst_wordseq = fluid.layers.data( - name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + src_wordseq = fluid.data( + name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1) + dst_wordseq = fluid.data( + name="dst_wordseq", shape=[None, 1], dtype="int64", lod_level=1) - emb = fluid.layers.embedding( + emb = fluid.embedding( input=src_wordseq, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( @@ -56,19 +56,21 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data( - name="label", shape=[neg_size + 1], dtype="int64", lod_level=1) + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) + label = fluid.data( + name="label", shape=[None, neg_size + 1], dtype="int64", lod_level=1) - emb_src = fluid.layers.embedding( + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_src = fluid.layers.squeeze(input=emb_src, axes=[1]) + emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out) @@ -90,7 +92,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out) label_re = fluid.layers.sequence_reshape(input=label, new_dim=1) - emb_label = fluid.layers.embedding( + emb_label1 = fluid.embedding( input=label_re, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( @@ -98,6 +100,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_label = fluid.layers.squeeze(input=emb_label1, axes=[1]) emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out) gru_exp = fluid.layers.expand( @@ -120,19 +123,20 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data( - name="label", shape=[neg_size + 1], dtype="int64", lod_level=1) + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) + label = fluid.data( + name="label", shape=[None, neg_size + 1], dtype="int64", lod_level=1) - emb_src = fluid.layers.embedding( + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_src = fluid.layers.squeeze(input=emb_src, axes=[1]) emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out) @@ -154,13 +158,14 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out) label_re = fluid.layers.sequence_reshape(input=label, new_dim=1) - emb_label = fluid.layers.embedding( + emb_label1 = fluid.embedding( input=label_re, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_label = fluid.layers.squeeze(input=emb_label1, axes=[1]) emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out) @@ -180,8 +185,8 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): def infer_network(vocab_size, batch_size, hid_size, dropout=0.2): - src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - emb_src = fluid.layers.embedding( + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr="emb") emb_src_drop = fluid.layers.dropout( emb_src, dropout_prob=dropout, is_test=True) @@ -198,20 +203,20 @@ def infer_network(vocab_size, batch_size, hid_size, dropout=0.2): gru_h0_drop = fluid.layers.dropout( gru_h0, dropout_prob=dropout, is_test=True) - all_label = fluid.layers.data( + all_label = fluid.data( name="all_label", shape=[vocab_size, 1], - dtype="int64", - append_batch_size=False) - emb_all_label = fluid.layers.embedding( + dtype="int64") + emb_all_label = fluid.embedding( input=all_label, size=[vocab_size, hid_size], param_attr="emb") + emb_all_label = fluid.layers.squeeze(input=emb_all_label, axes=[1]) emb_all_label_drop = fluid.layers.dropout( emb_all_label, dropout_prob=dropout, is_test=True) all_pre = fluid.layers.matmul( gru_h0_drop, emb_all_label_drop, transpose_y=True) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) acc = fluid.layers.accuracy(input=all_pre, label=pos_label, k=20) return acc diff --git a/PaddleRec/gru4rec/train.py b/PaddleRec/gru4rec/train.py index b43926b6..d7124eba 100644 --- a/PaddleRec/gru4rec/train.py +++ b/PaddleRec/gru4rec/train.py @@ -169,4 +169,5 @@ def get_device(args): if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/gru4rec/train_sample_neg.py b/PaddleRec/gru4rec/train_sample_neg.py index 26424520..fbb68705 100644 --- a/PaddleRec/gru4rec/train_sample_neg.py +++ b/PaddleRec/gru4rec/train_sample_neg.py @@ -128,4 +128,5 @@ def train(): if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/gru4rec/utils.py b/PaddleRec/gru4rec/utils.py index 1cd6a313..ffd05fc5 100644 --- a/PaddleRec/gru4rec/utils.py +++ b/PaddleRec/gru4rec/utils.py @@ -110,11 +110,25 @@ def prepare_data(file_dir, batch_size * 20) else: vocab_size = get_vocab_size(vocab_path) - reader = paddle.batch( + reader = fluid.io.batch( test( file_dir, buffer_size, data_type=DataType.SEQ), batch_size) return vocab_size, reader +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) def sort_batch(reader, batch_size, sort_group_size, drop_last=False): """ diff --git a/PaddleRec/multiview_simnet/README.md b/PaddleRec/multiview_simnet/README.md index 31d6fd54..d9cf7e10 100644 --- a/PaddleRec/multiview_simnet/README.md +++ b/PaddleRec/multiview_simnet/README.md @@ -3,6 +3,9 @@ ## Introduction In personalized recommendation scenario, a user often is provided with several items from personalized interest matching model. In real world application, a user may have multiple views of features, say user-id, age, click-history of items, search queries. A item, e.g. news, may also have multiple views of features like news title, news category, images in news and so on. Multi-view Simnet is matching a model that combine users' and items' multiple views of features into one unified model. The model can be used in many industrial product like Baidu's feed news. The model is adapted from the paper A Multi-View Deep Learning(MV-DNN) Approach for Cross Domain User Modeling in Recommendation Systems, WWW 2015. The difference between our model and the MV-DNN is that we also consider multiple feature views of users. +**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.** + + ## Dataset Currently, synthetic dataset is provided for proof of concept and we aim to add more real world dataset in this project in the future. The result is inaccurate because of synthetic dataset. diff --git a/PaddleRec/multiview_simnet/infer.py b/PaddleRec/multiview_simnet/infer.py index 7b5bb080..2e871d9e 100644 --- a/PaddleRec/multiview_simnet/infer.py +++ b/PaddleRec/multiview_simnet/infer.py @@ -31,6 +31,23 @@ logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + + + def parse_args(): parser = argparse.ArgumentParser("multi-view simnet") parser.add_argument("--train_file", type=str, help="Training file") @@ -116,4 +133,5 @@ def main(): if __name__ == "__main__": + check_version() main() diff --git a/PaddleRec/multiview_simnet/nets.py b/PaddleRec/multiview_simnet/nets.py index fed17784..d7813d8d 100644 --- a/PaddleRec/multiview_simnet/nets.py +++ b/PaddleRec/multiview_simnet/nets.py @@ -125,34 +125,34 @@ class MultiviewSimnet(object): def train_net(self): # input fields for query, pos_title, neg_title q_slots = [ - io.data( - name="q%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="q%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.query_encoders)) ] pt_slots = [ - io.data( - name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="pt%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.title_encoders)) ] nt_slots = [ - io.data( - name="nt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="nt%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.title_encoders)) ] # lookup embedding for each slot q_embs = [ - nn.embedding( + fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") for query in q_slots ] pt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in pt_slots ] nt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in nt_slots ] @@ -205,23 +205,23 @@ class MultiviewSimnet(object): def pred_net(self, query_fields, pos_title_fields, neg_title_fields): q_slots = [ - io.data( - name="q%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="q%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.query_encoders)) ] pt_slots = [ - io.data( - name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="pt%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.title_encoders)) ] # lookup embedding for each slot q_embs = [ - nn.embedding( + fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") for query in q_slots ] pt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in pt_slots ] diff --git a/PaddleRec/multiview_simnet/train.py b/PaddleRec/multiview_simnet/train.py index f098fd10..d7920083 100644 --- a/PaddleRec/multiview_simnet/train.py +++ b/PaddleRec/multiview_simnet/train.py @@ -88,6 +88,21 @@ def parse_args(): return parser.parse_args() +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + def start_train(args): if args.enable_ce: SEED = 102 @@ -170,4 +185,5 @@ def main(): if __name__ == "__main__": + check_version() main() diff --git a/PaddleRec/ssr/README.md b/PaddleRec/ssr/README.md index d0b4dfb4..6ded50b9 100644 --- a/PaddleRec/ssr/README.md +++ b/PaddleRec/ssr/README.md @@ -12,6 +12,10 @@ Sequence Semantic Retrieval(SSR) Model shares the similar idea with Multi-Rate D - The idea of SSR is to model a user's personalized interest of an item through matching model structure, and the representation of a news item can be computed online even the news item does not exist in training dataset. - With the representation of news items, we are able to build an vector indexing service online for news prediction and this is the retrieval part of SSR. +## Version +**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.** + + ## Dataset Dataset preprocessing follows the method of [GRU4Rec Project](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec). Note that you should reuse scripts from GRU4Rec project for data preprocessing. diff --git a/PaddleRec/ssr/infer.py b/PaddleRec/ssr/infer.py index 09dee039..3a44fad7 100644 --- a/PaddleRec/ssr/infer.py +++ b/PaddleRec/ssr/infer.py @@ -120,6 +120,7 @@ def infer(args, vocab_size, test_reader): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/ssr/nets.py b/PaddleRec/ssr/nets.py index 4df23573..0e853429 100644 --- a/PaddleRec/ssr/nets.py +++ b/PaddleRec/ssr/nets.py @@ -86,16 +86,16 @@ class SequenceSemanticRetrieval(object): return correct def train(self): - user_data = io.data(name="user", shape=[1], dtype="int64", lod_level=1) - pos_item_data = io.data( - name="p_item", shape=[1], dtype="int64", lod_level=1) - neg_item_data = io.data( - name="n_item", shape=[1], dtype="int64", lod_level=1) - user_emb = nn.embedding( + user_data = fluid.data(name="user", shape=[None, 1], dtype="int64", lod_level=1) + pos_item_data = fluid.data( + name="p_item", shape=[None, 1], dtype="int64", lod_level=1) + neg_item_data = fluid.data( + name="n_item", shape=[None, 1], dtype="int64", lod_level=1) + user_emb = fluid.embedding( input=user_data, size=self.emb_shape, param_attr="emb.item") - pos_item_emb = nn.embedding( + pos_item_emb = fluid.embedding( input=pos_item_data, size=self.emb_shape, param_attr="emb.item") - neg_item_emb = nn.embedding( + neg_item_emb = fluid.embedding( input=neg_item_data, size=self.emb_shape, param_attr="emb.item") user_enc = self.user_encoder.forward(user_emb) pos_item_enc = self.item_encoder.forward(pos_item_emb) diff --git a/PaddleRec/ssr/train.py b/PaddleRec/ssr/train.py index 1c0c9f8c..3c441039 100644 --- a/PaddleRec/ssr/train.py +++ b/PaddleRec/ssr/train.py @@ -165,4 +165,5 @@ def main(): if __name__ == "__main__": + utils.check_version() main() diff --git a/PaddleRec/ssr/utils.py b/PaddleRec/ssr/utils.py index 4fe9ef47..353cf336 100644 --- a/PaddleRec/ssr/utils.py +++ b/PaddleRec/ssr/utils.py @@ -16,7 +16,7 @@ def construct_train_data(file_dir, vocab_path, batch_size): vocab_size = get_vocab_size(vocab_path) files = [file_dir + '/' + f for f in os.listdir(file_dir)] y_data = reader.YoochooseDataset(vocab_size) - train_reader = paddle.batch( + train_reader = fluid.io.batch( paddle.reader.shuffle( y_data.train(files), buf_size=batch_size * 100), batch_size=batch_size) @@ -27,9 +27,23 @@ def construct_test_data(file_dir, vocab_path, batch_size): vocab_size = get_vocab_size(vocab_path) files = [file_dir + '/' + f for f in os.listdir(file_dir)] y_data = reader.YoochooseDataset(vocab_size) - test_reader = paddle.batch(y_data.test(files), batch_size=batch_size) + test_reader = fluid.io.batch(y_data.test(files), batch_size=batch_size) return test_reader, vocab_size +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) def infer_data(raw_data, place): data = [dat[0] for dat in raw_data] diff --git a/PaddleRec/tagspace/README.md b/PaddleRec/tagspace/README.md index 4263065b..818ab0cd 100644 --- a/PaddleRec/tagspace/README.md +++ b/PaddleRec/tagspace/README.md @@ -26,6 +26,7 @@ TagSpace模型的介绍可以参阅论文[#TagSpace: Semantic Embeddings from Ha Tagspace模型学习文本及标签的embedding表示,应用于工业级的标签推荐,具体应用场景有feed新闻标签推荐。 +**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.** ## 数据下载及预处理 diff --git a/PaddleRec/tagspace/infer.py b/PaddleRec/tagspace/infer.py index e8522b09..66412fc5 100644 --- a/PaddleRec/tagspace/infer.py +++ b/PaddleRec/tagspace/infer.py @@ -71,6 +71,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path, epoch): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/tagspace/net.py b/PaddleRec/tagspace/net.py index 797ae634..8a48ab5a 100644 --- a/PaddleRec/tagspace/net.py +++ b/PaddleRec/tagspace/net.py @@ -2,19 +2,22 @@ import paddle.fluid as fluid import paddle.fluid.layers.nn as nn import paddle.fluid.layers.tensor as tensor import paddle.fluid.layers.control_flow as cf -import paddle.fluid.layers.io as io def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000, win_size=5, margin=0.1, neg_size=5): """ network definition """ - text = io.data(name="text", shape=[1], lod_level=1, dtype='int64') - pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64') - neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64') - text_emb = nn.embedding( + text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64') + pos_tag = fluid.data(name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64') + neg_tag = fluid.data(name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64') + text_emb = fluid.embedding( input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb") - pos_tag_emb = nn.embedding( + text_emb = fluid.layers.squeeze(input=text_emb, axes=[1]) + pos_tag_emb = fluid.embedding( input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") - neg_tag_emb = nn.embedding( + pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1]) + neg_tag_emb = fluid.embedding( input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") + neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1]) + conv_1d = fluid.nets.sequence_conv_pool( input=text_emb, diff --git a/PaddleRec/tagspace/train.py b/PaddleRec/tagspace/train.py index 419bb1c4..2230407e 100644 --- a/PaddleRec/tagspace/train.py +++ b/PaddleRec/tagspace/train.py @@ -168,4 +168,5 @@ def get_device(args): if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/tagspace/utils.py b/PaddleRec/tagspace/utils.py index f5b7e647..80b77c8e 100644 --- a/PaddleRec/tagspace/utils.py +++ b/PaddleRec/tagspace/utils.py @@ -29,6 +29,21 @@ def get_vocab_size(vocab_path): line = rf.readline() return int(line.strip()) +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + def prepare_data(file_dir, vocab_text_path, diff --git a/PaddleRec/word2vec/README.md b/PaddleRec/word2vec/README.md index 35d749b5..e8f99428 100644 --- a/PaddleRec/word2vec/README.md +++ b/PaddleRec/word2vec/README.md @@ -20,6 +20,7 @@ ## 介绍 本例实现了skip-gram模式的word2vector模型。 +**目前模型库下模型均要求使用PaddlePaddle 1.6及以上版本或适当的develop版本。** ## 数据下载 全量数据集使用的是来自1 Billion Word Language Model Benchmark的(http://www.statmt.org/lm-benchmark) 的数据集. @@ -35,7 +36,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok ```bash mkdir data -wget https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ data/ ``` @@ -44,7 +45,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok ```bash mkdir data -wget https://paddlerec.bj.bcebos.com/word2vec/text.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/text.tar tar xvf text.tar mv text data/ ``` @@ -105,9 +106,9 @@ sh cluster_train.sh ```bash #全量数据集测试集 -wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar #样本数据集测试集 -wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar ``` 预测命令,注意词典名称需要加后缀"_word_to_id_", 此文件是预处理阶段生成的。 diff --git a/PaddleRec/word2vec/infer.py b/PaddleRec/word2vec/infer.py index 1b329002..36357dd6 100644 --- a/PaddleRec/word2vec/infer.py +++ b/PaddleRec/word2vec/infer.py @@ -78,13 +78,13 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w): b_size = len([dat[0] for dat in data]) wa = np.array( [dat[0] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) wb = np.array( [dat[1] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) wc = np.array( [dat[2] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) label = [dat[3] for dat in data] input_word = [dat[4] for dat in data] @@ -95,7 +95,7 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w): "analogy_c": wc, "all_label": np.arange(vocab_size).reshape( - vocab_size, 1).astype("int64"), + vocab_size).astype("int64"), }, fetch_list=[pred.name, values], return_numpy=False) @@ -145,13 +145,13 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): b_size = len([dat[0] for dat in data]) wa = np.array( [dat[0] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) wb = np.array( [dat[1] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) wc = np.array( [dat[2] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) label = [dat[3] for dat in data] input_word = [dat[4] for dat in data] @@ -162,7 +162,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): "analogy_b": wb, "analogy_c": wc, "all_label": - np.arange(vocab_size).reshape(vocab_size, 1), + np.arange(vocab_size).reshape(vocab_size), }, fetch_list=[pred.name, values], return_numpy=False) @@ -185,6 +185,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/word2vec/net.py b/PaddleRec/word2vec/net.py index ab2abbc7..b20b88fd 100644 --- a/PaddleRec/word2vec/net.py +++ b/PaddleRec/word2vec/net.py @@ -23,10 +23,10 @@ import paddle.fluid as fluid def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): datas = [] - input_word = fluid.layers.data(name="input_word", shape=[1], dtype='int64') - true_word = fluid.layers.data(name='true_label', shape=[1], dtype='int64') - neg_word = fluid.layers.data( - name="neg_label", shape=[neg_num], dtype='int64') + input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64') + true_word = fluid.data(name='true_label', shape=[None, 1], dtype='int64') + neg_word = fluid.data( + name="neg_label", shape=[None, neg_num], dtype='int64') datas.append(input_word) datas.append(true_word) @@ -37,7 +37,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): words = fluid.layers.read_file(py_reader) init_width = 0.5 / embedding_size - input_emb = fluid.layers.embedding( + input_emb = fluid.embedding( input=words[0], is_sparse=is_sparse, size=[dict_size, embedding_size], @@ -45,38 +45,37 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): name='emb', initializer=fluid.initializer.Uniform(-init_width, init_width))) - true_emb_w = fluid.layers.embedding( + true_emb_w = fluid.embedding( input=words[1], is_sparse=is_sparse, size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( name='emb_w', initializer=fluid.initializer.Constant(value=0.0))) - true_emb_b = fluid.layers.embedding( + true_emb_b = fluid.embedding( input=words[1], is_sparse=is_sparse, size=[dict_size, 1], param_attr=fluid.ParamAttr( name='emb_b', initializer=fluid.initializer.Constant(value=0.0))) - neg_word_reshape = fluid.layers.reshape(words[2], shape=[-1, 1]) - neg_word_reshape.stop_gradient = True + input_emb = fluid.layers.squeeze(input=input_emb, axes=[1]) + true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1]) + true_emb_b = fluid.layers.squeeze(input=true_emb_b, axes=[1]) - neg_emb_w = fluid.layers.embedding( - input=neg_word_reshape, + neg_emb_w = fluid.embedding( + input=words[2], is_sparse=is_sparse, size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( name='emb_w', learning_rate=1.0)) - - neg_emb_w_re = fluid.layers.reshape( - neg_emb_w, shape=[-1, neg_num, embedding_size]) - neg_emb_b = fluid.layers.embedding( - input=neg_word_reshape, + neg_emb_b = fluid.embedding( + input=words[2], is_sparse=is_sparse, size=[dict_size, 1], param_attr=fluid.ParamAttr( name='emb_b', learning_rate=1.0)) + neg_emb_b_vec = fluid.layers.reshape(neg_emb_b, shape=[-1, neg_num]) true_logits = fluid.layers.elementwise_add( fluid.layers.reduce_sum( @@ -87,7 +86,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): input_emb_re = fluid.layers.reshape( input_emb, shape=[-1, 1, embedding_size]) neg_matmul = fluid.layers.matmul( - input_emb_re, neg_emb_w_re, transpose_y=True) + input_emb_re, neg_emb_w, transpose_y=True) neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]) neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec) #nce loss @@ -111,22 +110,21 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): def infer_network(vocab_size, emb_size): - analogy_a = fluid.layers.data(name="analogy_a", shape=[1], dtype='int64') - analogy_b = fluid.layers.data(name="analogy_b", shape=[1], dtype='int64') - analogy_c = fluid.layers.data(name="analogy_c", shape=[1], dtype='int64') - all_label = fluid.layers.data( + analogy_a = fluid.data(name="analogy_a", shape=[None], dtype='int64') + analogy_b = fluid.data(name="analogy_b", shape=[None], dtype='int64') + analogy_c = fluid.data(name="analogy_c", shape=[None], dtype='int64') + all_label = fluid.data( name="all_label", - shape=[vocab_size, 1], - dtype='int64', - append_batch_size=False) - emb_all_label = fluid.layers.embedding( + shape=[vocab_size], + dtype='int64') + emb_all_label = fluid.embedding( input=all_label, size=[vocab_size, emb_size], param_attr="emb") - emb_a = fluid.layers.embedding( + emb_a = fluid.embedding( input=analogy_a, size=[vocab_size, emb_size], param_attr="emb") - emb_b = fluid.layers.embedding( + emb_b = fluid.embedding( input=analogy_b, size=[vocab_size, emb_size], param_attr="emb") - emb_c = fluid.layers.embedding( + emb_c = fluid.embedding( input=analogy_c, size=[vocab_size, emb_size], param_attr="emb") target = fluid.layers.elementwise_add( fluid.layers.elementwise_sub(emb_b, emb_a), emb_c) diff --git a/PaddleRec/word2vec/train.py b/PaddleRec/word2vec/train.py index 430ec132..bcc99654 100644 --- a/PaddleRec/word2vec/train.py +++ b/PaddleRec/word2vec/train.py @@ -224,5 +224,6 @@ def train(args): if __name__ == '__main__': + utils.check_version() args = parse_args() train(args) diff --git a/PaddleRec/word2vec/utils.py b/PaddleRec/word2vec/utils.py index 01cd04e4..0d173005 100644 --- a/PaddleRec/word2vec/utils.py +++ b/PaddleRec/word2vec/utils.py @@ -22,9 +22,23 @@ def BuildWord_IdMap(dict_path): def prepare_data(file_dir, dict_path, batch_size): w2i, i2w = BuildWord_IdMap(dict_path) vocab_size = len(i2w) - reader = paddle.batch(test(file_dir, w2i), batch_size) + reader = fluid.io.batch(test(file_dir, w2i), batch_size) return vocab_size, reader, i2w +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) def native_to_unicode(s): if _is_unicode(s): -- GitLab