diff --git a/PaddleRec/gru4rec/README.md b/PaddleRec/gru4rec/README.md index 353781567f7012996199e51169233b306cd18722..afff1f06d074a598edb7da48bc7e31612eb27942 100644 --- a/PaddleRec/gru4rec/README.md +++ b/PaddleRec/gru4rec/README.md @@ -44,6 +44,10 @@ session-based推荐应用场景非常广泛,比如用户的商品浏览、新 运行样例程序可跳过'RSC15 数据下载及预处理'部分 + + +**要求使用PaddlePaddle 1.6及以上版本或适当的develop版本。** + ## RSC15 数据下载及预处理 运行命令 下载RSC15官网数据集 diff --git a/PaddleRec/gru4rec/infer.py b/PaddleRec/gru4rec/infer.py index bc459c28a9b24761b202dc5d8110d583322abdeb..032205cf7b6f9cc1015583e13a29c2361f889897 100644 --- a/PaddleRec/gru4rec/infer.py +++ b/PaddleRec/gru4rec/infer.py @@ -71,6 +71,7 @@ def infer(test_reader, use_cuda, model_path): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/gru4rec/infer_sample_neg.py b/PaddleRec/gru4rec/infer_sample_neg.py index 48458e82b4fe2bbc7141c3e45469b8414d87ece4..b77f3685576e129926a8529d99adbc06185acd91 100644 --- a/PaddleRec/gru4rec/infer_sample_neg.py +++ b/PaddleRec/gru4rec/infer_sample_neg.py @@ -84,6 +84,7 @@ def infer(args, vocab_size, test_reader, use_cuda): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/gru4rec/net.py b/PaddleRec/gru4rec/net.py index 6a715443ff1e72ae77aba51d5eaffe4eefee9687..f049643402c38c7f5501c3e7965fca262310bb6f 100644 --- a/PaddleRec/gru4rec/net.py +++ b/PaddleRec/gru4rec/net.py @@ -10,12 +10,12 @@ def all_vocab_network(vocab_size, gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src_wordseq = fluid.layers.data( - name="src_wordseq", shape=[1], dtype="int64", lod_level=1) - dst_wordseq = fluid.layers.data( - name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + src_wordseq = fluid.data( + name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1) + dst_wordseq = fluid.data( + name="dst_wordseq", shape=[None, 1], dtype="int64", 
lod_level=1) - emb = fluid.layers.embedding( + emb = fluid.embedding( input=src_wordseq, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( @@ -56,19 +56,21 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data( - name="label", shape=[neg_size + 1], dtype="int64", lod_level=1) + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) + label = fluid.data( + name="label", shape=[None, neg_size + 1], dtype="int64", lod_level=1) - emb_src = fluid.layers.embedding( + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_src = fluid.layers.squeeze(input=emb_src, axes=[1]) + emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out) @@ -90,7 +92,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out) label_re = fluid.layers.sequence_reshape(input=label, new_dim=1) - emb_label = fluid.layers.embedding( + emb_label1 = fluid.embedding( input=label_re, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( @@ -98,6 +100,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_label = fluid.layers.squeeze(input=emb_label1, axes=[1]) emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out) gru_exp = fluid.layers.expand( @@ -120,19 +123,20 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src = 
fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data( - name="label", shape=[neg_size + 1], dtype="int64", lod_level=1) + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) + label = fluid.data( + name="label", shape=[None, neg_size + 1], dtype="int64", lod_level=1) - emb_src = fluid.layers.embedding( + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_src = fluid.layers.squeeze(input=emb_src, axes=[1]) emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out) @@ -154,13 +158,14 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out) label_re = fluid.layers.sequence_reshape(input=label, new_dim=1) - emb_label = fluid.layers.embedding( + emb_label1 = fluid.embedding( input=label_re, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_label = fluid.layers.squeeze(input=emb_label1, axes=[1]) emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out) @@ -180,8 +185,8 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): def infer_network(vocab_size, batch_size, hid_size, dropout=0.2): - src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - emb_src = fluid.layers.embedding( + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr="emb") emb_src_drop = fluid.layers.dropout( emb_src, dropout_prob=dropout, is_test=True) @@ -198,20 
+203,20 @@ def infer_network(vocab_size, batch_size, hid_size, dropout=0.2): gru_h0_drop = fluid.layers.dropout( gru_h0, dropout_prob=dropout, is_test=True) - all_label = fluid.layers.data( + all_label = fluid.data( name="all_label", shape=[vocab_size, 1], - dtype="int64", - append_batch_size=False) - emb_all_label = fluid.layers.embedding( + dtype="int64") + emb_all_label = fluid.embedding( input=all_label, size=[vocab_size, hid_size], param_attr="emb") + emb_all_label = fluid.layers.squeeze(input=emb_all_label, axes=[1]) emb_all_label_drop = fluid.layers.dropout( emb_all_label, dropout_prob=dropout, is_test=True) all_pre = fluid.layers.matmul( gru_h0_drop, emb_all_label_drop, transpose_y=True) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) acc = fluid.layers.accuracy(input=all_pre, label=pos_label, k=20) return acc diff --git a/PaddleRec/gru4rec/train.py b/PaddleRec/gru4rec/train.py index b43926b69eaf002380a261a0689be91ec3f6ff90..d7124eba1e0a53cb9ba3a53b468655f16db1e292 100644 --- a/PaddleRec/gru4rec/train.py +++ b/PaddleRec/gru4rec/train.py @@ -169,4 +169,5 @@ def get_device(args): if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/gru4rec/train_sample_neg.py b/PaddleRec/gru4rec/train_sample_neg.py index 2642452024810fe16cfa1154e273febdb1d63254..fbb687052771fd6ca642fda7637c103e120136ff 100644 --- a/PaddleRec/gru4rec/train_sample_neg.py +++ b/PaddleRec/gru4rec/train_sample_neg.py @@ -128,4 +128,5 @@ def train(): if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/gru4rec/utils.py b/PaddleRec/gru4rec/utils.py index 1cd6a313b2a5097b16c473722737e0e6936f4e31..ffd05fc582b6a29b7c09c8bafb519ed0787b7685 100644 --- a/PaddleRec/gru4rec/utils.py +++ b/PaddleRec/gru4rec/utils.py @@ -110,11 +110,25 @@ def prepare_data(file_dir, batch_size * 20) else: vocab_size = 
get_vocab_size(vocab_path) - reader = paddle.batch( + reader = fluid.io.batch( test( file_dir, buffer_size, data_type=DataType.SEQ), batch_size) return vocab_size, reader +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) def sort_batch(reader, batch_size, sort_group_size, drop_last=False): """ diff --git a/PaddleRec/multiview_simnet/README.md b/PaddleRec/multiview_simnet/README.md index 31d6fd5436fc6a8c93a0cddc196b6088d198de96..d9cf7e10edb316a9981a782abb580efda139a877 100644 --- a/PaddleRec/multiview_simnet/README.md +++ b/PaddleRec/multiview_simnet/README.md @@ -3,6 +3,9 @@ ## Introduction In personalized recommendation scenario, a user often is provided with several items from personalized interest matching model. In real world application, a user may have multiple views of features, say user-id, age, click-history of items, search queries. A item, e.g. news, may also have multiple views of features like news title, news category, images in news and so on. Multi-view Simnet is matching a model that combine users' and items' multiple views of features into one unified model. The model can be used in many industrial product like Baidu's feed news. The model is adapted from the paper A Multi-View Deep Learning(MV-DNN) Approach for Cross Domain User Modeling in Recommendation Systems, WWW 2015. The difference between our model and the MV-DNN is that we also consider multiple feature views of users. 
+**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or a suitable develop version.** + + ## Dataset Currently, synthetic dataset is provided for proof of concept and we aim to add more real world dataset in this project in the future. The result is inaccurate because of synthetic dataset. diff --git a/PaddleRec/multiview_simnet/infer.py b/PaddleRec/multiview_simnet/infer.py index 7b5bb080d278ba5fffbe678841037b71b02b3069..2e871d9eddf1d4e69e44e4e9c7a094b3fe567c91 100644 --- a/PaddleRec/multiview_simnet/infer.py +++ b/PaddleRec/multiview_simnet/infer.py @@ -31,6 +31,23 @@ logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + + + def parse_args(): parser = argparse.ArgumentParser("multi-view simnet") parser.add_argument("--train_file", type=str, help="Training file") @@ -116,4 +133,5 @@ def main(): if __name__ == "__main__": + check_version() main() diff --git a/PaddleRec/multiview_simnet/nets.py b/PaddleRec/multiview_simnet/nets.py index fed177844bdd247d163aee9e8625cd0ec74378b3..d7813d8d002b0b5e93ab9519479741a4299b96ff 100644 --- a/PaddleRec/multiview_simnet/nets.py +++ b/PaddleRec/multiview_simnet/nets.py @@ -125,34 +125,34 @@ class MultiviewSimnet(object): def train_net(self): # input fields for query, pos_title, neg_title q_slots = [ - io.data( - name="q%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="q%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.query_encoders)) ] pt_slots = [ - io.data( - name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="pt%d" % i, 
shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.title_encoders)) ] nt_slots = [ - io.data( - name="nt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="nt%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.title_encoders)) ] # lookup embedding for each slot q_embs = [ - nn.embedding( + fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") for query in q_slots ] pt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in pt_slots ] nt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in nt_slots ] @@ -205,23 +205,23 @@ class MultiviewSimnet(object): def pred_net(self, query_fields, pos_title_fields, neg_title_fields): q_slots = [ - io.data( - name="q%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="q%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.query_encoders)) ] pt_slots = [ - io.data( - name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="pt%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.title_encoders)) ] # lookup embedding for each slot q_embs = [ - nn.embedding( + fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") for query in q_slots ] pt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in pt_slots ] diff --git a/PaddleRec/multiview_simnet/train.py b/PaddleRec/multiview_simnet/train.py index f098fd109e8813ffbfb40753122acbef3cd896a6..d79200834915680a649e2135d1704ffa9ccd49fc 100644 --- a/PaddleRec/multiview_simnet/train.py +++ b/PaddleRec/multiview_simnet/train.py @@ -88,6 +88,21 @@ def parse_args(): return parser.parse_args() +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. 
+ """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + def start_train(args): if args.enable_ce: SEED = 102 @@ -170,4 +185,5 @@ def main(): if __name__ == "__main__": + check_version() main() diff --git a/PaddleRec/ssr/README.md b/PaddleRec/ssr/README.md index d0b4dfb41b4cea19efa42c4a233c9544349d1770..6ded50b9fc706972d1ced8453e0b749807972b55 100644 --- a/PaddleRec/ssr/README.md +++ b/PaddleRec/ssr/README.md @@ -12,6 +12,10 @@ Sequence Semantic Retrieval(SSR) Model shares the similar idea with Multi-Rate D - The idea of SSR is to model a user's personalized interest of an item through matching model structure, and the representation of a news item can be computed online even the news item does not exist in training dataset. - With the representation of news items, we are able to build an vector indexing service online for news prediction and this is the retrieval part of SSR. +## Version +**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or a suitable develop version.** + + ## Dataset Dataset preprocessing follows the method of [GRU4Rec Project](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec). Note that you should reuse scripts from GRU4Rec project for data preprocessing. 
diff --git a/PaddleRec/ssr/infer.py b/PaddleRec/ssr/infer.py index 09dee039f4da1e08de0169b3370aff174c89556b..3a44fad7196336a71ce0ed484d5869b1633541f4 100644 --- a/PaddleRec/ssr/infer.py +++ b/PaddleRec/ssr/infer.py @@ -120,6 +120,7 @@ def infer(args, vocab_size, test_reader): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/ssr/nets.py b/PaddleRec/ssr/nets.py index 4df23573c91fcf16a4ef95d1bab1ac01e437d148..0e8534298401460f2b331f970a5b6b0b52f9ba9d 100644 --- a/PaddleRec/ssr/nets.py +++ b/PaddleRec/ssr/nets.py @@ -86,16 +86,16 @@ class SequenceSemanticRetrieval(object): return correct def train(self): - user_data = io.data(name="user", shape=[1], dtype="int64", lod_level=1) - pos_item_data = io.data( - name="p_item", shape=[1], dtype="int64", lod_level=1) - neg_item_data = io.data( - name="n_item", shape=[1], dtype="int64", lod_level=1) - user_emb = nn.embedding( + user_data = fluid.data(name="user", shape=[None, 1], dtype="int64", lod_level=1) + pos_item_data = fluid.data( + name="p_item", shape=[None, 1], dtype="int64", lod_level=1) + neg_item_data = fluid.data( + name="n_item", shape=[None, 1], dtype="int64", lod_level=1) + user_emb = fluid.embedding( input=user_data, size=self.emb_shape, param_attr="emb.item") - pos_item_emb = nn.embedding( + pos_item_emb = fluid.embedding( input=pos_item_data, size=self.emb_shape, param_attr="emb.item") - neg_item_emb = nn.embedding( + neg_item_emb = fluid.embedding( input=neg_item_data, size=self.emb_shape, param_attr="emb.item") user_enc = self.user_encoder.forward(user_emb) pos_item_enc = self.item_encoder.forward(pos_item_emb) diff --git a/PaddleRec/ssr/train.py b/PaddleRec/ssr/train.py index 1c0c9f8cc3ed6750d21ba43985fb142dc527cf00..3c441039683e87630f48be654116a79e8ce54d40 100644 --- a/PaddleRec/ssr/train.py +++ b/PaddleRec/ssr/train.py @@ -165,4 +165,5 @@ def main(): if __name__ == "__main__": + utils.check_version() 
main() diff --git a/PaddleRec/ssr/utils.py b/PaddleRec/ssr/utils.py index 4fe9ef470ed0a2a5da7bef6a975f45e5a04ab18e..353cf336564a2f8027d540d4820a60a5949c33a0 100644 --- a/PaddleRec/ssr/utils.py +++ b/PaddleRec/ssr/utils.py @@ -16,7 +16,7 @@ def construct_train_data(file_dir, vocab_path, batch_size): vocab_size = get_vocab_size(vocab_path) files = [file_dir + '/' + f for f in os.listdir(file_dir)] y_data = reader.YoochooseDataset(vocab_size) - train_reader = paddle.batch( + train_reader = fluid.io.batch( paddle.reader.shuffle( y_data.train(files), buf_size=batch_size * 100), batch_size=batch_size) @@ -27,9 +27,23 @@ def construct_test_data(file_dir, vocab_path, batch_size): vocab_size = get_vocab_size(vocab_path) files = [file_dir + '/' + f for f in os.listdir(file_dir)] y_data = reader.YoochooseDataset(vocab_size) - test_reader = paddle.batch(y_data.test(files), batch_size=batch_size) + test_reader = fluid.io.batch(y_data.test(files), batch_size=batch_size) return test_reader, vocab_size +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." 
\ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) def infer_data(raw_data, place): data = [dat[0] for dat in raw_data] diff --git a/PaddleRec/tagspace/README.md b/PaddleRec/tagspace/README.md index 4263065bee2c5492684147f532e92c7c8083e16f..818ab0cd7f5a5ca184f30887aa0a9f840efc4c45 100644 --- a/PaddleRec/tagspace/README.md +++ b/PaddleRec/tagspace/README.md @@ -26,6 +26,7 @@ TagSpace模型的介绍可以参阅论文[#TagSpace: Semantic Embeddings from Ha Tagspace模型学习文本及标签的embedding表示,应用于工业级的标签推荐,具体应用场景有feed新闻标签推荐。 +**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or a suitable develop version.** ## 数据下载及预处理 diff --git a/PaddleRec/tagspace/infer.py b/PaddleRec/tagspace/infer.py index e8522b095826622721de9f2e329c8c361f6f7c41..66412fc5b20a2146227c39572b53b841a5983a6b 100644 --- a/PaddleRec/tagspace/infer.py +++ b/PaddleRec/tagspace/infer.py @@ -71,6 +71,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path, epoch): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/tagspace/net.py b/PaddleRec/tagspace/net.py index 797ae63442643ad1a8ce1f0dcf374eff24dbbe67..8a48ab5afb95e46e456f8430a25208829e070d72 100644 --- a/PaddleRec/tagspace/net.py +++ b/PaddleRec/tagspace/net.py @@ -2,19 +2,22 @@ import paddle.fluid as fluid import paddle.fluid.layers.nn as nn import paddle.fluid.layers.tensor as tensor import paddle.fluid.layers.control_flow as cf -import paddle.fluid.layers.io as io def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000, win_size=5, margin=0.1, neg_size=5): """ network definition """ - text = io.data(name="text", shape=[1], lod_level=1, dtype='int64') - pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64') - neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64') - text_emb = nn.embedding( + text = fluid.data(name="text", shape=[None, 1], lod_level=1, 
dtype='int64') + pos_tag = fluid.data(name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64') + neg_tag = fluid.data(name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64') + text_emb = fluid.embedding( input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb") - pos_tag_emb = nn.embedding( + text_emb = fluid.layers.squeeze(input=text_emb, axes=[1]) + pos_tag_emb = fluid.embedding( input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") - neg_tag_emb = nn.embedding( + pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1]) + neg_tag_emb = fluid.embedding( input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") + neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1]) + conv_1d = fluid.nets.sequence_conv_pool( input=text_emb, diff --git a/PaddleRec/tagspace/train.py b/PaddleRec/tagspace/train.py index 419bb1c4b156c148f8bc4bc3a48385b6722f5c68..2230407e0cca56e44d578a001c303279b956caa4 100644 --- a/PaddleRec/tagspace/train.py +++ b/PaddleRec/tagspace/train.py @@ -168,4 +168,5 @@ def get_device(args): if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/tagspace/utils.py b/PaddleRec/tagspace/utils.py index f5b7e64753d331df57e2ef0a86b5a1dff1cea37a..80b77c8e5061c6c0b75e3a6bd51aaa9d2e933a32 100644 --- a/PaddleRec/tagspace/utils.py +++ b/PaddleRec/tagspace/utils.py @@ -29,6 +29,21 @@ def get_vocab_size(vocab_path): line = rf.readline() return int(line.strip()) +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." 
\ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + def prepare_data(file_dir, vocab_text_path, diff --git a/PaddleRec/word2vec/README.md b/PaddleRec/word2vec/README.md index 35d749b5c1ef8859c4e88da4f5d171269e66c2b5..e8f9942881ad654bf0882f6693a837b0d34d3882 100644 --- a/PaddleRec/word2vec/README.md +++ b/PaddleRec/word2vec/README.md @@ -20,6 +20,7 @@ ## 介绍 本例实现了skip-gram模式的word2vector模型。 +**目前模型库下模型均要求使用PaddlePaddle 1.6及以上版本或适当的develop版本。** ## 数据下载 全量数据集使用的是来自1 Billion Word Language Model Benchmark的(http://www.statmt.org/lm-benchmark) 的数据集. @@ -35,7 +36,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok ```bash mkdir data -wget https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ data/ ``` @@ -44,7 +45,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok ```bash mkdir data -wget https://paddlerec.bj.bcebos.com/word2vec/text.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/text.tar tar xvf text.tar mv text data/ ``` @@ -105,9 +106,9 @@ sh cluster_train.sh ```bash #全量数据集测试集 -wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar #样本数据集测试集 -wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar ``` 预测命令,注意词典名称需要加后缀"_word_to_id_", 此文件是预处理阶段生成的。 diff --git a/PaddleRec/word2vec/infer.py b/PaddleRec/word2vec/infer.py index 1b3290029d620d130d2fe7b7c2bcfd8bbeae54c2..36357dd6678442548f0c51bd5ab3fad206ce42a6 100644 --- 
a/PaddleRec/word2vec/infer.py +++ b/PaddleRec/word2vec/infer.py @@ -78,13 +78,13 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w): b_size = len([dat[0] for dat in data]) wa = np.array( [dat[0] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) wb = np.array( [dat[1] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) wc = np.array( [dat[2] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) label = [dat[3] for dat in data] input_word = [dat[4] for dat in data] @@ -95,7 +95,7 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w): "analogy_c": wc, "all_label": np.arange(vocab_size).reshape( - vocab_size, 1).astype("int64"), + vocab_size).astype("int64"), }, fetch_list=[pred.name, values], return_numpy=False) @@ -145,13 +145,13 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): b_size = len([dat[0] for dat in data]) wa = np.array( [dat[0] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) wb = np.array( [dat[1] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) wc = np.array( [dat[2] for dat in data]).astype("int64").reshape( - b_size, 1) + b_size) label = [dat[3] for dat in data] input_word = [dat[4] for dat in data] @@ -162,7 +162,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): "analogy_b": wb, "analogy_c": wc, "all_label": - np.arange(vocab_size).reshape(vocab_size, 1), + np.arange(vocab_size).reshape(vocab_size), }, fetch_list=[pred.name, values], return_numpy=False) @@ -185,6 +185,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/word2vec/net.py b/PaddleRec/word2vec/net.py index ab2abbc76bde8e03c9a6e1e0abb062aa467d2c91..b20b88fdeea9bdc1a2c74c0f196f94b03cb65ee2 100644 --- a/PaddleRec/word2vec/net.py +++ b/PaddleRec/word2vec/net.py @@ -23,10 +23,10 @@ import 
paddle.fluid as fluid def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): datas = [] - input_word = fluid.layers.data(name="input_word", shape=[1], dtype='int64') - true_word = fluid.layers.data(name='true_label', shape=[1], dtype='int64') - neg_word = fluid.layers.data( - name="neg_label", shape=[neg_num], dtype='int64') + input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64') + true_word = fluid.data(name='true_label', shape=[None, 1], dtype='int64') + neg_word = fluid.data( + name="neg_label", shape=[None, neg_num], dtype='int64') datas.append(input_word) datas.append(true_word) @@ -37,7 +37,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): words = fluid.layers.read_file(py_reader) init_width = 0.5 / embedding_size - input_emb = fluid.layers.embedding( + input_emb = fluid.embedding( input=words[0], is_sparse=is_sparse, size=[dict_size, embedding_size], @@ -45,38 +45,37 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): name='emb', initializer=fluid.initializer.Uniform(-init_width, init_width))) - true_emb_w = fluid.layers.embedding( + true_emb_w = fluid.embedding( input=words[1], is_sparse=is_sparse, size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( name='emb_w', initializer=fluid.initializer.Constant(value=0.0))) - true_emb_b = fluid.layers.embedding( + true_emb_b = fluid.embedding( input=words[1], is_sparse=is_sparse, size=[dict_size, 1], param_attr=fluid.ParamAttr( name='emb_b', initializer=fluid.initializer.Constant(value=0.0))) - neg_word_reshape = fluid.layers.reshape(words[2], shape=[-1, 1]) - neg_word_reshape.stop_gradient = True + input_emb = fluid.layers.squeeze(input=input_emb, axes=[1]) + true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1]) + true_emb_b = fluid.layers.squeeze(input=true_emb_b, axes=[1]) - neg_emb_w = fluid.layers.embedding( - input=neg_word_reshape, + neg_emb_w = fluid.embedding( + input=words[2], 
is_sparse=is_sparse, size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( name='emb_w', learning_rate=1.0)) - - neg_emb_w_re = fluid.layers.reshape( - neg_emb_w, shape=[-1, neg_num, embedding_size]) - neg_emb_b = fluid.layers.embedding( - input=neg_word_reshape, + neg_emb_b = fluid.embedding( + input=words[2], is_sparse=is_sparse, size=[dict_size, 1], param_attr=fluid.ParamAttr( name='emb_b', learning_rate=1.0)) + neg_emb_b_vec = fluid.layers.reshape(neg_emb_b, shape=[-1, neg_num]) true_logits = fluid.layers.elementwise_add( fluid.layers.reduce_sum( @@ -87,7 +86,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): input_emb_re = fluid.layers.reshape( input_emb, shape=[-1, 1, embedding_size]) neg_matmul = fluid.layers.matmul( - input_emb_re, neg_emb_w_re, transpose_y=True) + input_emb_re, neg_emb_w, transpose_y=True) neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]) neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec) #nce loss @@ -111,22 +110,21 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): def infer_network(vocab_size, emb_size): - analogy_a = fluid.layers.data(name="analogy_a", shape=[1], dtype='int64') - analogy_b = fluid.layers.data(name="analogy_b", shape=[1], dtype='int64') - analogy_c = fluid.layers.data(name="analogy_c", shape=[1], dtype='int64') - all_label = fluid.layers.data( + analogy_a = fluid.data(name="analogy_a", shape=[None], dtype='int64') + analogy_b = fluid.data(name="analogy_b", shape=[None], dtype='int64') + analogy_c = fluid.data(name="analogy_c", shape=[None], dtype='int64') + all_label = fluid.data( name="all_label", - shape=[vocab_size, 1], - dtype='int64', - append_batch_size=False) - emb_all_label = fluid.layers.embedding( + shape=[vocab_size], + dtype='int64') + emb_all_label = fluid.embedding( input=all_label, size=[vocab_size, emb_size], param_attr="emb") - emb_a = fluid.layers.embedding( + emb_a = fluid.embedding( 
input=analogy_a, size=[vocab_size, emb_size], param_attr="emb") - emb_b = fluid.layers.embedding( + emb_b = fluid.embedding( input=analogy_b, size=[vocab_size, emb_size], param_attr="emb") - emb_c = fluid.layers.embedding( + emb_c = fluid.embedding( input=analogy_c, size=[vocab_size, emb_size], param_attr="emb") target = fluid.layers.elementwise_add( fluid.layers.elementwise_sub(emb_b, emb_a), emb_c) diff --git a/PaddleRec/word2vec/train.py b/PaddleRec/word2vec/train.py index 430ec132d2f810eed0025f16e9b87a8f742c455c..bcc996548947714bca2a2c01083fb72f1a83b0e3 100644 --- a/PaddleRec/word2vec/train.py +++ b/PaddleRec/word2vec/train.py @@ -224,5 +224,6 @@ def train(args): if __name__ == '__main__': + utils.check_version() args = parse_args() train(args) diff --git a/PaddleRec/word2vec/utils.py b/PaddleRec/word2vec/utils.py index 01cd04e493b09e880303d7b0c87f5ed71cf86357..0d1730053f0f9854a2d11f98543702eae3f32426 100644 --- a/PaddleRec/word2vec/utils.py +++ b/PaddleRec/word2vec/utils.py @@ -22,9 +22,23 @@ def BuildWord_IdMap(dict_path): def prepare_data(file_dir, dict_path, batch_size): w2i, i2w = BuildWord_IdMap(dict_path) vocab_size = len(i2w) - reader = paddle.batch(test(file_dir, w2i), batch_size) + reader = fluid.io.batch(test(file_dir, w2i), batch_size) return vocab_size, reader, i2w +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) def native_to_unicode(s): if _is_unicode(s):