diff --git a/PaddleRec/gru4rec/README.md b/PaddleRec/gru4rec/README.md index b60e4f292a5e426436bd09f79bba2a8b7add88bf..15a9b106e7d891942f2cfee975361ade0a79c44b 100644 --- a/PaddleRec/gru4rec/README.md +++ b/PaddleRec/gru4rec/README.md @@ -45,6 +45,9 @@ session-based推荐应用场景非常广泛,比如用户的商品浏览、新 运行样例程序可跳过'RSC15 数据下载及预处理'部分 + +**要求使用PaddlePaddle 1.6及以上版本或适当的develop版本。** + 同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122296) ## RSC15 数据下载及预处理 @@ -278,7 +281,7 @@ model:model_r@20/epoch_10 recall@20:0.681 time_cost(s):12.2 可参考cluster_train.py 配置其他多机环境 -运行命令本地模拟多机场景 +运行命令本地模拟多机场景, 暂不支持windows ``` sh cluster_train.sh ``` diff --git a/PaddleRec/gru4rec/convert_format.py b/PaddleRec/gru4rec/convert_format.py index b5db511ef087e59724e765f9fc9275fda6428b27..7bca1d527ab756903382c1314e945e8ddff8a7a9 100644 --- a/PaddleRec/gru4rec/convert_format.py +++ b/PaddleRec/gru4rec/convert_format.py @@ -2,8 +2,8 @@ import sys def convert_format(input, output): - with open(input) as rf: - with open(output, "w") as wf: + with open(input, "r", encoding='utf-8') as rf: + with open(output, "w", encoding='utf-8') as wf: last_sess = -1 sign = 1 i = 0 diff --git a/PaddleRec/gru4rec/infer.py b/PaddleRec/gru4rec/infer.py index bc459c28a9b24761b202dc5d8110d583322abdeb..032205cf7b6f9cc1015583e13a29c2361f889897 100644 --- a/PaddleRec/gru4rec/infer.py +++ b/PaddleRec/gru4rec/infer.py @@ -71,6 +71,7 @@ def infer(test_reader, use_cuda, model_path): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/gru4rec/infer_sample_neg.py b/PaddleRec/gru4rec/infer_sample_neg.py index 48458e82b4fe2bbc7141c3e45469b8414d87ece4..b77f3685576e129926a8529d99adbc06185acd91 100644 --- a/PaddleRec/gru4rec/infer_sample_neg.py +++ b/PaddleRec/gru4rec/infer_sample_neg.py @@ -84,6 +84,7 @@ def infer(args, vocab_size, test_reader, use_cuda): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/gru4rec/net.py b/PaddleRec/gru4rec/net.py index 6a715443ff1e72ae77aba51d5eaffe4eefee9687..4bdfcdfa9182e7b0311873f5c3b88a18e7451ad3 100644 --- a/PaddleRec/gru4rec/net.py +++ b/PaddleRec/gru4rec/net.py @@ -10,12 +10,12 @@ def all_vocab_network(vocab_size, gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src_wordseq = fluid.layers.data( - name="src_wordseq", shape=[1], dtype="int64", lod_level=1) - dst_wordseq = fluid.layers.data( - name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + src_wordseq = fluid.data( + name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1) + dst_wordseq = fluid.data( + name="dst_wordseq", shape=[None, 1], dtype="int64", lod_level=1) - emb = fluid.layers.embedding( + emb = fluid.embedding( input=src_wordseq, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( @@ -56,19 +56,20 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data( - name="label", shape=[neg_size + 1], dtype="int64", lod_level=1) + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) + label = fluid.data( + name="label", shape=[None, neg_size + 1], dtype="int64", 
lod_level=1) - emb_src = fluid.layers.embedding( + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_src = fluid.layers.squeeze(input=emb_src, axes=[1]) emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out) @@ -90,7 +91,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out) label_re = fluid.layers.sequence_reshape(input=label, new_dim=1) - emb_label = fluid.layers.embedding( + emb_label1 = fluid.embedding( input=label_re, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( @@ -98,6 +99,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_label = fluid.layers.squeeze(input=emb_label1, axes=[1]) emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out) gru_exp = fluid.layers.expand( @@ -120,19 +122,20 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_lr_x = 1.0 fc_lr_x = 1.0 # Input data - src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data( - name="label", shape=[neg_size + 1], dtype="int64", lod_level=1) + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) + label = fluid.data( + name="label", shape=[None, neg_size + 1], dtype="int64", lod_level=1) - emb_src = fluid.layers.embedding( + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_src = fluid.layers.squeeze(input=emb_src, axes=[1]) emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out) @@ -154,13 +157,14 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out) label_re = fluid.layers.sequence_reshape(input=label, new_dim=1) - emb_label = fluid.layers.embedding( + emb_label1 = fluid.embedding( input=label_re, size=[vocab_size, hid_size], param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.XavierInitializer(), learning_rate=emb_lr_x)) + emb_label = fluid.layers.squeeze(input=emb_label1, axes=[1]) emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out) @@ -180,8 +184,8 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): def infer_network(vocab_size, batch_size, hid_size, dropout=0.2): - src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) - emb_src = fluid.layers.embedding( + src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1) + emb_src = fluid.embedding( input=src, size=[vocab_size, hid_size], param_attr="emb") emb_src_drop = fluid.layers.dropout( emb_src, dropout_prob=dropout, is_test=True) @@ -198,20 +202,18 @@ def infer_network(vocab_size, batch_size, hid_size, dropout=0.2): gru_h0_drop = fluid.layers.dropout( gru_h0, dropout_prob=dropout, is_test=True) - all_label = fluid.layers.data( - name="all_label", - shape=[vocab_size, 1], - dtype="int64", - append_batch_size=False) - emb_all_label = fluid.layers.embedding( + all_label = fluid.data( + 
name="all_label", shape=[vocab_size, 1], dtype="int64") + emb_all_label = fluid.embedding( input=all_label, size=[vocab_size, hid_size], param_attr="emb") + emb_all_label = fluid.layers.squeeze(input=emb_all_label, axes=[1]) emb_all_label_drop = fluid.layers.dropout( emb_all_label, dropout_prob=dropout, is_test=True) all_pre = fluid.layers.matmul( gru_h0_drop, emb_all_label_drop, transpose_y=True) - pos_label = fluid.layers.data( - name="pos_label", shape=[1], dtype="int64", lod_level=1) + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64", lod_level=1) acc = fluid.layers.accuracy(input=all_pre, label=pos_label, k=20) return acc diff --git a/PaddleRec/gru4rec/text2paddle.py b/PaddleRec/gru4rec/text2paddle.py index 563a8cadbbed335d9fccabff8401c3e20f90f6d4..47cc8285fe74789e27480f613083fdda7a2ebeec 100644 --- a/PaddleRec/gru4rec/text2paddle.py +++ b/PaddleRec/gru4rec/text2paddle.py @@ -2,6 +2,11 @@ import sys import six import collections import os +import sys +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf-8') + def word_count(input_file, word_freq=None): """ @@ -25,11 +30,11 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""): word_freq = collections.defaultdict(int) files = os.listdir(train_dir) for fi in files: - with open(train_dir + '/' + fi, "r") as f: + with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f: word_freq = word_count(f, word_freq) files = os.listdir(test_dir) for fi in files: - with open(test_dir + '/' + fi, "r") as f: + with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f: word_freq = word_count(f, word_freq) word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq] @@ -39,13 +44,16 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""): return word_idx -def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_dir): +def write_paddle(word_idx, train_dir, test_dir, output_train_dir, + output_test_dir): files = os.listdir(train_dir) if not os.path.exists(output_train_dir): os.mkdir(output_train_dir) for fi in files: - with open(train_dir + '/' + fi, "r") as f: - with open(output_train_dir + '/' + fi, "w") as wf: + with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f: + with open( + os.path.join(output_train_dir, fi), "w", + encoding='utf-8') as wf: for l in f: l = l.strip().split() l = [word_idx.get(w) for w in l] @@ -57,8 +65,10 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di if not os.path.exists(output_test_dir): os.mkdir(output_test_dir) for fi in files: - with open(test_dir + '/' + fi, "r") as f: - with open(output_test_dir + '/' + fi, "w") as wf: + with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f: + with open( + os.path.join(output_test_dir, fi), "w", + encoding='utf-8') as wf: for l in f: l = l.strip().split() l = [word_idx.get(w) for w in l] @@ -66,9 +76,11 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di wf.write(str(w) + " ") wf.write("\n") -def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab): + +def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, + output_vocab): vocab = build_dict(0, train_dir, test_dir) - with open(output_vocab, "w") as wf: + with open(output_vocab, "w", encoding='utf-8') as wf: wf.write(str(len(vocab)) + "\n") #wf.write(str(vocab)) write_paddle(vocab, train_dir, test_dir, output_train_dir, output_test_dir) @@ -79,4 +91,5 @@ test_dir = sys.argv[2] output_train_dir = sys.argv[3] 
output_test_dir = sys.argv[4] output_vocab = sys.argv[5] -text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab) +text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, + output_vocab) diff --git a/PaddleRec/gru4rec/train.py b/PaddleRec/gru4rec/train.py index b43926b69eaf002380a261a0689be91ec3f6ff90..686f7a117730ebc96f315717d6ebfc7b158444d0 100644 --- a/PaddleRec/gru4rec/train.py +++ b/PaddleRec/gru4rec/train.py @@ -58,8 +58,8 @@ def train(): """ do training """ args = parse_args() if args.enable_ce: - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + fluid.default_startup_program().random_seed = SEED + fluid.default_main_program().random_seed = SEED hid_size = args.hid_size train_dir = args.train_dir vocab_path = args.vocab_path @@ -143,17 +143,16 @@ def train(): if args.use_cuda: gpu_num = device[1] print("kpis\teach_pass_duration_gpu%s\t%s" % - (gpu_num, total_time / epoch_idx)) - print("kpis\ttrain_ppl_gpu%s\t%s" % - (gpu_num, ce_ppl)) + (gpu_num, total_time / epoch_idx)) + print("kpis\ttrain_ppl_gpu%s\t%s" % (gpu_num, ce_ppl)) else: cpu_num = device[1] threads_num = device[2] print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % - (cpu_num, threads_num, total_time / epoch_idx)) + (cpu_num, threads_num, total_time / epoch_idx)) print("kpis\ttrain_ppl_cpu%s_thread%s\t%s" % - (cpu_num, threads_num, ce_ppl)) - + (cpu_num, threads_num, ce_ppl)) + print("finish training") @@ -166,7 +165,8 @@ def get_device(args): threads_num = os.environ.get('NUM_THREADS', 1) cpu_num = os.environ.get('CPU_NUM', 1) return "cpu", int(cpu_num), int(threads_num) - + if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/gru4rec/train_sample_neg.py b/PaddleRec/gru4rec/train_sample_neg.py index 2642452024810fe16cfa1154e273febdb1d63254..fbb687052771fd6ca642fda7637c103e120136ff 100644 --- a/PaddleRec/gru4rec/train_sample_neg.py +++ b/PaddleRec/gru4rec/train_sample_neg.py @@ -128,4 +128,5 @@ def train(): if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/gru4rec/utils.py b/PaddleRec/gru4rec/utils.py index 1cd6a313b2a5097b16c473722737e0e6936f4e31..6996df22a285e1598800bedd07fcccfbd29b6090 100644 --- a/PaddleRec/gru4rec/utils.py +++ b/PaddleRec/gru4rec/utils.py @@ -6,6 +6,7 @@ import numpy as np import paddle.fluid as fluid import paddle import os +import io def to_lodtensor(data, place): @@ -86,7 +87,7 @@ def to_lodtensor_bpr_test(raw_data, vocab_size, place): def get_vocab_size(vocab_path): - with open(vocab_path, "r") as rf: + with io.open(vocab_path, "r", encoding='utf-8') as rf: line = rf.readline() return int(line.strip()) @@ -110,12 +111,28 @@ def prepare_data(file_dir, batch_size * 20) else: vocab_size = get_vocab_size(vocab_path) - reader = paddle.batch( + reader = fluid.io.batch( test( file_dir, buffer_size, data_type=DataType.SEQ), batch_size) return vocab_size, reader +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + + def sort_batch(reader, batch_size, sort_group_size, drop_last=False): """ Create a batched reader. 
@@ -170,7 +187,8 @@ def reader_creator(file_dir, n, data_type): def reader(): files = os.listdir(file_dir) for fi in files: - with open(file_dir + '/' + fi, "r") as f: + with io.open( + os.path.join(file_dir, fi), "r", encoding='utf-8') as f: for l in f: if DataType.SEQ == data_type: l = l.strip().split() diff --git a/PaddleRec/multiview_simnet/README.md b/PaddleRec/multiview_simnet/README.md index a3bceaba646d13b11e66c475e35e091d4cd68d03..1c682eed1b630de7047f7002e0d709cf715d1e14 100644 --- a/PaddleRec/multiview_simnet/README.md +++ b/PaddleRec/multiview_simnet/README.md @@ -3,6 +3,8 @@ ## Introduction In personalized recommendation scenario, a user often is provided with several items from personalized interest matching model. In real world application, a user may have multiple views of features, say user-id, age, click-history of items, search queries. A item, e.g. news, may also have multiple views of features like news title, news category, images in news and so on. Multi-view Simnet is matching a model that combine users' and items' multiple views of features into one unified model. The model can be used in many industrial product like Baidu's feed news. The model is adapted from the paper A Multi-View Deep Learning(MV-DNN) Approach for Cross Domain User Modeling in Recommendation Systems, WWW 2015. The difference between our model and the MV-DNN is that we also consider multiple feature views of users. +**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.** + We also recommend users to take a look at the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122294) ## Dataset diff --git a/PaddleRec/multiview_simnet/infer.py b/PaddleRec/multiview_simnet/infer.py index 7b5bb080d278ba5fffbe678841037b71b02b3069..e9136588add2815c65e5b9e7e707de1f6fce8707 100644 --- a/PaddleRec/multiview_simnet/infer.py +++ b/PaddleRec/multiview_simnet/infer.py @@ -31,6 +31,22 @@ logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." 
\ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + + def parse_args(): parser = argparse.ArgumentParser("multi-view simnet") parser.add_argument("--train_file", type=str, help="Training file") @@ -116,4 +132,5 @@ def main(): if __name__ == "__main__": + check_version() main() diff --git a/PaddleRec/multiview_simnet/nets.py b/PaddleRec/multiview_simnet/nets.py index fed177844bdd247d163aee9e8625cd0ec74378b3..86d1497b907c4dd082ef4ed789265db79045c23f 100644 --- a/PaddleRec/multiview_simnet/nets.py +++ b/PaddleRec/multiview_simnet/nets.py @@ -53,7 +53,6 @@ class CNNEncoder(object): pool_type=self.pool_type, param_attr=self.param_name + ".param", bias_attr=self.param_name + ".bias") - class GrnnEncoder(object): @@ -64,12 +63,11 @@ class GrnnEncoder(object): self.hidden_size = hidden_size def forward(self, emb): - fc0 = nn.fc( - input=emb, - size=self.hidden_size * 3, - param_attr=self.param_name + "_fc.w", - bias_attr=False) - + fc0 = nn.fc(input=emb, + size=self.hidden_size * 3, + param_attr=self.param_name + "_fc.w", + bias_attr=False) + gru_h = nn.dynamic_gru( input=fc0, size=self.hidden_size, @@ -125,34 +123,34 @@ class MultiviewSimnet(object): def train_net(self): # input fields for query, pos_title, neg_title q_slots = [ - io.data( - name="q%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="q%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.query_encoders)) ] pt_slots = [ - io.data( - name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="pt%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.title_encoders)) ] nt_slots = [ - io.data( - name="nt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="nt%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.title_encoders)) ] # lookup embedding for each slot q_embs = [ - nn.embedding( + fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") for query in q_slots ] pt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in pt_slots ] nt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in nt_slots ] @@ -174,9 +172,18 @@ class MultiviewSimnet(object): nt_concat = nn.concat(nt_encodes) # projection of hidden layer - q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b') - pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b') - nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b') + q_hid = nn.fc(q_concat, + size=self.hidden_size, + param_attr='q_fc.w', + bias_attr='q_fc.b') + pt_hid = nn.fc(pt_concat, + size=self.hidden_size, + param_attr='t_fc.w', + bias_attr='t_fc.b') + nt_hid = nn.fc(nt_concat, + size=self.hidden_size, + param_attr='t_fc.w', + bias_attr='t_fc.b') # cosine of hidden layers cos_pos = nn.cos_sim(q_hid, pt_hid) @@ -205,23 +212,23 @@ class MultiviewSimnet(object): def pred_net(self, query_fields, pos_title_fields, neg_title_fields): q_slots = [ - io.data( - name="q%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="q%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.query_encoders)) ] pt_slots = [ - io.data( - name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') + fluid.data( + name="pt%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in 
range(len(self.title_encoders)) ] # lookup embedding for each slot q_embs = [ - nn.embedding( + fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") for query in q_slots ] pt_embs = [ - nn.embedding( + fluid.embedding( input=title, size=self.emb_shape, param_attr="emb") for title in pt_slots ] @@ -236,8 +243,14 @@ class MultiviewSimnet(object): q_concat = nn.concat(q_encodes) pt_concat = nn.concat(pt_encodes) # projection of hidden layer - q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b') - pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b') + q_hid = nn.fc(q_concat, + size=self.hidden_size, + param_attr='q_fc.w', + bias_attr='q_fc.b') + pt_hid = nn.fc(pt_concat, + size=self.hidden_size, + param_attr='t_fc.w', + bias_attr='t_fc.b') # cosine of hidden layers cos = nn.cos_sim(q_hid, pt_hid) return cos diff --git a/PaddleRec/multiview_simnet/train.py b/PaddleRec/multiview_simnet/train.py index f098fd109e8813ffbfb40753122acbef3cd896a6..8f4072addf7ecbbfda162298e87a834eb2855637 100644 --- a/PaddleRec/multiview_simnet/train.py +++ b/PaddleRec/multiview_simnet/train.py @@ -88,6 +88,22 @@ def parse_args(): return parser.parse_args() +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + + def start_train(args): if args.enable_ce: SEED = 102 @@ -145,7 +161,7 @@ def start_train(args): # only for ce if args.enable_ce: threads_num, cpu_num = get_cards(args) - epoch_idx = args.epochs + epoch_idx = args.epochs ce_loss = 0 try: ce_loss = ce_info[-2] @@ -153,9 +169,9 @@ def start_train(args): logger.error("ce info error") print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % - (cpu_num, threads_num, total_time / epoch_idx)) + (cpu_num, threads_num, total_time / epoch_idx)) print("kpis\ttrain_loss_cpu%s_thread%s\t%s" % - (cpu_num, threads_num, ce_loss)) + (cpu_num, threads_num, ce_loss)) def get_cards(args): @@ -170,4 +186,5 @@ def main(): if __name__ == "__main__": + check_version() main() diff --git a/PaddleRec/ssr/README.md b/PaddleRec/ssr/README.md index d0b4dfb41b4cea19efa42c4a233c9544349d1770..6abc52405a3bf6a288bae2b3675d84fe33bd00ac 100644 --- a/PaddleRec/ssr/README.md +++ b/PaddleRec/ssr/README.md @@ -12,6 +12,10 @@ Sequence Semantic Retrieval(SSR) Model shares the similar idea with Multi-Rate D - The idea of SSR is to model a user's personalized interest of an item through matching model structure, and the representation of a news item can be computed online even the news item does not exist in training dataset. - With the representation of news items, we are able to build an vector indexing service online for news prediction and this is the retrieval part of SSR. +## Version +**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.** + + ## Dataset Dataset preprocessing follows the method of [GRU4Rec Project](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec). Note that you should reuse scripts from GRU4Rec project for data preprocessing. 
@@ -39,7 +43,7 @@ cpu 单机多卡训练 CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10 ``` -本地模拟多机训练 +本地模拟多机训练, 不支持windows. ``` bash sh cluster_train.sh ``` diff --git a/PaddleRec/ssr/infer.py b/PaddleRec/ssr/infer.py index 09dee039f4da1e08de0169b3370aff174c89556b..3a44fad7196336a71ce0ed484d5869b1633541f4 100644 --- a/PaddleRec/ssr/infer.py +++ b/PaddleRec/ssr/infer.py @@ -120,6 +120,7 @@ def infer(args, vocab_size, test_reader): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/ssr/nets.py b/PaddleRec/ssr/nets.py index 4df23573c91fcf16a4ef95d1bab1ac01e437d148..6580fb1467058b4b80ff318a30298c9dc8becbdb 100644 --- a/PaddleRec/ssr/nets.py +++ b/PaddleRec/ssr/nets.py @@ -86,16 +86,17 @@ class SequenceSemanticRetrieval(object): return correct def train(self): - user_data = io.data(name="user", shape=[1], dtype="int64", lod_level=1) - pos_item_data = io.data( - name="p_item", shape=[1], dtype="int64", lod_level=1) - neg_item_data = io.data( - name="n_item", shape=[1], dtype="int64", lod_level=1) - user_emb = nn.embedding( + user_data = fluid.data( + name="user", shape=[None, 1], dtype="int64", lod_level=1) + pos_item_data = fluid.data( + name="p_item", shape=[None, 1], dtype="int64", lod_level=1) + neg_item_data = fluid.data( + name="n_item", shape=[None, 1], dtype="int64", lod_level=1) + user_emb = fluid.embedding( input=user_data, size=self.emb_shape, param_attr="emb.item") - pos_item_emb = nn.embedding( + pos_item_emb = fluid.embedding( input=pos_item_data, size=self.emb_shape, param_attr="emb.item") - neg_item_emb = nn.embedding( + neg_item_emb = fluid.embedding( input=neg_item_data, size=self.emb_shape, param_attr="emb.item") user_enc = self.user_encoder.forward(user_emb) pos_item_enc = self.item_encoder.forward(pos_item_emb) diff --git a/PaddleRec/ssr/reader.py b/PaddleRec/ssr/reader.py index 15989fd8cec366b2c3b71672f134035c42bf79da..6d73440bea1699733ace31d0a5f21943f7665918 100644 --- a/PaddleRec/ssr/reader.py +++ b/PaddleRec/ssr/reader.py @@ -13,6 +13,7 @@ # limitations under the License. 
import random +import io class Dataset: @@ -33,7 +34,7 @@ class YoochooseVocab(Vocab): def load(self, filelist): idx = 0 for f in filelist: - with open(f, "r") as fin: + with io.open(f, "r", encoding='utf-8') as fin: for line in fin: group = line.strip().split() for item in group: @@ -64,7 +65,7 @@ class YoochooseDataset(Dataset): def _reader_creator(self, filelist, is_train): def reader(): for f in filelist: - with open(f, 'r') as fin: + with io.open(f, 'r', encoding='utf-8') as fin: line_idx = 0 for line in fin: ids = line.strip().split() diff --git a/PaddleRec/ssr/train.py b/PaddleRec/ssr/train.py index 1c0c9f8cc3ed6750d21ba43985fb142dc527cf00..a3e85d63ac65baf0e59cfa65d6049761bb2581d2 100644 --- a/PaddleRec/ssr/train.py +++ b/PaddleRec/ssr/train.py @@ -68,9 +68,9 @@ def get_cards(args): def train(args): if args.enable_ce: - SEED = 102 - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + SEED = 102 + fluid.default_startup_program().random_seed = SEED + fluid.default_main_program().random_seed = SEED use_cuda = True if args.use_cuda else False parallel = True if args.parallel else False print("use_cuda:", use_cuda, "parallel:", parallel) @@ -136,17 +136,16 @@ def train(args): if args.use_cuda: gpu_num = device[1] print("kpis\teach_pass_duration_gpu%s\t%s" % - (gpu_num, total_time / epoch_idx)) - print("kpis\ttrain_acc_gpu%s\t%s" % - (gpu_num, ce_acc)) + (gpu_num, total_time / epoch_idx)) + print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc)) else: cpu_num = device[1] threads_num = device[2] print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % - (cpu_num, threads_num, total_time / epoch_idx)) + (cpu_num, threads_num, total_time / epoch_idx)) print("kpis\ttrain_acc_cpu%s_thread%s\t%s" % - (cpu_num, threads_num, ce_acc)) - + (cpu_num, threads_num, ce_acc)) + def get_device(args): if args.use_cuda: @@ -157,7 +156,7 @@ def get_device(args): threads_num = os.environ.get('NUM_THREADS', 1) cpu_num = os.environ.get('CPU_NUM', 1) return "cpu", int(cpu_num), int(threads_num) - + def main(): args = parse_args() @@ -165,4 +164,5 @@ def main(): if __name__ == "__main__": + utils.check_version() main() diff --git a/PaddleRec/ssr/utils.py b/PaddleRec/ssr/utils.py index 4fe9ef470ed0a2a5da7bef6a975f45e5a04ab18e..65571cb08a930d520c82c881bf5c68ca7c53b152 100644 --- a/PaddleRec/ssr/utils.py +++ b/PaddleRec/ssr/utils.py @@ -4,10 +4,11 @@ import os import logging import paddle.fluid as fluid import paddle +import io def get_vocab_size(vocab_path): - with open(vocab_path, "r") as rf: + with io.open(vocab_path, "r", encoding='utf-8') as rf: line = rf.readline() return int(line.strip()) @@ -16,7 +17,7 @@ def construct_train_data(file_dir, vocab_path, batch_size): vocab_size = get_vocab_size(vocab_path) files = [file_dir + '/' + f for f in os.listdir(file_dir)] y_data = reader.YoochooseDataset(vocab_size) - train_reader = paddle.batch( + train_reader = fluid.io.batch( paddle.reader.shuffle( y_data.train(files), buf_size=batch_size * 100), batch_size=batch_size) @@ -27,10 +28,26 @@ def construct_test_data(file_dir, vocab_path, batch_size): vocab_size = get_vocab_size(vocab_path) files = [file_dir + '/' + f for f in os.listdir(file_dir)] y_data = reader.YoochooseDataset(vocab_size) - test_reader = paddle.batch(y_data.test(files), batch_size=batch_size) + test_reader = fluid.io.batch(y_data.test(files), batch_size=batch_size) return test_reader, vocab_size +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not 
satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + + def infer_data(raw_data, place): data = [dat[0] for dat in raw_data] seq_lens = [len(seq) for seq in data] diff --git a/PaddleRec/tagspace/README.md b/PaddleRec/tagspace/README.md index b72a05b8cbf8152f530d18e75b4414c13d514086..cd7307f1e30311c80efd44cd9ee98f5e494caa7f 100644 --- a/PaddleRec/tagspace/README.md +++ b/PaddleRec/tagspace/README.md @@ -26,6 +26,8 @@ TagSpace模型的介绍可以参阅论文[#TagSpace: Semantic Embeddings from Ha Tagspace模型学习文本及标签的embedding表示,应用于工业级的标签推荐,具体应用场景有feed新闻标签推荐。 +**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.** + 同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122298) ## 数据下载及预处理 diff --git a/PaddleRec/tagspace/infer.py b/PaddleRec/tagspace/infer.py index e8522b095826622721de9f2e329c8c361f6f7c41..66412fc5b20a2146227c39572b53b841a5983a6b 100644 --- a/PaddleRec/tagspace/infer.py +++ b/PaddleRec/tagspace/infer.py @@ -71,6 +71,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path, epoch): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/tagspace/net.py b/PaddleRec/tagspace/net.py index 797ae63442643ad1a8ce1f0dcf374eff24dbbe67..479d6620aaf1ddf3b6ca5decf56f85e604bd0878 100644 --- a/PaddleRec/tagspace/net.py +++ b/PaddleRec/tagspace/net.py @@ -2,42 +2,53 @@ import paddle.fluid as fluid import paddle.fluid.layers.nn as nn import paddle.fluid.layers.tensor as tensor import paddle.fluid.layers.control_flow as cf -import paddle.fluid.layers.io as io -def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000, win_size=5, margin=0.1, neg_size=5): + +def network(vocab_text_size, + vocab_tag_size, + emb_dim=10, + hid_dim=1000, + win_size=5, + margin=0.1, + neg_size=5): """ network definition """ - text = io.data(name="text", shape=[1], lod_level=1, dtype='int64') - pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64') - neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64') - text_emb = nn.embedding( - input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb") - pos_tag_emb = nn.embedding( - input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") - neg_tag_emb = nn.embedding( - input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") + text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64') + pos_tag = fluid.data( + name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64') + neg_tag = fluid.data( + name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64') + text_emb = fluid.embedding( + input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb") + text_emb = fluid.layers.squeeze(input=text_emb, axes=[1]) + pos_tag_emb = fluid.embedding( + input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") + pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1]) + neg_tag_emb = fluid.embedding( + input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") + neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1]) conv_1d = fluid.nets.sequence_conv_pool( - input=text_emb, - num_filters=hid_dim, - filter_size=win_size, - act="tanh", - pool_type="max", - 
param_attr="cnn") - text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid") + input=text_emb, + num_filters=hid_dim, + filter_size=win_size, + act="tanh", + pool_type="max", + param_attr="cnn") + text_hid = fluid.layers.fc(input=conv_1d, + size=emb_dim, + param_attr="text_hid") cos_pos = nn.cos_sim(pos_tag_emb, text_hid) mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb) mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid) - cos_neg_all = fluid.layers.sequence_reshape(input=mul_cos_neg, new_dim=neg_size) + cos_neg_all = fluid.layers.sequence_reshape( + input=mul_cos_neg, new_dim=neg_size) #choose max negtive cosine cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True) #calculate hinge loss loss_part1 = nn.elementwise_sub( - tensor.fill_constant_batch_size_like( - input=cos_pos, - shape=[-1, 1], - value=margin, - dtype='float32'), - cos_pos) + tensor.fill_constant_batch_size_like( + input=cos_pos, shape=[-1, 1], value=margin, dtype='float32'), + cos_pos) loss_part2 = nn.elementwise_add(loss_part1, cos_neg) loss_part3 = nn.elementwise_max( tensor.fill_constant_batch_size_like( diff --git a/PaddleRec/tagspace/text2paddle.py b/PaddleRec/tagspace/text2paddle.py index 6aa040c02aae715549593f41dc3bcf0509aa5c6f..b7db212c9a14f5030f143aea53992b8b63123aa3 100644 --- a/PaddleRec/tagspace/text2paddle.py +++ b/PaddleRec/tagspace/text2paddle.py @@ -2,8 +2,13 @@ import sys import six import collections import os -import csv +import csv import re +import sys +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf-8') + def word_count(column_num, input_file, word_freq=None): """ @@ -13,10 +18,11 @@ def word_count(column_num, input_file, word_freq=None): word_freq = collections.defaultdict(int) data_file = csv.reader(input_file) for row in data_file: - for w in re.split(r'\W+',row[column_num].strip()): - word_freq[w]+= 1 + for w in re.split(r'\W+', row[column_num].strip()): + word_freq[w] += 1 return word_freq + def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""): """ Build a word dictionary from the corpus, Keys of the dictionary are words, @@ -25,11 +31,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""): word_freq = collections.defaultdict(int) files = os.listdir(train_dir) for fi in files: - with open(train_dir + '/' + fi, "r") as f: + with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f: word_freq = word_count(column_num, f, word_freq) files = os.listdir(test_dir) for fi in files: - with open(test_dir + '/' + fi, "r") as f: + with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f: word_freq = word_count(column_num, f, word_freq) word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq] @@ -39,13 +45,16 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""): return word_idx -def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, output_test_dir): +def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, + output_test_dir): files = os.listdir(train_dir) if not os.path.exists(output_train_dir): os.mkdir(output_train_dir) for fi in files: - with open(train_dir + '/' + fi, "r") as f: - with open(output_train_dir + '/' + fi, "w") as wf: + with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f: + with open( + os.path.join(output_train_dir, fi), "w", + encoding='utf-8') as wf: data_file = csv.reader(f) for row in data_file: tag_raw = re.split(r'\W+', row[0].strip()) @@ -61,8 +70,10 @@ def 
write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu if not os.path.exists(output_test_dir): os.mkdir(output_test_dir) for fi in files: - with open(test_dir + '/' + fi, "r") as f: - with open(output_test_dir + '/' + fi, "w") as wf: + with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f: + with open( + os.path.join(output_test_dir, fi), "w", + encoding='utf-8') as wf: data_file = csv.reader(f) for row in data_file: tag_raw = re.split(r'\W+', row[0].strip()) @@ -74,18 +85,21 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu wf.write(str(w) + " ") wf.write("\n") -def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag): + +def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, + output_vocab_text, output_vocab_tag): print("start constuct word dict") vocab_text = build_dict(2, 0, train_dir, test_dir) - with open(output_vocab_text, "w") as wf: + with open(output_vocab_text, "w", encoding='utf-8') as wf: wf.write(str(len(vocab_text)) + "\n") vocab_tag = build_dict(0, 0, train_dir, test_dir) - with open(output_vocab_tag, "w") as wf: + with open(output_vocab_tag, "w", encoding='utf-8') as wf: wf.write(str(len(vocab_tag)) + "\n") print("construct word dict done\n") - write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir, output_test_dir) + write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir, + output_test_dir) train_dir = sys.argv[1] @@ -94,4 +108,5 @@ output_train_dir = sys.argv[3] output_test_dir = sys.argv[4] output_vocab_text = sys.argv[5] output_vocab_tag = sys.argv[6] -text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag) +text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, + output_vocab_text, output_vocab_tag) diff --git a/PaddleRec/tagspace/train.py b/PaddleRec/tagspace/train.py index 419bb1c4b156c148f8bc4bc3a48385b6722f5c68..da3563e096ac3da3cb94721494daf8855be552a9 100644 --- a/PaddleRec/tagspace/train.py +++ b/PaddleRec/tagspace/train.py @@ -56,8 +56,8 @@ def train(): """ do training """ args = parse_args() if args.enable_ce: - fluid.default_startup_program().random_seed = SEED - fluid.default_main_program().random_seed = SEED + fluid.default_startup_program().random_seed = SEED + fluid.default_main_program().random_seed = SEED train_dir = args.train_dir vocab_text_path = args.vocab_text_path vocab_tag_path = args.vocab_tag_path @@ -114,7 +114,8 @@ def train(): "neg_tag": lod_neg_tag }, fetch_list=[avg_cost.name, correct.name]) - ce_info.append(float(np.sum(correct_val)) / (args.num_devices * batch_size)) + ce_info.append( + float(np.sum(correct_val)) / (args.num_devices * batch_size)) if batch_id % args.print_batch == 0: print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}" .format(pass_idx, (batch_id + 10) * batch_size, @@ -128,8 +129,7 @@ def train(): save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) feed_var_names = ["text", "pos_tag"] fetch_vars = [cos_pos] - fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, - exe) + fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) # only for ce if args.enable_ce: ce_acc = 0 @@ -142,17 +142,16 @@ def train(): if args.use_cuda: gpu_num = device[1] print("kpis\teach_pass_duration_gpu%s\t%s" % - (gpu_num, total_time / epoch_idx)) - print("kpis\ttrain_acc_gpu%s\t%s" % - (gpu_num, ce_acc)) + (gpu_num, total_time / epoch_idx)) + 
print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc)) else: cpu_num = device[1] threads_num = device[2] print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % - (cpu_num, threads_num, total_time / epoch_idx)) + (cpu_num, threads_num, total_time / epoch_idx)) print("kpis\ttrain_acc_cpu%s_thread%s\t%s" % - (cpu_num, threads_num, ce_acc)) - + (cpu_num, threads_num, ce_acc)) + print("finish training") @@ -165,7 +164,8 @@ def get_device(args): threads_num = os.environ.get('NUM_THREADS', 1) cpu_num = os.environ.get('CPU_NUM', 1) return "cpu", int(cpu_num), int(threads_num) - + if __name__ == "__main__": + utils.check_version() train() diff --git a/PaddleRec/tagspace/utils.py b/PaddleRec/tagspace/utils.py index f5b7e64753d331df57e2ef0a86b5a1dff1cea37a..7ae71249e41ec2d6f42be07f3a5a472a1d3ba8a2 100644 --- a/PaddleRec/tagspace/utils.py +++ b/PaddleRec/tagspace/utils.py @@ -8,6 +8,11 @@ import numpy as np import paddle.fluid as fluid import paddle import csv +import io +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf-8') + def to_lodtensor(data, place): """ convert to LODtensor """ @@ -24,12 +29,29 @@ def to_lodtensor(data, place): res.set_lod([lod]) return res + def get_vocab_size(vocab_path): - with open(vocab_path, "r") as rf: + with io.open(vocab_path, "r") as rf: line = rf.readline() return int(line.strip()) +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + + def prepare_data(file_dir, vocab_text_path, vocab_tag_path, @@ -45,19 +67,25 @@ def prepare_data(file_dir, reader = sort_batch( paddle.reader.shuffle( train( - file_dir, vocab_tag_size, neg_size, - buffer_size, data_type=DataType.SEQ), + file_dir, + vocab_tag_size, + neg_size, + buffer_size, + data_type=DataType.SEQ), buf_size=buffer_size), - batch_size, batch_size * 20) + batch_size, + batch_size * 20) else: vocab_tag_size = get_vocab_size(vocab_tag_path) vocab_text_size = 0 reader = sort_batch( test( file_dir, vocab_tag_size, buffer_size, data_type=DataType.SEQ), - batch_size, batch_size * 20) + batch_size, + batch_size * 20) return vocab_text_size, vocab_tag_size, reader + def sort_batch(reader, batch_size, sort_group_size, drop_last=False): """ Create a batched reader. 
@@ -107,11 +135,13 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False): class DataType(object): SEQ = 2 + def train_reader_creator(file_dir, tag_size, neg_size, n, data_type): def reader(): files = os.listdir(file_dir) for fi in files: - with open(file_dir + '/' + fi, "r") as f: + with io.open( + os.path.join(file_dir, fi), "r", encoding='utf-8') as f: for l in f: l = l.strip().split(",") pos_index = int(l[0]) @@ -123,7 +153,7 @@ def train_reader_creator(file_dir, tag_size, neg_size, n, data_type): max_iter = 100 now_iter = 0 sum_n = 0 - while(sum_n < neg_size) : + while (sum_n < neg_size): now_iter += 1 if now_iter > max_iter: print("error : only one class") @@ -135,13 +165,16 @@ def train_reader_creator(file_dir, tag_size, neg_size, n, data_type): sum_n += 1 if n > 0 and len(text) > n: continue yield text, pos_tag, neg_tag + return reader + def test_reader_creator(file_dir, tag_size, n, data_type): def reader(): files = os.listdir(file_dir) for fi in files: - with open(file_dir + '/' + fi, "r") as f: + with io.open( + os.path.join(file_dir, fi), "r", encoding='utf-8') as f: for l in f: l = l.strip().split(",") pos_index = int(l[0]) @@ -153,11 +186,13 @@ def test_reader_creator(file_dir, tag_size, n, data_type): tag = [] tag.append(ii) yield text, tag, pos_tag + return reader def train(train_dir, tag_size, neg_size, n, data_type=DataType.SEQ): return train_reader_creator(train_dir, tag_size, neg_size, n, data_type) + def test(test_dir, tag_size, n, data_type=DataType.SEQ): return test_reader_creator(test_dir, tag_size, n, data_type) diff --git a/PaddleRec/word2vec/README.md b/PaddleRec/word2vec/README.md index 724e7bc17d21a701c82b82676c89bb2331082262..581c81aca5f9a7b5cd1396a1a719b477f84c766a 100644 --- a/PaddleRec/word2vec/README.md +++ b/PaddleRec/word2vec/README.md @@ -20,6 +20,8 @@ ## 介绍 本例实现了skip-gram模式的word2vector模型。 +**目前模型库下模型均要求使用PaddlePaddle 1.6及以上版本或适当的develop版本。** + 同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124377) ## 数据下载 @@ -36,7 +38,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok ```bash mkdir data -wget https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ data/ ``` @@ -45,7 +47,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok ```bash mkdir data -wget https://paddlerec.bj.bcebos.com/word2vec/text.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/text.tar tar xvf text.tar mv text data/ ``` @@ -95,7 +97,7 @@ python train.py -h OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse ``` -本地单机模拟多机训练 +本地单机模拟多机训练, 目前暂不支持windows。 ```bash sh cluster_train.sh @@ -106,9 +108,9 @@ sh cluster_train.sh ```bash #全量数据集测试集 -wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar +wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar #样本数据集测试集 -wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar +wget --no-check-certificate 
https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar ``` 预测命令,注意词典名称需要加后缀"_word_to_id_", 此文件是预处理阶段生成的。 diff --git a/PaddleRec/word2vec/infer.py b/PaddleRec/word2vec/infer.py index 1b3290029d620d130d2fe7b7c2bcfd8bbeae54c2..32f65d17de585ef4f1fc14797e300ef7f55ac877 100644 --- a/PaddleRec/word2vec/infer.py +++ b/PaddleRec/word2vec/infer.py @@ -10,6 +10,9 @@ import paddle.fluid as fluid import paddle import net import utils +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf-8') def parse_args(): @@ -76,15 +79,12 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w): for data in test_reader(): step_id += 1 b_size = len([dat[0] for dat in data]) - wa = np.array( - [dat[0] for dat in data]).astype("int64").reshape( - b_size, 1) - wb = np.array( - [dat[1] for dat in data]).astype("int64").reshape( - b_size, 1) - wc = np.array( - [dat[2] for dat in data]).astype("int64").reshape( - b_size, 1) + wa = np.array([dat[0] for dat in data]).astype( + "int64").reshape(b_size) + wb = np.array([dat[1] for dat in data]).astype( + "int64").reshape(b_size) + wc = np.array([dat[2] for dat in data]).astype( + "int64").reshape(b_size) label = [dat[3] for dat in data] input_word = [dat[4] for dat in data] @@ -93,9 +93,8 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w): "analogy_a": wa, "analogy_b": wb, "analogy_c": wc, - "all_label": - np.arange(vocab_size).reshape( - vocab_size, 1).astype("int64"), + "all_label": np.arange(vocab_size) + .reshape(vocab_size).astype("int64"), }, fetch_list=[pred.name, values], return_numpy=False) @@ -143,15 +142,12 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): for data in test_reader(): step_id += 1 b_size = len([dat[0] for dat in data]) - wa = np.array( - [dat[0] for dat in data]).astype("int64").reshape( - b_size, 1) - wb = np.array( - [dat[1] for dat in data]).astype("int64").reshape( - b_size, 1) - wc = np.array( - [dat[2] for dat in data]).astype("int64").reshape( - b_size, 1) + wa = np.array([dat[0] for dat in data]).astype( + "int64").reshape(b_size) + wb = np.array([dat[1] for dat in data]).astype( + "int64").reshape(b_size) + wc = np.array([dat[2] for dat in data]).astype( + "int64").reshape(b_size) label = [dat[3] for dat in data] input_word = [dat[4] for dat in data] @@ -162,7 +158,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): "analogy_b": wb, "analogy_c": wc, "all_label": - np.arange(vocab_size).reshape(vocab_size, 1), + np.arange(vocab_size).reshape(vocab_size), }, fetch_list=[pred.name, values], return_numpy=False) @@ -185,6 +181,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): if __name__ == "__main__": + utils.check_version() args = parse_args() start_index = args.start_index last_index = args.last_index diff --git a/PaddleRec/word2vec/net.py b/PaddleRec/word2vec/net.py index ab2abbc76bde8e03c9a6e1e0abb062aa467d2c91..b8379b6696b3253f60079ccc4b4042e253d5ace4 100644 --- a/PaddleRec/word2vec/net.py +++ b/PaddleRec/word2vec/net.py @@ -23,10 +23,10 @@ import paddle.fluid as fluid def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): datas = [] - input_word = fluid.layers.data(name="input_word", shape=[1], dtype='int64') - true_word = fluid.layers.data(name='true_label', shape=[1], dtype='int64') - neg_word = fluid.layers.data( - name="neg_label", shape=[neg_num], dtype='int64') + input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64') + true_word = fluid.data(name='true_label', shape=[None, 1], dtype='int64') + neg_word = fluid.data( + 
name="neg_label", shape=[None, neg_num], dtype='int64') datas.append(input_word) datas.append(true_word) @@ -37,7 +37,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): words = fluid.layers.read_file(py_reader) init_width = 0.5 / embedding_size - input_emb = fluid.layers.embedding( + input_emb = fluid.embedding( input=words[0], is_sparse=is_sparse, size=[dict_size, embedding_size], @@ -45,33 +45,31 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): name='emb', initializer=fluid.initializer.Uniform(-init_width, init_width))) - true_emb_w = fluid.layers.embedding( + true_emb_w = fluid.embedding( input=words[1], is_sparse=is_sparse, size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( name='emb_w', initializer=fluid.initializer.Constant(value=0.0))) - true_emb_b = fluid.layers.embedding( + true_emb_b = fluid.embedding( input=words[1], is_sparse=is_sparse, size=[dict_size, 1], param_attr=fluid.ParamAttr( name='emb_b', initializer=fluid.initializer.Constant(value=0.0))) - neg_word_reshape = fluid.layers.reshape(words[2], shape=[-1, 1]) - neg_word_reshape.stop_gradient = True + input_emb = fluid.layers.squeeze(input=input_emb, axes=[1]) + true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1]) + true_emb_b = fluid.layers.squeeze(input=true_emb_b, axes=[1]) - neg_emb_w = fluid.layers.embedding( - input=neg_word_reshape, + neg_emb_w = fluid.embedding( + input=words[2], is_sparse=is_sparse, size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( name='emb_w', learning_rate=1.0)) - - neg_emb_w_re = fluid.layers.reshape( - neg_emb_w, shape=[-1, neg_num, embedding_size]) - neg_emb_b = fluid.layers.embedding( - input=neg_word_reshape, + neg_emb_b = fluid.embedding( + input=words[2], is_sparse=is_sparse, size=[dict_size, 1], param_attr=fluid.ParamAttr( @@ -86,8 +84,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): true_emb_b) input_emb_re = fluid.layers.reshape( input_emb, shape=[-1, 1, embedding_size]) - neg_matmul = fluid.layers.matmul( - input_emb_re, neg_emb_w_re, transpose_y=True) + neg_matmul = fluid.layers.matmul(input_emb_re, neg_emb_w, transpose_y=True) neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]) neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec) #nce loss @@ -111,22 +108,18 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): def infer_network(vocab_size, emb_size): - analogy_a = fluid.layers.data(name="analogy_a", shape=[1], dtype='int64') - analogy_b = fluid.layers.data(name="analogy_b", shape=[1], dtype='int64') - analogy_c = fluid.layers.data(name="analogy_c", shape=[1], dtype='int64') - all_label = fluid.layers.data( - name="all_label", - shape=[vocab_size, 1], - dtype='int64', - append_batch_size=False) - emb_all_label = fluid.layers.embedding( + analogy_a = fluid.data(name="analogy_a", shape=[None], dtype='int64') + analogy_b = fluid.data(name="analogy_b", shape=[None], dtype='int64') + analogy_c = fluid.data(name="analogy_c", shape=[None], dtype='int64') + all_label = fluid.data(name="all_label", shape=[vocab_size], dtype='int64') + emb_all_label = fluid.embedding( input=all_label, size=[vocab_size, emb_size], param_attr="emb") - emb_a = fluid.layers.embedding( + emb_a = fluid.embedding( input=analogy_a, size=[vocab_size, emb_size], param_attr="emb") - emb_b = fluid.layers.embedding( + emb_b = fluid.embedding( input=analogy_b, size=[vocab_size, emb_size], param_attr="emb") - emb_c = 
fluid.layers.embedding( + emb_c = fluid.embedding( input=analogy_c, size=[vocab_size, emb_size], param_attr="emb") target = fluid.layers.elementwise_add( fluid.layers.elementwise_sub(emb_b, emb_a), emb_c) diff --git a/PaddleRec/word2vec/preprocess.py b/PaddleRec/word2vec/preprocess.py index 1d5ad03c0ab2b562bc6cd53d4cac62a16d181e3b..030ac0bea42fef2cd50ea559934f2be450078ea2 100644 --- a/PaddleRec/word2vec/preprocess.py +++ b/PaddleRec/word2vec/preprocess.py @@ -6,6 +6,10 @@ import six import argparse import io import math +import sys +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf-8') prog = re.compile("[^a-z ]", flags=0) @@ -110,10 +114,14 @@ def filter_corpus(args): if not os.path.exists(args.output_corpus_dir): os.makedirs(args.output_corpus_dir) for file in os.listdir(args.input_corpus_dir): - with io.open(args.output_corpus_dir + '/convert_' + file, "w") as wf: + with io.open( + os.path.join(args.output_corpus_dir, 'convert_' + file), + "w", + encoding='utf-8') as wf: with io.open( - args.input_corpus_dir + '/' + file, encoding='utf-8') as rf: - print(args.input_corpus_dir + '/' + file) + os.path.join(args.input_corpus_dir, file), + encoding='utf-8') as rf: + print(os.path.join(args.input_corpus_dir, file)) for line in rf: signal = False line = text_strip(line) diff --git a/PaddleRec/word2vec/train.py b/PaddleRec/word2vec/train.py index 430ec132d2f810eed0025f16e9b87a8f742c455c..087c9b1d9b739df91f05480b2b7a059e59ca24a1 100644 --- a/PaddleRec/word2vec/train.py +++ b/PaddleRec/word2vec/train.py @@ -12,6 +12,12 @@ import six import reader from net import skip_gram_word2vec +import utils +import sys +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf-8') + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) @@ -224,5 +230,6 @@ def train(args): if __name__ == '__main__': + utils.check_version() args = parse_args() train(args) diff --git a/PaddleRec/word2vec/utils.py b/PaddleRec/word2vec/utils.py index 01cd04e493b09e880303d7b0c87f5ed71cf86357..c09e30d7e7115f4772eac94af868d190e6555ea7 100644 --- a/PaddleRec/word2vec/utils.py +++ b/PaddleRec/word2vec/utils.py @@ -7,12 +7,13 @@ import paddle.fluid as fluid import paddle import os import preprocess +import io def BuildWord_IdMap(dict_path): word_to_id = dict() id_to_word = dict() - with open(dict_path, 'r') as f: + with io.open(dict_path, 'r', encoding='utf-8') as f: for line in f: word_to_id[line.split(' ')[0]] = int(line.split(' ')[1]) id_to_word[int(line.split(' ')[1])] = line.split(' ')[0] @@ -22,10 +23,26 @@ def BuildWord_IdMap(dict_path): def prepare_data(file_dir, dict_path, batch_size): w2i, i2w = BuildWord_IdMap(dict_path) vocab_size = len(i2w) - reader = paddle.batch(test(file_dir, w2i), batch_size) + reader = fluid.io.batch(test(file_dir, w2i), batch_size) return vocab_size, reader, i2w +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." 
\ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) + + def native_to_unicode(s): if _is_unicode(s): return s @@ -75,7 +92,8 @@ def reader_creator(file_dir, word_to_id): def reader(): files = os.listdir(file_dir) for fi in files: - with open(file_dir + '/' + fi, "r") as f: + with io.open( + os.path.join(file_dir, fi), "r", encoding='utf-8') as f: for line in f: if ':' in line: pass