From bbf4aa7f7e86038d6d3eb47177f2e86bab7fe213 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Mon, 7 Jan 2019 13:49:14 +0000
Subject: [PATCH] fix reader bug

---
 fluid/PaddleRec/word2vec/infer.py      | 24 ++++++++++++---
 fluid/PaddleRec/word2vec/preprocess.py | 25 +++++++--------
 fluid/PaddleRec/word2vec/reader.py     | 42 ++++++++++++++++----------
 fluid/PaddleRec/word2vec/train.py      |  1 -
 4 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/fluid/PaddleRec/word2vec/infer.py b/fluid/PaddleRec/word2vec/infer.py
index 69844c20..c0dd82ef 100644
--- a/fluid/PaddleRec/word2vec/infer.py
+++ b/fluid/PaddleRec/word2vec/infer.py
@@ -131,9 +131,25 @@ def build_small_test_case(emb):
     desc5 = "old - older + deeper = deep"
     label5 = word_to_id["deep"]
 
-    test_cases = [emb1, emb2, emb3, emb4, emb5]
-    test_case_desc = [desc1, desc2, desc3, desc4, desc5]
-    test_labels = [label1, label2, label3, label4, label5]
+    emb6 = emb[word_to_id['boy']]
+    desc6 = "boy"
+    label6 = word_to_id["boy"]
+    emb7 = emb[word_to_id['king']]
+    desc7 = "king"
+    label7 = word_to_id["king"]
+    emb8 = emb[word_to_id['sun']]
+    desc8 = "sun"
+    label8 = word_to_id["sun"]
+    emb9 = emb[word_to_id['key']]
+    desc9 = "key"
+    label9 = word_to_id["key"]
+    test_cases = [emb1, emb2, emb3, emb4, emb5, emb6, emb7, emb8, emb9]
+    test_case_desc = [
+        desc1, desc2, desc3, desc4, desc5, desc6, desc7, desc8, desc9
+    ]
+    test_labels = [
+        label1, label2, label3, label4, label5, label6, label7, label8, label9
+    ]
 
     return norm(np.array(test_cases)), test_case_desc, test_labels
 
@@ -229,8 +245,6 @@ def infer_during_train(args):
     while True:
         time.sleep(60)
         current_list = os.listdir(args.model_output_dir)
-        # logger.info("current_list is : {}".format(current_list))
-        # logger.info("model_file_list is : {}".format(model_file_list))
         if set(model_file_list) == set(current_list):
             if solved_new:
                 solved_new = False
diff --git a/fluid/PaddleRec/word2vec/preprocess.py b/fluid/PaddleRec/word2vec/preprocess.py
index a8f08bdf..0c2d4b7d 100644
--- a/fluid/PaddleRec/word2vec/preprocess.py
+++ b/fluid/PaddleRec/word2vec/preprocess.py
@@ -3,6 +3,7 @@
 import re
 import six
 import argparse
+import io
 
 prog = re.compile("[^a-z ]", flags=0)
 word_count = dict()
@@ -83,7 +84,6 @@ def native_to_unicode(s):
         return _to_unicode(s)
     except UnicodeDecodeError:
         res = _to_unicode(s, ignore_errors=True)
-        tf.logging.info("Ignoring Unicode error, outputting: %s" % res)
         return res
 
@@ -199,14 +199,15 @@ def preprocess(args):
 
     # word to count
     if args.with_other_dict:
-        with open(args.other_dict_path, 'r') as f:
+        with io.open(args.other_dict_path, 'r', encoding='utf-8') as f:
             for line in f:
                 word_count[native_to_unicode(line.strip())] = 1
 
     if args.is_local:
         for i in range(1, 100):
-            with open(args.data_path + "/news.en-000{:0>2d}-of-00100".format(
-                    i)) as f:
+            with io.open(
+                    args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
+                    encoding='utf-8') as f:
                 for line in f:
                     line = strip_lines(line)
                     words = line.split()
@@ -231,21 +232,17 @@ def preprocess(args):
 
     path_table, path_code, word_code_len = build_Huffman(word_count, 40)
 
-    with open(args.dict_path, 'w+') as f:
+    with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
         for k, v in word_count.items():
-            f.write(k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
+            f.write(k + " " + str(v) + '\n')
 
-    with open(args.dict_path + "_ptable", 'w+') as f2:
+    with io.open(args.dict_path + "_ptable", 'w+', encoding='utf-8') as f2:
         for pk, pv in path_table.items():
-            f2.write(
-                pk.encode("utf-8") + '\t' + ' '.join((str(x).encode("utf-8")
-                                                      for x in pv)) + '\n')
+            f2.write(pk + '\t' + ' '.join((str(x) for x in pv)) + '\n')
 
-    with open(args.dict_path + "_pcode", 'w+') as f3:
+    with io.open(args.dict_path + "_pcode", 'w+', encoding='utf-8') as f3:
         for pck, pcv in path_code.items():
-            f3.write(
-                pck.encode("utf-8") + '\t' + ' '.join((str(x).encode("utf-8")
-                                                      for x in pcv)) + '\n')
+            f3.write(pck + '\t' + ' '.join((str(x) for x in pcv)) + '\n')
 
 
 if __name__ == "__main__":
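Note on the preprocess.py hunks above: the fix replaces byte-oriented open plus per-field .encode("utf-8") calls with io.open and an explicit encoding, so the dict, _ptable, and _pcode files are read and written as text under both Python 2 and 3. A minimal runnable sketch of that round trip (the demo_dict file name and the word counts here are made up, not from the repo):

    import io

    word_count = {u"apple": 3, u"banana": 5}

    # Write the dict the way preprocess.py now does: io.open encodes on the
    # way out, so no manual .encode("utf-8") is needed. Under Python 2 the
    # ASCII pieces " " and str(v) are promoted to unicode when concatenated
    # with the unicode key k, which is what io.open requires.
    with io.open("demo_dict", "w+", encoding="utf-8") as f:
        for k, v in word_count.items():
            f.write(k + " " + str(v) + "\n")

    # Read it back the way reader.py now does: lines arrive already decoded,
    # so the old line.decode(encoding='UTF-8') step is no longer needed.
    with io.open("demo_dict", "r", encoding="utf-8") as f:
        for line in f:
            word, count = line.split()[0], int(line.split()[1])
            print(word, count)
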
diff --git a/fluid/PaddleRec/word2vec/reader.py b/fluid/PaddleRec/word2vec/reader.py
index 0f66eaac..01d0d8e0 100644
--- a/fluid/PaddleRec/word2vec/reader.py
+++ b/fluid/PaddleRec/word2vec/reader.py
@@ -2,8 +2,8 @@
 
 import numpy as np
 import preprocess
-
 import logging
+import io
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("fluid")
@@ -42,6 +42,7 @@ class Word2VecReader(object):
         self.num_non_leaf = 0
         self.word_to_id_ = dict()
         self.id_to_word = dict()
+        self.word_count = dict()
         self.word_to_path = dict()
         self.word_to_code = dict()
         self.trainer_id = trainer_id
@@ -51,20 +52,19 @@ class Word2VecReader(object):
 
         word_counts = []
         word_id = 0
-        with open(dict_path, 'r') as f:
+        with io.open(dict_path, 'r', encoding='utf-8') as f:
             for line in f:
-                line = line.decode(encoding='UTF-8')
                 word, count = line.split()[0], int(line.split()[1])
+                self.word_count[word] = count
                 self.word_to_id_[word] = word_id
                 self.id_to_word[word_id] = word  #build id to word dict
                 word_id += 1
                 word_counts.append(count)
                 word_all_count += count
 
-        with open(dict_path + "_word_to_id_", 'w+') as f6:
+        with io.open(dict_path + "_word_to_id_", 'w+', encoding='utf-8') as f6:
             for k, v in self.word_to_id_.items():
-                f6.write(
-                    k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
+                f6.write(k + " " + str(v) + '\n')
 
         self.dict_size = len(self.word_to_id_)
         self.word_frequencys = [
@@ -73,7 +73,7 @@ class Word2VecReader(object):
         print("dict_size = " + str(
             self.dict_size)) + " word_all_count = " + str(word_all_count)
 
-        with open(dict_path + "_ptable", 'r') as f2:
+        with io.open(dict_path + "_ptable", 'r', encoding='utf-8') as f2:
             for line in f2:
                 self.word_to_path[line.split('\t')[0]] = np.fromstring(
                     line.split('\t')[1], dtype=int, sep=' ')
@@ -81,9 +81,8 @@ class Word2VecReader(object):
                     line.split('\t')[1], dtype=int, sep=' ')[0]
         print("word_ptable dict_size = " + str(len(self.word_to_path)))
 
-        with open(dict_path + "_pcode", 'r') as f3:
+        with io.open(dict_path + "_pcode", 'r', encoding='utf-8') as f3:
             for line in f3:
-                line = line.decode(encoding='UTF-8')
                 self.word_to_code[line.split('\t')[0]] = np.fromstring(
                     line.split('\t')[1], dtype=int, sep=' ')
         print("word_pcode dict_size = " + str(len(self.word_to_code)))
@@ -109,13 +108,15 @@ class Word2VecReader(object):
     def train(self, with_hs):
         def _reader():
             for file in self.filelist:
-                with open(self.data_path_ + "/" + file, 'r') as f:
+                with io.open(
+                        self.data_path_ + "/" + file, 'r',
+                        encoding='utf-8') as f:
                     logger.info("running data in {}".format(self.data_path_
                                                             + "/" + file))
                     count = 1
                     for line in f:
                         if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line)
+                            line = preprocess.strip_lines(line, self.word_count)
                             word_ids = [
                                 self.word_to_id_[word] for word in line.split()
                                 if word in self.word_to_id_
@@ -131,13 +132,15 @@ class Word2VecReader(object):
 
         def _reader_hs():
             for file in self.filelist:
-                with open(self.data_path_ + "/" + file, 'r') as f:
+                with io.open(
+                        self.data_path_ + "/" + file, 'r',
+                        encoding='utf-8') as f:
                     logger.info("running data in {}".format(self.data_path_
                                                             + "/" + file))
                     count = 1
                     for line in f:
                         if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line)
+                            line = preprocess.strip_lines(line, self.word_count)
                             word_ids = [
                                 self.word_to_id_[word] for word in line.split()
                                 if word in self.word_to_id_
@@ -164,13 +167,20 @@
 
 if __name__ == "__main__":
-    window_size = 10
+    window_size = 5
+
+    reader = Word2VecReader(
+        "./data/1-billion_dict",
+        "./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/",
+        ["news.en-00001-of-00100"], 0, 1)
 
-    reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size)
     i = 0
-    for x, y in reader.train()():
+    # print(reader.train(True))
+    for x, y, z, f in reader.train(True)():
         print("x: " + str(x))
         print("y: " + str(y))
+        print("path: " + str(z))
+        print("code: " + str(f))
         print("\n")
         if i == 10:
             exit(0)
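Note on the reader.py hunks above: besides the io.open conversion, train() keeps its round-robin split across trainers, where each trainer scans every file but consumes only the lines whose running counter matches its id modulo trainer_num. A self-contained sketch of that sharding scheme (shard_lines and the sample data are illustrative names, not from the repo):

    def shard_lines(lines, trainer_id, trainer_num):
        # Mirrors _reader/_reader_hs above: the counter starts at 1, and a
        # line is kept only when trainer_id == count % trainer_num, so
        # trainer_num workers partition one file with no coordination.
        count = 1
        for line in lines:
            if trainer_id == count % trainer_num:
                yield line
            count += 1

    lines = ["sample %d" % i for i in range(6)]
    print(list(shard_lines(lines, 0, 2)))  # counts 2, 4, 6 -> samples 1, 3, 5
    print(list(shard_lines(lines, 1, 2)))  # counts 1, 3, 5 -> samples 0, 2, 4
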
+ "/" + file)) count = 1 for line in f: if self.trainer_id == count % self.trainer_num: - line = preprocess.strip_lines(line) + line = preprocess.strip_lines(line, self.word_count) word_ids = [ self.word_to_id_[word] for word in line.split() if word in self.word_to_id_ @@ -164,13 +167,20 @@ class Word2VecReader(object): if __name__ == "__main__": - window_size = 10 + window_size = 5 + + reader = Word2VecReader( + "./data/1-billion_dict", + "./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/", + ["news.en-00001-of-00100"], 0, 1) - reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size) i = 0 - for x, y in reader.train()(): + # print(reader.train(True)) + for x, y, z, f in reader.train(True)(): print("x: " + str(x)) print("y: " + str(y)) + print("path: " + str(z)) + print("code: " + str(f)) print("\n") if i == 10: exit(0) diff --git a/fluid/PaddleRec/word2vec/train.py b/fluid/PaddleRec/word2vec/train.py index 74a8271a..40f7729f 100644 --- a/fluid/PaddleRec/word2vec/train.py +++ b/fluid/PaddleRec/word2vec/train.py @@ -135,7 +135,6 @@ def convert_python_to_tensor(batch_size, sample_reader, is_hs): for sample in sample_reader(): for i, fea in enumerate(sample): result[i].append(fea) - if len(result[0]) == batch_size: tensor_result = [] for tensor in result: -- GitLab