From 0faa6fb373f6671cc257cdc12654477b583bc73b Mon Sep 17 00:00:00 2001 From: zhangwenhui03 Date: Fri, 12 Apr 2019 16:25:54 +0800 Subject: [PATCH] fix train.py dict --- PaddleRec/word2vec/README.md | 4 ++-- PaddleRec/word2vec/preprocess.py | 6 ++++++ PaddleRec/word2vec/reader.py | 17 +++++------------ 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/PaddleRec/word2vec/README.md b/PaddleRec/word2vec/README.md index 936d9fac..e728bff0 100644 --- a/PaddleRec/word2vec/README.md +++ b/PaddleRec/word2vec/README.md @@ -72,7 +72,7 @@ nine 250430 python preprocess.py --build_dict --build_dict_corpus_dir data/text/ --dict_path data/test_build_dict ``` -第二步根据词典将文本转成id, 同时进行downsample,按照概率过滤常见词。 +第二步根据词典将文本转成id, 同时进行downsample,按照概率过滤常见词, 同时生成word和id映射的文件,文件名为词典+"_word_to_id_"。 ```bash python preprocess.py --filter_corpus --dict_path data/test_build_dict --input_corpus_dir data/text/ --output_corpus_dir data/convert_text8 --min_count 5 --downsample 0.001 @@ -107,7 +107,7 @@ wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar ``` -预测命令,注意词典名称需要加后缀"_word_to_id_", 此文件是训练阶段生成的。 +预测命令,注意词典名称需要加后缀"_word_to_id_", 此文件是预处理阶段生成的。 ```bash python infer.py --infer_epoch --test_dir data/test_mid_dir/ --dict_path data/test_build_dict_word_to_id_ --batch_size 20000 --model_dir v1_cpu5_b100_lr1dir/ --start_index 0 ``` diff --git a/PaddleRec/word2vec/preprocess.py b/PaddleRec/word2vec/preprocess.py index 5af174e7..ac816db9 100644 --- a/PaddleRec/word2vec/preprocess.py +++ b/PaddleRec/word2vec/preprocess.py @@ -100,6 +100,12 @@ def filter_corpus(args): id_counts.append(count) word_all_count += count + #write word2id file (one "word id" pair per line); infer.py takes it as --dict_path + print("write word2id file to : " + args.dict_path + "_word_to_id_") + with io.open( + args.dict_path + "_word_to_id_", 'w+', encoding='utf-8') as fid: + for k, v in word_to_id_.items(): + fid.write(k + " " + str(v) + '\n') #filter corpus and convert id if not os.path.exists(args.output_corpus_dir): 
os.makedirs(args.output_corpus_dir) diff --git a/PaddleRec/word2vec/reader.py b/PaddleRec/word2vec/reader.py index ea3352ac..aee1a4db 100644 --- a/PaddleRec/word2vec/reader.py +++ b/PaddleRec/word2vec/reader.py @@ -41,9 +41,6 @@ class Word2VecReader(object): self.window_size_ = window_size self.data_path_ = data_path self.filelist = filelist - self.word_to_id_ = dict() - self.id_to_word = dict() - self.word_count = dict() self.trainer_id = trainer_id self.trainer_num = trainer_num @@ -52,24 +49,20 @@ class Word2VecReader(object): word_id = 0 with io.open(dict_path, 'r', encoding='utf-8') as f: + ll = 0 for line in f: + ll += 1 + if ll % 100000 == 1: + print(ll) word, count = line.split()[0], int(line.split()[1]) - self.word_count[word] = count - self.word_to_id_[word] = word_id - self.id_to_word[word_id] = word #build id to word dict word_id += 1 id_counts.append(count) word_all_count += count self.word_all_count = word_all_count self.corpus_size_ = word_all_count - self.dict_size = len(self.word_to_id_) + self.dict_size = len(id_counts) self.id_counts_ = id_counts - #write word2id file - print("write word2id file to : " + dict_path + "_word_to_id_") - with io.open(dict_path + "_word_to_id_", 'w+', encoding='utf-8') as f6: - for k, v in self.word_to_id_.items(): - f6.write(k + " " + str(v) + '\n') print("corpus_size:", self.corpus_size_) self.id_frequencys = [ -- GitLab