fix dict bug

963dd953 · zhangwenhui03 · 93e202ad · 963dd953 · 963dd953 · 963dd953
Showing with 13 additions and 14 deletions

PaddleRec/word2vec/README.md PaddleRec/word2vec/README.md +2 -2

PaddleRec/word2vec/preprocess.py PaddleRec/word2vec/preprocess.py +6 -0

PaddleRec/word2vec/reader.py PaddleRec/word2vec/reader.py +5 -12

未找到文件。
--- a/PaddleRec/word2vec/README.md
+++ b/PaddleRec/word2vec/README.md
@@ -72,7 +72,7 @@ nine 250430
 python preprocess.py --build_dict --build_dict_corpus_dir data/text/ --dict_path data/test_build_dict
 ```

-第二步根据词典将文本转成id, 同时进行downsample，按照概率过滤常见词。
+第二步根据词典将文本转成id, 同时进行downsample，按照概率过滤常见词, 同时生成word和id映射的文件，文件名为词典+"_word_to_id_"。

 ```bash
 python preprocess.py --filter_corpus --dict_path data/test_build_dict --input_corpus_dir data/text/ --output_corpus_dir data/convert_text8 --min_count 5 --downsample 0.001
@@ -107,7 +107,7 @@ wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
 wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar
 ```

-预测命令，注意词典名称需要加后缀"_word_to_id_", 此文件是训练阶段生成的。
+预测命令，注意词典名称需要加后缀"_word_to_id_", 此文件是预处理阶段生成的。
 ```bash
 python infer.py --infer_epoch --test_dir data/test_mid_dir/ --dict_path data/test_build_dict_word_to_id_ --batch_size 20000 --model_dir v1_cpu5_b100_lr1dir/  --start_index 0
 ```
--- a/PaddleRec/word2vec/preprocess.py
+++ b/PaddleRec/word2vec/preprocess.py
@@ -100,6 +100,12 @@ def filter_corpus(args):
            id_counts.append(count)
            word_all_count += count

+    #write word2id file
+    print("write word2id file to : " + dict_path + "_word_to_id_")
+    with io.open(
+            args.dict_path + "_word_to_id_", 'w+', encoding='utf-8') as fid:
+        for k, v in word_to_id_.items():
+            fid.write(k + " " + str(v) + '\n')
    #filter corpus and convert id
    if not os.path.exists(args.output_corpus_dir):
        os.makedirs(args.output_corpus_dir)

--- a/PaddleRec/word2vec/reader.py
+++ b/PaddleRec/word2vec/reader.py
@@ -41,9 +41,6 @@ class Word2VecReader(object):
        self.window_size_ = window_size
        self.data_path_ = data_path
        self.filelist = filelist
-        self.word_to_id_ = dict()
-        self.id_to_word = dict()
-        self.word_count = dict()
        self.trainer_id = trainer_id
        self.trainer_num = trainer_num

@@ -52,24 +49,20 @@ class Word2VecReader(object):
        word_id = 0

        with io.open(dict_path, 'r', encoding='utf-8') as f:
+            ll = 0
            for line in f:
+                ll += 1
+                if ll % 100000 == 1:
+                    print(ll)
                word, count = line.split()[0], int(line.split()[1])
-                self.word_count[word] = count
-                self.word_to_id_[word] = word_id
-                self.id_to_word[word_id] = word  #build id to word dict
                word_id += 1
                id_counts.append(count)
                word_all_count += count

        self.word_all_count = word_all_count
        self.corpus_size_ = word_all_count
-        self.dict_size = len(self.word_to_id_)
+        self.dict_size = len(id_counts)
        self.id_counts_ = id_counts
-        #write word2id file
-        print("write word2id file to : " + dict_path + "_word_to_id_")
-        with io.open(dict_path + "_word_to_id_", 'w+', encoding='utf-8') as f6:
-            for k, v in self.word_to_id_.items():
-                f6.write(k + " " + str(v) + '\n')

        print("corpus_size:", self.corpus_size_)
        self.id_frequencys = [