提交 963dd953 编写于 作者: Z zhangwenhui03

fix dict bug

上级 93e202ad
......@@ -72,7 +72,7 @@ nine 250430
python preprocess.py --build_dict --build_dict_corpus_dir data/text/ --dict_path data/test_build_dict
```
第二步根据词典将文本转成id, 同时进行downsample,按照概率过滤常见词。
第二步根据词典将文本转成id,同时进行downsample,按照概率过滤常见词,并生成word和id的映射文件,文件名为词典路径加后缀"_word_to_id_"。
```bash
python preprocess.py --filter_corpus --dict_path data/test_build_dict --input_corpus_dir data/text/ --output_corpus_dir data/convert_text8 --min_count 5 --downsample 0.001
......@@ -107,7 +107,7 @@ wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar
```
预测命令,注意词典名称需要加后缀"_word_to_id_", 此文件是训练阶段生成的。
预测命令如下,注意词典名称需要加后缀"_word_to_id_",此文件是预处理阶段生成的。
```bash
python infer.py --infer_epoch --test_dir data/test_mid_dir/ --dict_path data/test_build_dict_word_to_id_ --batch_size 20000 --model_dir v1_cpu5_b100_lr1dir/ --start_index 0
```
......@@ -100,6 +100,12 @@ def filter_corpus(args):
id_counts.append(count)
word_all_count += count
#write word2id file
print("write word2id file to : " + dict_path + "_word_to_id_")
with io.open(
args.dict_path + "_word_to_id_", 'w+', encoding='utf-8') as fid:
for k, v in word_to_id_.items():
fid.write(k + " " + str(v) + '\n')
#filter corpus and convert id
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
......
......@@ -41,9 +41,6 @@ class Word2VecReader(object):
self.window_size_ = window_size
self.data_path_ = data_path
self.filelist = filelist
self.word_to_id_ = dict()
self.id_to_word = dict()
self.word_count = dict()
self.trainer_id = trainer_id
self.trainer_num = trainer_num
......@@ -52,24 +49,20 @@ class Word2VecReader(object):
word_id = 0
with io.open(dict_path, 'r', encoding='utf-8') as f:
ll = 0
for line in f:
ll += 1
if ll % 100000 == 1:
print(ll)
word, count = line.split()[0], int(line.split()[1])
self.word_count[word] = count
self.word_to_id_[word] = word_id
self.id_to_word[word_id] = word #build id to word dict
word_id += 1
id_counts.append(count)
word_all_count += count
self.word_all_count = word_all_count
self.corpus_size_ = word_all_count
self.dict_size = len(self.word_to_id_)
self.dict_size = len(id_counts)
self.id_counts_ = id_counts
#write word2id file
print("write word2id file to : " + dict_path + "_word_to_id_")
with io.open(dict_path + "_word_to_id_", 'w+', encoding='utf-8') as f6:
for k, v in self.word_to_id_.items():
f6.write(k + " " + str(v) + '\n')
print("corpus_size:", self.corpus_size_)
self.id_frequencys = [
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册