提交 a24db02f 编写于 作者: M minqiyang

Fix data download of wmt14

test=develop
上级 9d7c3b18
...@@ -71,15 +71,16 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -71,15 +71,16 @@ def __build_dict(tar_file, dict_size, save_path, lang):
for w in sen.split(): for w in sen.split():
word_dict[w] += 1 word_dict[w] += 1
with open(save_path, "w") as fout: with open(save_path, "wb") as fout:
fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)) fout.write(
cpt.to_bytes("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)))
for idx, word in enumerate( for idx, word in enumerate(
sorted( sorted(
six.iteritems(word_dict), key=lambda x: x[1], six.iteritems(word_dict), key=lambda x: x[1],
reverse=True)): reverse=True)):
if idx + 3 == dict_size: break if idx + 3 == dict_size: break
fout.write(word[0].encode('utf-8')) fout.write(cpt.to_bytes(word[0]))
fout.write('\n') fout.write(cpt.to_bytes('\n'))
def __load_dict(tar_file, dict_size, lang, reverse=False): def __load_dict(tar_file, dict_size, lang, reverse=False):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册