You need to sign in or sign up before continuing.
提交 a2cec420 编写于 作者: Y Yi Wang

In response to comments from Wen-bo

上级 da6af591
@@ -3,7 +3,6 @@ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
 """
 import paddle.v2.dataset.common
 import tarfile
-import collections
 __all__ = ['train', 'test']
@@ -40,10 +39,8 @@ def build_dict(train_filename, test_filename):
 testf = tf.extractfile(test_filename)
 word_freq = word_count(testf, word_count(trainf))
-STOPWORD_FREQ = 3000
 TYPO_FREQ = 50
-word_freq = filter(lambda x: x[1] > TYPO_FREQ and x[1] < STOPWORD_FREQ,
-word_freq.items())
+word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
 dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
 words, _ = list(zip(*dictionary))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册