From 270c0c5f5cfa945fd08f46279dc5578aaa94a42a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 10 Apr 2017 16:21:51 +0800 Subject: [PATCH] Remove unnecessary code to generate freq_dict. --- python/paddle/v2/dataset/common.py | 7 ------- python/paddle/v2/dataset/imdb.py | 5 +++-- python/paddle/v2/dataset/imikolov.py | 12 ++++++------ 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 7021a6da05..2eb018b8d6 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -66,13 +66,6 @@ def download(url, module_name, md5sum): return filename -def dict_add(a_dict, ele): - if ele in a_dict: - a_dict[ele] += 1 - else: - a_dict[ele] = 1 - - def fetch_all(): for module_name in filter(lambda x: not x.startswith("__"), dir(paddle.v2.dataset)): diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index 5284017ce0..9a7ccff4d5 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -18,6 +18,7 @@ TODO(yuyang18): Complete comments. """ import paddle.v2.dataset.common +import collections import tarfile import Queue import re @@ -48,10 +49,10 @@ def tokenize(pattern): def build_dict(pattern, cutoff): - word_freq = {} + word_freq = collections.defaultdict(int) for doc in tokenize(pattern): for word in doc: - paddle.v2.dataset.common.dict_add(word_freq, word) + word_freq[word] += 1 # Not sure if we should prune less-frequent words here. word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index 2931d06e7e..5d7e0282b4 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -17,6 +17,7 @@ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/ Complete comments. 
""" import paddle.v2.dataset.common +import collections import tarfile __all__ = ['train', 'test', 'build_dict'] @@ -26,15 +27,14 @@ MD5 = '30177ea32e27c525793142b6bf2c8e2d' def word_count(f, word_freq=None): - add = paddle.v2.dataset.common.dict_add - if word_freq == None: - word_freq = {} + if word_freq is None: + word_freq = collections.defaultdict(int) for l in f: for w in l.strip().split(): - add(word_freq, w) - add(word_freq, '') - add(word_freq, '') + word_freq[w] += 1 + word_freq[''] += 1 + word_freq[''] += 1 return word_freq -- GitLab