From 270c0c5f5cfa945fd08f46279dc5578aaa94a42a Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 10 Apr 2017 16:21:51 +0800
Subject: [PATCH] Remove unnecessary code to generate freq_dict.

---
 python/paddle/v2/dataset/common.py   |  7 -------
 python/paddle/v2/dataset/imdb.py     |  5 +++--
 python/paddle/v2/dataset/imikolov.py | 12 ++++++------
 3 files changed, 9 insertions(+), 15 deletions(-)
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 7021a6da0..2eb018b8d 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -66,13 +66,6 @@ def download(url, module_name, md5sum):
     return filename
 
 
-def dict_add(a_dict, ele):
-    if ele in a_dict:
-        a_dict[ele] += 1
-    else:
-        a_dict[ele] = 1
-
-
 def fetch_all():
     for module_name in filter(lambda x: not x.startswith("__"),
                               dir(paddle.v2.dataset)):
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 5284017ce..9a7ccff4d 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -18,6 +18,7 @@ TODO(yuyang18): Complete comments.
 """
 
 import paddle.v2.dataset.common
+import collections
 import tarfile
 import Queue
 import re
@@ -48,10 +49,10 @@ def tokenize(pattern):
 
 
 def build_dict(pattern, cutoff):
-    word_freq = {}
+    word_freq = collections.defaultdict(int)
     for doc in tokenize(pattern):
         for word in doc:
-            paddle.v2.dataset.common.dict_add(word_freq, word)
+            word_freq[word] += 1
 
     # Not sure if we should prune less-frequent words here.
     word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 2931d06e7..5d7e0282b 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -17,6 +17,7 @@ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
 Complete comments.
 """
 import paddle.v2.dataset.common
+import collections
 import tarfile
 
 __all__ = ['train', 'test', 'build_dict']
@@ -26,15 +27,14 @@ MD5 = '30177ea32e27c525793142b6bf2c8e2d'
 
 
 def word_count(f, word_freq=None):
-    add = paddle.v2.dataset.common.dict_add
-    if word_freq == None:
-        word_freq = {}
+    if word_freq is None:
+        word_freq = collections.defaultdict(int)
 
     for l in f:
         for w in l.strip().split():
-            add(word_freq, w)
-        add(word_freq, '<s>')
-        add(word_freq, '<e>')
+            word_freq[w] += 1
+        word_freq['<s>'] += 1
+        word_freq['<e>'] += 1
 
     return word_freq
 
-- 
GitLab