From 0bcc4d48defeb00f191c04d868098523965bc0d2 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 27 Feb 2017 17:19:29 +0800
Subject: [PATCH] Simplify cifar

---
 python/paddle/v2/dataset/cifar.py | 170 ++++++++++--------------------
 1 file changed, 53 insertions(+), 117 deletions(-)

diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index 54289430d4c..9a999de7e02 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -15,33 +15,10 @@ import cPickle
 import itertools
 import numpy
 
-__all__ = ['CIFAR10', 'CIFAR100', 'train_creator', 'test_creator']
-
-
-def __download_file__(filename, url, md5):
-    def __file_ok__():
-        if not os.path.exists(filename):
-            return False
-        md5_hash = hashlib.md5()
-        with open(filename, 'rb') as f:
-            for chunk in iter(lambda: f.read(4096), b""):
-                md5_hash.update(chunk)
-
-        return md5_hash.hexdigest() == md5
-
-    while not __file_ok__():
-        response = urllib2.urlopen(url)
-        with open(filename, mode='wb') as of:
-            shutil.copyfileobj(fsrc=response, fdst=of)
-
-
-def __read_one_batch__(batch):
-    data = batch['data']
-    labels = batch.get('labels', batch.get('fine_labels', None))
-    assert labels is not None
-    for sample, label in itertools.izip(data, labels):
-        yield (sample / 255.0).astype(numpy.float32), int(label)
-
+__all__ = [
+    'cifar_100_train_creator', 'cifar_100_test_creator', 'train_creator',
+    'test_creator'
+]
 
 CIFAR10_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
 CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
@@ -49,125 +26,84 @@ CIFAR100_URL = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
 CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
 
 
-class CIFAR(object):
-    """
-    CIFAR dataset reader. The base class for CIFAR-10 and CIFAR-100
-
-    :param url: Download url.
-    :param md5: File md5sum
-    :param meta_filename: Meta file name in package.
-    :param train_filename: Train file name in package.
-    :param test_filename: Test file name in package.
-    """
+def __read_batch__(filename, sub_name):
+    def reader():
+        def __read_one_batch_impl__(batch):
+            data = batch['data']
+            labels = batch.get('labels', batch.get('fine_labels', None))
+            assert labels is not None
+            for sample, label in itertools.izip(data, labels):
+                yield (sample / 255.0).astype(numpy.float32), int(label)
 
-    def __init__(self, url, md5, meta_filename, train_filename, test_filename):
-        filename = os.path.split(url)[-1]
-        assert DATA_HOME is not None
-        filepath = os.path.join(DATA_HOME, md5)
-        if not os.path.exists(filepath):
-            os.makedirs(filepath)
-
-        self.__full_file__ = os.path.join(filepath, filename)
-        self.__meta_filename__ = meta_filename
-        self.__train_filename__ = train_filename
-        self.__test_filename__ = test_filename
-        __download_file__(filename=self.__full_file__, url=url, md5=md5)
-
-    def labels(self):
-        """
-        labels get all dataset label in order.
-        :return: a list of label.
-        :rtype: list[string]
-        """
-        with tarfile.open(self.__full_file__, mode='r') as f:
-            name = [
-                each_item.name for each_item in f
-                if self.__meta_filename__ in each_item.name
-            ][0]
-            meta_f = f.extractfile(name)
-            meta = cPickle.load(meta_f)
-        for key in meta:
-            if 'label' in key:
-                return meta[key]
-        else:
-            raise RuntimeError("Unexpected branch.")
-
-    def train(self):
-        """
-        Train Reader
-        """
-        return self.__read_batch__(self.__train_filename__)
-
-    def test(self):
-        """
-        Test Reader
-        """
-        return self.__read_batch__(self.__test_filename__)
-
-    def __read_batch__(self, sub_name):
-        with tarfile.open(self.__full_file__, mode='r') as f:
+        with tarfile.open(filename, mode='r') as f:
             names = (each_item.name for each_item in f
                      if sub_name in each_item.name)
 
             for name in names:
                 batch = cPickle.load(f.extractfile(name))
-                for item in __read_one_batch__(batch):
+                for item in __read_one_batch_impl__(batch):
                     yield item
 
+    return reader
 
-class CIFAR10(CIFAR):
-    """
-    CIFAR-10 dataset, images are classified in 10 classes.
-    """
 
-    def __init__(self):
-        super(CIFAR10, self).__init__(
-            CIFAR10_URL,
-            CIFAR10_MD5,
-            meta_filename='batches.meta',
-            train_filename='data_batch',
-            test_filename='test_batch')
+def download(url, md5):
+    filename = os.path.split(url)[-1]
+    assert DATA_HOME is not None
+    filepath = os.path.join(DATA_HOME, md5)
+    if not os.path.exists(filepath):
+        os.makedirs(filepath)
+    __full_file__ = os.path.join(filepath, filename)
 
+    def __file_ok__():
+        if not os.path.exists(__full_file__):
+            return False
+        md5_hash = hashlib.md5()
+        with open(__full_file__, 'rb') as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                md5_hash.update(chunk)
+
+        return md5_hash.hexdigest() == md5
+
+    while not __file_ok__():
+        response = urllib2.urlopen(url)
+        with open(__full_file__, mode='wb') as of:
+            shutil.copyfileobj(fsrc=response, fdst=of)
+    return __full_file__
+
+
+def cifar_100_train_creator():
+    fn = download(url=CIFAR100_URL, md5=CIFAR100_MD5)
+    return __read_batch__(fn, 'train')
 
-class CIFAR100(CIFAR):
-    """
-    CIFAR-100 dataset, images are classified in 100 classes.
-    """
 
-    def __init__(self):
-        super(CIFAR100, self).__init__(
-            CIFAR100_URL,
-            CIFAR100_MD5,
-            meta_filename='meta',
-            train_filename='train',
-            test_filename='test')
+def cifar_100_test_creator():
+    fn = download(url=CIFAR100_URL, md5=CIFAR100_MD5)
+    return __read_batch__(fn, 'test')
 
 
 def train_creator():
     """
     Default train reader creator. Use CIFAR-10 dataset.
     """
-    cifar = CIFAR10()
-    return cifar.train
+    fn = download(url=CIFAR10_URL, md5=CIFAR10_MD5)
+    return __read_batch__(fn, 'data_batch')
 
 
 def test_creator():
     """
     Default test reader creator. Use CIFAR-10 dataset.
     """
-    cifar = CIFAR10()
-    return cifar.test
+    fn = download(url=CIFAR10_URL, md5=CIFAR10_MD5)
+    return __read_batch__(fn, 'test_batch')
 
 
-def unittest(label_count=100):
-    cifar = globals()["CIFAR%d" % label_count]()
-    assert len(cifar.labels()) == label_count
-    for _ in cifar.test():
+def unittest():
+    for _ in train_creator()():
         pass
-    for _ in cifar.train():
+    for _ in test_creator()():
         pass
 
 
 if __name__ == '__main__':
-    unittest(10)
-    unittest(100)
+    unittest()
-- 
GitLab