Simplify cifar

0bcc4d48 · Yu Yang · 434ada47 · 0bcc4d48
显示空白变更内容
内联并排

Showing changes with 53 additions and 117 deletions

python/paddle/v2/dataset/cifar.py python/paddle/v2/dataset/cifar.py +53 -117

未找到文件。
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -15,159 +15,95 @@ import cPickle
 import itertools
 import numpy
-__all__ = ['CIFAR10', 'CIFAR100', 'train_creator', 'test_creator']
+__all__ = [
+    'cifar_100_train_creator', 'cifar_100_test_creator', 'train_creator',
+    'test_creator'
+]
+CIFAR10_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
-def __download_file__(filename, url, md5):
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
-    def __file_ok__():
+CIFAR100_URL = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
-        if not os.path.exists(filename):
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
-            return False
-        md5_hash = hashlib.md5()
-        with open(filename, 'rb') as f:
-            for chunk in iter(lambda: f.read(4096), b""):
-                md5_hash.update(chunk)
-        return md5_hash.hexdigest() == md5
-    while not __file_ok__():
-        response = urllib2.urlopen(url)
-        with open(filename, mode='wb') as of:
-            shutil.copyfileobj(fsrc=response, fdst=of)
-def __read_one_batch__(batch):
+def __read_batch__(filename, sub_name):
+    def reader():
+        def __read_one_batch_impl__(batch):
            data = batch['data']
            labels = batch.get('labels', batch.get('fine_labels', None))
            assert labels is not None
            for sample, label in itertools.izip(data, labels):
                yield (sample / 255.0).astype(numpy.float32), int(label)
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
-CIFAR10_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
+            for name in names:
-CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+                batch = cPickle.load(f.extractfile(name))
-CIFAR100_URL = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
+                for item in __read_one_batch_impl__(batch):
-CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+                    yield item
-class CIFAR(object):
+    return reader
-    """
-    CIFAR dataset reader. The base class for CIFAR-10 and CIFAR-100
-    :param url: Download url.
-    :param md5: File md5sum
-    :param meta_filename: Meta file name in package.
-    :param train_filename: Train file name in package.
-    :param test_filename: Test file name in package.
-    """
-    def __init__(self, url, md5, meta_filename, train_filename, test_filename):
+def download(url, md5):
    filename = os.path.split(url)[-1]
    assert DATA_HOME is not None
    filepath = os.path.join(DATA_HOME, md5)
    if not os.path.exists(filepath):
        os.makedirs(filepath)
+    __full_file__ = os.path.join(filepath, filename)
-        self.__full_file__ = os.path.join(filepath, filename)
+    def __file_ok__():
-        self.__meta_filename__ = meta_filename
+        if not os.path.exists(__full_file__):
-        self.__train_filename__ = train_filename
+            return False
-        self.__test_filename__ = test_filename
+        md5_hash = hashlib.md5()
-        __download_file__(filename=self.__full_file__, url=url, md5=md5)
+        with open(__full_file__, 'rb') as f:
+            for chunk in iter(lambda: f.read(4096), b""):
-    def labels(self):
+                md5_hash.update(chunk)
-        """
-        labels get all dataset label in order.
-        :return: a list of label.
-        :rtype: list[string]
-        """
-        with tarfile.open(self.__full_file__, mode='r') as f:
-            name = [
-                each_item.name for each_item in f
-                if self.__meta_filename__ in each_item.name
-            ][0]
-            meta_f = f.extractfile(name)
-            meta = cPickle.load(meta_f)
-        for key in meta:
-            if 'label' in key:
-                return meta[key]
-        else:
-            raise RuntimeError("Unexpected branch.")
-    def train(self):
-        """
-        Train Reader
-        """
-        return self.__read_batch__(self.__train_filename__)
-    def test(self):
-        """
-        Test Reader
-        """
-        return self.__read_batch__(self.__test_filename__)
-    def __read_batch__(self, sub_name):
-        with tarfile.open(self.__full_file__, mode='r') as f:
-            names = (each_item.name for each_item in f
-                     if sub_name in each_item.name)
-            for name in names:
-                batch = cPickle.load(f.extractfile(name))
-                for item in __read_one_batch__(batch):
-                    yield item
+        return md5_hash.hexdigest() == md5
-class CIFAR10(CIFAR):
+    while not __file_ok__():
-    """
+        response = urllib2.urlopen(url)
-    CIFAR-10 dataset, images are classified in 10 classes.
+        with open(__full_file__, mode='wb') as of:
-    """
+            shutil.copyfileobj(fsrc=response, fdst=of)
+    return __full_file__
-    def __init__(self):
-        super(CIFAR10, self).__init__(
-            CIFAR10_URL,
-            CIFAR10_MD5,
-            meta_filename='batches.meta',
-            train_filename='data_batch',
-            test_filename='test_batch')
+def cifar_100_train_creator():
+    fn = download(url=CIFAR100_URL, md5=CIFAR100_MD5)
+    return __read_batch__(fn, 'train')
-class CIFAR100(CIFAR):
-    """
-    CIFAR-100 dataset, images are classified in 100 classes.
-    """
-    def __init__(self):
+def cifar_100_test_creator():
-        super(CIFAR100, self).__init__(
+    fn = download(url=CIFAR100_URL, md5=CIFAR100_MD5)
-            CIFAR100_URL,
+    return __read_batch__(fn, 'test')
-            CIFAR100_MD5,
-            meta_filename='meta',
-            train_filename='train',
-            test_filename='test')
 def train_creator():
    """
    Default train reader creator. Use CIFAR-10 dataset.
    """
-    cifar = CIFAR10()
+    fn = download(url=CIFAR10_URL, md5=CIFAR10_MD5)
-    return cifar.train
+    return __read_batch__(fn, 'data_batch')
 def test_creator():
    """
    Default test reader creator. Use CIFAR-10 dataset.
    """
-    cifar = CIFAR10()
+    fn = download(url=CIFAR10_URL, md5=CIFAR10_MD5)
-    return cifar.test
+    return __read_batch__(fn, 'test_batch')
-def unittest(label_count=100):
+def unittest():
-    cifar = globals()["CIFAR%d" % label_count]()
+    for _ in train_creator()():
-    assert len(cifar.labels()) == label_count
-    for _ in cifar.test():
        pass
-    for _ in cifar.train():
+    for _ in test_creator()():
        pass
 if __name__ == '__main__':
-    unittest(10)
+    unittest()
-    unittest(100)