From 50e750a2a183ed9dacf299f07af6d3181a59f54f Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Fri, 22 Jun 2018 11:28:31 +0200 Subject: [PATCH] added cycling the cifar and flowers datasets --- python/paddle/v2/dataset/cifar.py | 27 ++++++++++------ python/paddle/v2/dataset/flowers.py | 50 ++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 25 deletions(-) diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 0a2a1ced1..662655c83 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' -def reader_creator(filename, sub_name): +def reader_creator(filename, sub_name, cycle=False): def read_batch(batch): data = batch['data'] labels = batch.get('labels', batch.get('fine_labels', None)) @@ -56,10 +56,13 @@ def reader_creator(filename, sub_name): names = (each_item.name for each_item in f if sub_name in each_item.name) - for name in names: - batch = cPickle.load(f.extractfile(name)) - for item in read_batch(batch): - yield item + while True: + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + yield item + if not cycle: + break return reader @@ -94,34 +97,40 @@ def test100(): 'test') -def train10(): +def train10(cycle=False): """ CIFAR-10 training set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Training reader creator :rtype: callable """ return reader_creator( paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'data_batch') + 'data_batch', + cycle=cycle) -def test10(): +def test10(cycle=False): """ CIFAR-10 test set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Test reader creator. :rtype: callable """ return reader_creator( paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'test_batch') + 'test_batch', + cycle=cycle) def fetch(): diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 357a4e9b0..db12076d5 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -76,7 +76,8 @@ def reader_creator(data_file, dataset_name, mapper, buffered_size=1024, - use_xmap=True): + use_xmap=True, + cycle=False): ''' 1. read images from tar file and merge images into batch files in 102flowers.tgz_batch/ @@ -96,6 +97,8 @@ def reader_creator(data_file, :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: data reader :rtype: callable ''' @@ -108,15 +111,18 @@ def reader_creator(data_file, file_list = batch_images_from_tar(data_file, dataset_name, img2label) def reader(): - for file in open(file_list): - file = file.strip() - batch = None - with open(file, 'r') as f: - batch = cPickle.load(f) - data = batch['data'] - labels = batch['label'] - for sample, label in itertools.izip(data, batch['label']): - yield sample, int(label) - 1 + while True: + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + if not cycle: + break if use_xmap: cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) @@ -125,7 +131,7 @@ def reader_creator(data_file, return map_readers(mapper, reader) -def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers training set reader. It returns a reader, each sample in the reader is @@ -138,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: train data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TRAIN_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) -def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers test set reader. It returns a reader, each sample in the reader is @@ -161,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: test data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TEST_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): -- GitLab