diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index 0cb27f802c40ef123fdc9c6799aad3b2a5f554c0..aa418c657a4ba16cce61c030066f4d3e14e891cc 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -7,4 +7,4 @@
build_and_install/index_cn.rst
concepts/use_concepts_cn.rst
-- `深度学习入门课程 `_
+- `深度学习入门课程 `_
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index 9f771e93e8b63eb98e31ec12667bd1aa007af20e..be3253e3d41b99a2b696e2c5ef6463ed49680d69 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -6,4 +6,4 @@ GET STARTED
build_and_install/index_en.rst
-- `Deep Learning 101 `_
+- `Deep Learning 101 `_
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 9f0f9f2d74db8e0b538adb8263e2844c2cf4b74f..2b48e4dc0f875be9a87797fa14885926999a5010 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -58,7 +58,7 @@ EOF
make -j `nproc`
if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
pip uninstall -y py-paddle paddle || true
- ctest -V
+ ctest --output-on-failure
fi
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
new file mode 100644
index 0000000000000000000000000000000000000000..07c13cf719ae0c864c23fef51f0bd7d47f265759
--- /dev/null
+++ b/python/paddle/v2/dataset/flowers.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module will download dataset from
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
+and parse the train/test/valid sets into paddle reader creators.
+
+This set contains images of flowers belonging to 102 different categories.
+The images were acquired by searching the web and taking pictures. There are a
+minimum of 40 images for each category.
+
+The database was used in:
+
+Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
+number of classes. Proceedings of the Indian Conference on Computer Vision,
+Graphics and Image Processing (2008)
+http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
+
+"""
+import cPickle
+import itertools
+from common import download
+import tarfile
+import scipy.io as scio
+from paddle.v2.image import *
+import os
+import numpy as np
+import paddle.v2 as paddle
+from multiprocessing import cpu_count
+# Names exported by ``from flowers import *``.
+__all__ = ['train', 'test', 'valid']

+# Download locations of the image archive, the label file and the set-id file.
+DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
+LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
+SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
+# MD5 digests handed to download() together with the matching URL above.
+DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
+SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+
+
+def default_mapper(sample):
+ '''
+ map image bytes data to type needed by model input layer
+ '''
+ img, label = sample
+ img = paddle.image.load_image_bytes(img)
+ img = paddle.image.simple_transform(img, 256, 224, True)
+ return img.flatten().astype('float32'), label
+
+
+def reader_creator(data_file,
+ label_file,
+ setid_file,
+ dataset_name,
+ mapper=default_mapper,
+ buffered_size=1024):
+ '''
+ 1. read images from tar file and
+ merge images into batch files in 102flowers.tgz_batch/
+ 2. get a reader to read sample from batch file
+
+ :param data_file: downloaded data file
+ :type data_file: string
+ :param label_file: downloaded label file
+ :type label_file: string
+ :param setid_file: downloaded setid file containing information
+ about how to split dataset
+ :type setid_file: string
+ :param dataset_name: data set name (tstid|trnid|valid)
+ :type dataset_name: string
+ :param mapper: a function to map image bytes data to type
+ needed by model input layer
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: data reader
+ :rtype: callable
+ '''
+ labels = scio.loadmat(label_file)['labels'][0]
+ indexes = scio.loadmat(setid_file)[dataset_name][0]
+ img2label = {}
+ for i in indexes:
+ img = "jpg/image_%05d.jpg" % i
+ img2label[img] = labels[i - 1]
+ file_list = batch_images_from_tar(data_file, dataset_name, img2label)
+
+ def reader():
+ for file in open(file_list):
+ file = file.strip()
+ batch = None
+ with open(file, 'r') as f:
+ batch = cPickle.load(f)
+ data = batch['data']
+ labels = batch['label']
+ for sample, label in itertools.izip(data, batch['label']):
+ yield sample, int(label)
+
+ return paddle.reader.xmap_readers(mapper, reader,
+ cpu_count(), buffered_size)
+
+
+def train(mapper=default_mapper, buffered_size=1024):
+ '''
+ Create flowers training set reader.
+ It returns a reader, each sample in the reader is
+ image pixels in [0, 1] and label in [1, 102]
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: train data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
+ buffered_size)
+
+
+def test(mapper=default_mapper, buffered_size=1024):
+ '''
+ Create flowers test set reader.
+ It returns a reader, each sample in the reader is
+ image pixels in [0, 1] and label in [1, 102]
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: test data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
+ buffered_size)
+
+
+def valid(mapper=default_mapper, buffered_size=1024):
+ '''
+ Create flowers validation set reader.
+ It returns a reader, each sample in the reader is
+ image pixels in [0, 1] and label in [1, 102]
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: test data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
+ buffered_size)
+
+
+def fetch():
+ download(DATA_URL, 'flowers', DATA_MD5)
+ download(LABEL_URL, 'flowers', LABEL_MD5)
+ download(SETID_URL, 'flowers', SETID_MD5)
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc0626f4feae287d18dfb227cc69a4174da055da
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/flowers_test.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.flowers
+import unittest
+
+
+class TestFlowers(unittest.TestCase):
+ def check_reader(self, reader):
+ sum = 0
+ label = 0
+ size = 224 * 224 * 3
+ for l in reader():
+ self.assertEqual(l[0].size, size)
+ if l[1] > label:
+ label = l[1]
+ sum += 1
+ return sum, label
+
+ def test_train(self):
+ instances, max_label_value = self.check_reader(
+ paddle.v2.dataset.flowers.train())
+ self.assertEqual(instances, 1020)
+ self.assertEqual(max_label_value, 102)
+
+ def test_test(self):
+ instances, max_label_value = self.check_reader(
+ paddle.v2.dataset.flowers.test())
+ self.assertEqual(instances, 6149)
+ self.assertEqual(max_label_value, 102)
+
+ def test_valid(self):
+ instances, max_label_value = self.check_reader(
+ paddle.v2.dataset.flowers.valid())
+ self.assertEqual(instances, 1020)
+ self.assertEqual(max_label_value, 102)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 85ad6984ba08440d8f8c24a6ca5842024dbafe4b..0d648e9ae697ff0373c6cdc166608d395a8d8086 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -1,14 +1,16 @@
import numpy as np
try:
import cv2
-except:
- print(
- "import cv2 error, please install opencv-python: pip install opencv-python"
- )
+except ImportError:
+ cv2 = None
+import os
+import tarfile
+import cPickle
__all__ = [
- "load_image", "resize_short", "to_chw", "center_crop", "random_crop",
- "left_right_flip", "simple_transform", "load_and_transform"
+ "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+ "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+ "batch_images_from_tar"
]
"""
This file contains some common interfaces for image preprocess.
@@ -28,6 +30,90 @@ the image layout as follows.
"""
+def batch_images_from_tar(data_file,
+ dataset_name,
+ img2label,
+ num_per_batch=1024):
+ """
+ Read images from tar file and batch them into batch file.
+ param data_file: path of image tar file
+ type data_file: string
+ param dataset_name: 'train','test' or 'valid'
+ type dataset_name: string
+ param img2label: a dic with image file name as key
+ and image's label as value
+ type img2label: dic
+ param num_per_batch: image number per batch file
+ type num_per_batch: int
+ return: path of list file containing paths of batch file
+ rtype: string
+ """
+ batch_dir = data_file + "_batch"
+ out_path = "%s/%s" % (batch_dir, dataset_name)
+ meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
+
+ if os.path.exists(out_path):
+ return meta_file
+ else:
+ os.makedirs(out_path)
+
+ tf = tarfile.open(data_file)
+ mems = tf.getmembers()
+ data = []
+ labels = []
+ file_id = 0
+ for mem in mems:
+ if mem.name in img2label:
+ data.append(tf.extractfile(mem).read())
+ labels.append(img2label[mem.name])
+ if len(data) == num_per_batch:
+ output = {}
+ output['label'] = labels
+ output['data'] = data
+ cPickle.dump(
+ output,
+ open('%s/batch_%d' % (out_path, file_id), 'w'),
+ protocol=cPickle.HIGHEST_PROTOCOL)
+ file_id += 1
+ data = []
+ labels = []
+ if len(data) > 0:
+ output = {}
+ output['label'] = labels
+ output['data'] = data
+ cPickle.dump(
+ output,
+ open('%s/batch_%d' % (out_path, file_id), 'w'),
+ protocol=cPickle.HIGHEST_PROTOCOL)
+
+ with open(meta_file, 'a') as meta:
+ for file in os.listdir(out_path):
+ meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
+ return meta_file
+
+
+def load_image_bytes(bytes, is_color=True):
+ """
+ Load an color or gray image from bytes array.
+
+ Example usage:
+
+ .. code-block:: python
+ with open('cat.jpg') as f:
+ im = load_image_bytes(f.read())
+
+ :param bytes: the input image bytes array.
+ :type file: str
+ :param is_color: If set is_color True, it will load and
+ return a color image. Otherwise, it will
+ load and return a gray image.
+ """
+ flag = 1 if is_color else 0
+ file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
+ img = cv2.imdecode(file_bytes, flag)
+ return img
+
+
def load_image(file, is_color=True):
"""
Load an color or gray image from the file path.
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index 104ce9a0411413bb8fc65eedf5821f98d6acdba3..c76faa596c9fb9079cab3456b721c18ef9768e95 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,7 +14,7 @@
__all__ = [
'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
- 'ComposeNotAligned', 'firstn'
+ 'ComposeNotAligned', 'firstn', 'xmap_readers'
]
import itertools
@@ -224,3 +224,74 @@ def firstn(reader, n):
yield item
return firstn_reader
+
+
+class XmapEndSignal():
+    # Marker type: xmap_readers puts an instance on its queues to signal
+    # that no more samples follow.
+    pass
+
+
+def xmap_readers(mapper, reader, process_num, buffer_size):
+ """
+ Use multiprocess to map samples from reader by a mapper defined by user.
+ And this function contains a buffered decorator.
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param reader: the data reader to read from
+ :type reader: callable
+ :param process_num: process number to handle original sample
+ :type process_num: int
+ :param buffer_size: max buffer size
+ :type buffer_size: int
+ :return: the decarated reader
+ :rtype: callable
+ """
+ end = XmapEndSignal()
+ in_queue = Queue(buffer_size)
+ out_queue = Queue(buffer_size)
+
+ # define a worker to read samples from reader to in_queue
+ def read_worker(reader, in_queue):
+ for i in reader():
+ in_queue.put(i)
+ in_queue.put(end)
+
+ # start a read worker in a thread
+ t = Thread(target=read_worker, args=(reader, in_queue))
+ t.daemon = True
+ t.start()
+
+ # define a worker to handle samples from in_queue by mapper
+ # and put mapped samples into out_queue
+ def handle_worker(in_queue, out_queue, mapper):
+ sample = in_queue.get()
+ while not isinstance(sample, XmapEndSignal):
+ r = mapper(sample)
+ out_queue.put(r)
+ sample = in_queue.get()
+ in_queue.put(end)
+ out_queue.put(end)
+
+ # start several handle_workers
+ workers = []
+ for i in xrange(process_num):
+ worker = Thread(
+ target=handle_worker, args=(in_queue, out_queue, mapper))
+ worker.daemon = True
+ workers.append(worker)
+ for w in workers:
+ w.start()
+
+ def xreader():
+ sample = out_queue.get()
+ while not isinstance(sample, XmapEndSignal):
+ yield sample
+ sample = out_queue.get()
+ finish = 1
+ while finish < process_num:
+ sample = out_queue.get()
+ if isinstance(sample, XmapEndSignal):
+ finish += 1
+ else:
+ yield sample
+
+ return xreader