Merge pull request #1 from PaddlePaddle/develop

Merge from upstream

Merge pull request #1 from PaddlePaddle/develop
Merge from upstream
0a83aa46 · wgzqz · GitHub · ae418490 · 08f169cb · 0a83aa46
14 changed files
--- a/fluid/DeepASR/data_utils/augmentor/tests/__init__.py
+++ b/fluid/DeepASR/data_utils/augmentor/tests/__init__.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
+import data_utils.augmentor.trans_add_delta as trans_add_delta
+import data_utils.augmentor.trans_splice as trans_splice
--- a/fluid/DeepASR/data_utils/data_reader.py
+++ b/fluid/DeepASR/data_utils/data_reader.py
-"""This model read the sample from disk. 
-   use multiprocessing to reading samples
-   push samples from one block to multiprocessing queue 
-   Todos:
-        1. multiprocess read block from disk
+"""This module contains data processing related logic.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import random
+import struct
 import Queue
+import time
 import numpy as np
-import struct
+from threading import Thread
+import signal
+from multiprocessing import Manager, Process
 import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
 import data_utils.augmentor.trans_add_delta as trans_add_delta
-
-
-class OneBlock(object):
-    """ struct for one block :
-        contain label, label desc, feature, feature_desc
-
-        Attributes:
-            label(str) :  label path of one block
-            label_desc(str) : label description path of one block
-            feature(str) : feature path of on block
-            feature_desc(str) : feature description path of on block
+from data_utils.util import suppress_complaints, suppress_signal
+from data_utils.util import CriticalException, ForceExitWrapper
+
+
+class SampleInfo(object):
+    """SampleInfo holds the necessary information to load a sample from disk.
+
+    Args:
+        feature_bin_path (str): File containing the feature data.
+        feature_start (int): Start position of the sample's feature data.
+        feature_size (int): Byte count of the sample's feature data.
+        feature_frame_num (int): Time length of the sample.
+        feature_dim (int): Feature dimension of one frame.
+        label_bin_path (str): File containing the label data.
+        label_start (int): Start position of the sample's label data.
+        label_size (int): Byte count of the sample's label data.
+        label_frame_num (int): Label number of the sample.
    """

-    def __init__(self):
-        """the constructor."""
-
-        self.label = "label"
-        self.label_desc = "label_desc"
-        self.feature = "feature"
-        self.feature_desc = "feature_desc"
-
-
-class DataRead(object):
+    def __init__(self, feature_bin_path, feature_start, feature_size,
+                 feature_frame_num, feature_dim, label_bin_path, label_start,
+                 label_size, label_frame_num):
+        # Location and layout of the feature data inside its binary file.
+        self.feature_bin_path = feature_bin_path
+        self.feature_start = feature_start
+        self.feature_size = feature_size
+        self.feature_frame_num = feature_frame_num
+        self.feature_dim = feature_dim
+
+        # Location and layout of the label data inside its binary file.
+        self.label_bin_path = label_bin_path
+        self.label_start = label_start
+        self.label_size = label_size
+        self.label_frame_num = label_frame_num
+
+
+class SampleInfoBucket(object):
+    """SampleInfoBucket contains paths of several description files. Feature
+    description file contains necessary information (including path of binary
+    data, sample start position, sample byte number etc.) to access samples'
+    feature data and the same with the label description file. SampleInfoBucket
+    is the minimum unit to do shuffle.
+
+    Args:
+        feature_bin_paths (list|tuple): Files containing the binary feature
+                                        data.
+        feature_desc_paths (list|tuple): Files containing the description of
+                                         samples' feature data.
+        label_bin_paths (list|tuple): Files containing the binary label data.
+        label_desc_paths (list|tuple): Files containing the description of
+                                       samples' label data.
+        split_perturb(int): Maximum perturbation value for length of
+                            sub-sentence when splitting long sentence.
+        split_sentence_threshold(int): Sentence whose length larger than
+                                the value will trigger split operation.
+        split_sub_sentence_len(int): sub-sentence length is equal to
+                                    (split_sub_sentence_len + rand() % split_perturb).
    """
-    Attributes:
-        _lblock(obj:`OneBlock`) : the list of OneBlock
-        _ndrop_sentence_len(int): dropout the sentence which's frame_num large than _ndrop_sentence_len  
-        _que_sample(obj:`Queue`): sample buffer
-        _nframe_dim(int): the batch sample frame_dim(todo remove)
-        _nstart_block_idx(int): the start block id
-        _nload_block_num(int): the block num
+
+    def __init__(self,
+                 feature_bin_paths,
+                 feature_desc_paths,
+                 label_bin_paths,
+                 label_desc_paths,
+                 split_perturb=50,
+                 split_sentence_threshold=512,
+                 split_sub_sentence_len=256):
+        # The four path lists are parallel: entry i of each list belongs to
+        # the same block, so they must all have the same length.
+        block_num = len(label_bin_paths)
+        assert len(label_desc_paths) == block_num
+        assert len(feature_bin_paths) == block_num
+        assert len(feature_desc_paths) == block_num
+        self._block_num = block_num
+
+        self._feature_bin_paths = feature_bin_paths
+        self._feature_desc_paths = feature_desc_paths
+        self._label_bin_paths = label_bin_paths
+        self._label_desc_paths = label_desc_paths
+        self._split_perturb = split_perturb
+        self._split_sentence_threshold = split_sentence_threshold
+        self._split_sub_sentence_len = split_sub_sentence_len
+        # Private generator with a fixed seed (0); it draws the random
+        # perturbation of sub-sentence lengths when sentences are split.
+        self._rng = random.Random(0)
+
+    def generate_sample_info_list(self):
+        """Build the list of SampleInfo for every sample in this bucket.
+
+        For each block the label and feature description files are read.
+        The second field of the first line of a description file holds the
+        sample count; each following line describes one sample (field 2:
+        start offset, field 3: byte size, field 4: frame number and, for
+        feature descriptions, field 5: frame dimension).
+
+        A sentence with more frames than `split_sentence_threshold` is cut
+        into sub-sentences of `split_sub_sentence_len` frames plus a random
+        perturbation in [0, split_perturb]. Splitting is disabled when any
+        of the three split settings equals -1.
+
+        Returns:
+            list: SampleInfo objects, in block order and sample order.
+        """
+        # Offsets of sub-sentences are computed assuming every feature
+        # value and every label occupies 4 bytes in the binary files.
+        bytes_per_value = 4
+        split_disabled = -1 in (self._split_sentence_threshold,
+                                self._split_perturb,
+                                self._split_sub_sentence_len)
+
+        sample_info_list = []
+        for block_idx in xrange(self._block_num):
+            label_bin_path = self._label_bin_paths[block_idx]
+            label_desc_path = self._label_desc_paths[block_idx]
+            feature_bin_path = self._feature_bin_paths[block_idx]
+            feature_desc_path = self._feature_desc_paths[block_idx]
+
+            # Context managers close the description files right away
+            # instead of leaking the handles until garbage collection.
+            with open(label_desc_path) as label_desc_file:
+                label_desc_lines = label_desc_file.readlines()
+            with open(feature_desc_path) as feature_desc_file:
+                feature_desc_lines = feature_desc_file.readlines()
+
+            sample_num = int(label_desc_lines[0].split()[1])
+            assert sample_num == int(feature_desc_lines[0].split()[1])
+
+            for i in xrange(sample_num):
+                feature_desc_split = feature_desc_lines[i + 1].split()
+                feature_start = int(feature_desc_split[2])
+                feature_size = int(feature_desc_split[3])
+                feature_frame_num = int(feature_desc_split[4])
+                feature_dim = int(feature_desc_split[5])
+
+                label_desc_split = label_desc_lines[i + 1].split()
+                label_start = int(label_desc_split[2])
+                label_size = int(label_desc_split[3])
+                label_frame_num = int(label_desc_split[4])
+                assert feature_frame_num == label_frame_num
+
+                # Short enough (or splitting disabled): keep the sentence
+                # as a single sample.
+                if split_disabled or \
+                        self._split_sentence_threshold >= feature_frame_num:
+                    sample_info_list.append(
+                        SampleInfo(feature_bin_path, feature_start,
+                                   feature_size, feature_frame_num, feature_dim,
+                                   label_bin_path, label_start, label_size,
+                                   label_frame_num))
+                    continue
+
+                # split sentence
+                cur_frame_pos = 0
+                remain_frame_num = feature_frame_num
+                while True:
+                    if remain_frame_num > self._split_sentence_threshold:
+                        cur_frame_len = self._split_sub_sentence_len + \
+                                self._rng.randint(0, self._split_perturb)
+                        if cur_frame_len > remain_frame_num:
+                            cur_frame_len = remain_frame_num
+                    else:
+                        # The tail fits under the threshold: emit it whole.
+                        cur_frame_len = remain_frame_num
+
+                    sample_info_list.append(
+                        SampleInfo(
+                            feature_bin_path,
+                            feature_start +
+                            cur_frame_pos * feature_dim * bytes_per_value,
+                            cur_frame_len * feature_dim * bytes_per_value,
+                            cur_frame_len, feature_dim, label_bin_path,
+                            label_start + cur_frame_pos * bytes_per_value,
+                            cur_frame_len * bytes_per_value, cur_frame_len))
+
+                    remain_frame_num -= cur_frame_len
+                    cur_frame_pos += cur_frame_len
+                    if remain_frame_num <= 0:
+                        break
+
+        return sample_info_list
+
+
+# Marker type: instances are put on the sample-info, sample and batch queues
+# to tell the consumer that no more data follows for the current epoch.
+# Consumers detect it with isinstance().
+class EpochEndSignal():
+    pass
+
+
+class DataReader(object):
+    """DataReader provides basic audio sample preprocessing pipeline including
+    data loading and data augmentation.
+
+    Args:
+        feature_file_list (str): File containing paths of feature data file and
+                                 corresponding description file.
+        label_file_list (str): File containing paths of label data file and
+                               corresponding description file.
+        drop_frame_len (int): Samples whose label length above the value will be
+                              dropped.(Using '-1' to disable the policy)
+        process_num (int): Number of processes for processing data.
+        sample_buffer_size (int): Buffer size to indicate the maximum samples
+                                  cached.
+        sample_info_buffer_size (int): Buffer size to indicate the maximum
+                                       sample information cached.
+        batch_buffer_size (int): Buffer size to indicate the maximum batch
+                                 cached.
+        shuffle_block_num (int): Block number indicating the minimum unit to do
+                                 shuffle.
+        random_seed (int): Random seed.
+        verbose (int): If set to 0, complaints including exceptions and signal
+                       traceback from sub-process will be suppressed. If set
+                       to 1, all complaints will be printed.
    """

-    def __init__(self, sfeature_lst, slabel_lst, ndrop_sentence_len=512):
-        """
-        Args:
-            sfeature_lst(str):feature lst path
-            slabel_lst(str):label lst path
-        Returns:
-            None
-        """
-        self._lblock = []
-        self._ndrop_sentence_len = ndrop_sentence_len
-        self._que_sample = Queue.Queue()
-        self._nframe_dim = 120 * 11
-        self._nstart_block_idx = 0
-        self._nload_block_num = 1
-        self._ndrop_frame_len = 256
-
-        self._load_list(sfeature_lst, slabel_lst)
-
-    def _load_list(self, sfeature_lst, slabel_lst):
-        """ load list and shuffle
-        Args:
-            sfeature_lst(str):feature lst path
-            slabel_lst(str):label lst path
-        Returns:
-            None
-        """
-        lfeature = open(sfeature_lst).readlines()
-        llabel = open(slabel_lst).readlines()
-        assert len(llabel) == len(lfeature)
-        for i in range(0, len(lfeature), 2):
-            one_block = OneBlock()
-
-            one_block.label = llabel[i]
-            one_block.label_desc = llabel[i + 1]
-            one_block.feature = lfeature[i]
-            one_block.feature_desc = lfeature[i + 1]
-            self._lblock.append(one_block)
-
-        random.shuffle(self._lblock)
-
-    def _load_one_block(self, lsample, id):
-        """read one block by id and push load sample in list lsample 
-        Args:
-            lsample(list): return sample list
-            id(int): block id 
-        Returns:
-            None
-        """
-        if id >= len(self._lblock):
-            return
-
-        slabel_path = self._lblock[id].label.strip()
-        slabel_desc_path = self._lblock[id].label_desc.strip()
-        sfeature_path = self._lblock[id].feature.strip()
-        sfeature_desc_path = self._lblock[id].feature_desc.strip()
-
-        llabel_line = open(slabel_desc_path).readlines()
-        lfeature_line = open(sfeature_desc_path).readlines()
-
-        file_lable_bin = open(slabel_path, "r")
-        file_feature_bin = open(sfeature_path, "r")
-
-        sample_num = int(llabel_line[0].split()[1])
-        assert sample_num == int(lfeature_line[0].split()[1])
-
-        llabel_line = llabel_line[1:]
-        lfeature_line = lfeature_line[1:]
-
-        for i in range(sample_num):
-            # read label 
-            llabel_split = llabel_line[i].split()
-            nlabel_start = int(llabel_split[2])
-            nlabel_size = int(llabel_split[3])
-            nlabel_frame_num = int(llabel_split[4])
-
-            file_lable_bin.seek(nlabel_start, 0)
-            label_bytes = file_lable_bin.read(nlabel_size)
-            assert nlabel_frame_num * 4 == len(label_bytes)
-            label_array = struct.unpack('I' * nlabel_frame_num, label_bytes)
-            label_data = np.array(label_array, dtype="int64")
-            label_data = label_data.reshape((nlabel_frame_num, 1))
-
-            # read feature
-            lfeature_split = lfeature_line[i].split()
-            nfeature_start = int(lfeature_split[2])
-            nfeature_size = int(lfeature_split[3])
-            nfeature_frame_num = int(lfeature_split[4])
-            nfeature_frame_dim = int(lfeature_split[5])
-
-            file_feature_bin.seek(nfeature_start, 0)
-            feature_bytes = file_feature_bin.read(nfeature_size)
-            assert nfeature_frame_num * nfeature_frame_dim * 4 == len(
-                feature_bytes)
-            feature_array = struct.unpack('f' * nfeature_frame_num *
-                                          nfeature_frame_dim, feature_bytes)
-            feature_data = np.array(feature_array, dtype="float32")
-            feature_data = feature_data.reshape(
-                (nfeature_frame_num, nfeature_frame_dim))
-
-            #drop long sentence
-            if self._ndrop_frame_len < feature_data.shape[0]:
-                continue
-            lsample.append((feature_data, label_data))
-
-    def get_one_batch(self, nbatch_size):
-        """construct one batch(feature, label), batch size is nbatch_size
-        Args:
-            nbatch_size(int): batch size
-        Returns:
-            None
-        """
-        if self._que_sample.empty():
-            lsample = self._load_block(
-                range(self._nstart_block_idx, self._nstart_block_idx +
-                      self._nload_block_num, 1))
-            self._move_sample(lsample)
-            self._nstart_block_idx += self._nload_block_num
-
-        if self._que_sample.empty():
-            self._nstart_block_idx = 0
-            return None
-        #cal all frame num
-        ncur_len = 0
-        lod = [0]
-        samples = []
-        bat_feature = np.zeros((nbatch_size, self._nframe_dim))
-        for i in range(nbatch_size):
-            # empty clear zero 
-            if self._que_sample.empty():
-                self._nstart_block_idx = 0
-            # copy
+    def __init__(self,
+                 feature_file_list,
+                 label_file_list,
+                 drop_frame_len=512,
+                 process_num=10,
+                 sample_buffer_size=1024,
+                 sample_info_buffer_size=1024,
+                 batch_buffer_size=1024,
+                 shuffle_block_num=10,
+                 random_seed=0,
+                 verbose=0):
+        """Store the configuration and build the initial bucket list.
+
+        The meaning of every argument is given in the class docstring.
+        """
+        self._feature_file_list = feature_file_list
+        self._label_file_list = label_file_list
+        self._drop_frame_len = drop_frame_len
+        self._shuffle_block_num = shuffle_block_num
+        self._block_info_list = None
+        self._rng = random.Random(random_seed)
+        self._bucket_list = None
+        # Default to "no transformation" so that iterating before
+        # set_transformers() is called does not fail with AttributeError
+        # inside the worker processes.
+        self._transformers = []
+        # Needs the file lists, shuffle_block_num and the rng set above.
+        self.generate_bucket_list(True)
+        self._order_id = 0
+        self._manager = Manager()
+        self._sample_buffer_size = sample_buffer_size
+        self._sample_info_buffer_size = sample_info_buffer_size
+        self._batch_buffer_size = batch_buffer_size
+        self._process_num = process_num
+        self._verbose = verbose
+        # Shared boolean flag; tasks call it (through `notify`) on failure
+        # and the consuming loops compare it against False to stop.
+        self._force_exit = ForceExitWrapper(self._manager.Value('b', False))
+
+    def generate_bucket_list(self, is_shuffle):
+        """(Re)build the list of SampleInfoBucket objects.
+
+        On the first call the feature and label list files are parsed. Both
+        files must have the same number of lines and are read two lines at
+        a time: the first line of a pair is passed on as the binary data
+        path, the second as the matching description path. The parsed block
+        list is cached for subsequent calls.
+
+        Args:
+            is_shuffle (bool): Shuffle the cached block list (in place)
+                               before grouping it into buckets.
+        """
+        if self._block_info_list is None:
+            # Context managers make sure the list files are closed.
+            with open(self._feature_file_list) as feature_list_file:
+                block_feature_info_lines = feature_list_file.readlines()
+            with open(self._label_file_list) as label_list_file:
+                block_label_info_lines = label_list_file.readlines()
+            assert len(block_feature_info_lines) == len(block_label_info_lines)
+            self._block_info_list = []
+            for i in xrange(0, len(block_feature_info_lines), 2):
+                block_info = (block_feature_info_lines[i],
+                              block_feature_info_lines[i + 1],
+                              block_label_info_lines[i],
+                              block_label_info_lines[i + 1])
+                self._block_info_list.append(
+                    [line.strip() for line in block_info])
+
+        if is_shuffle:
+            self._rng.shuffle(self._block_info_list)
+
+        # Group `shuffle_block_num` consecutive blocks into one bucket; the
+        # last bucket may hold fewer blocks.
+        self._bucket_list = []
+        for i in xrange(0, len(self._block_info_list), self._shuffle_block_num):
+            bucket_block_info = self._block_info_list[i:i +
+                                                      self._shuffle_block_num]
+            self._bucket_list.append(
+                SampleInfoBucket(
+                    [info[0] for info in bucket_block_info],
+                    [info[1] for info in bucket_block_info],
+                    [info[2] for info in bucket_block_info],
+                    [info[3] for info in bucket_block_info]))
+
+    # @TODO make this configurable
+    def set_transformers(self, transformers):
+        """Set the transformers applied to every loaded sample.
+
+        Args:
+            transformers (iterable): Objects providing
+                `perform_trans(sample_data)`; they are applied in iteration
+                order and each receives the result of the previous one.
+        """
+        self._transformers = transformers
+
+    def _sample_generator(self):
+        """Yield the processed samples of one epoch in a fixed order.
+
+        A feeding thread pushes (sample_info, order_id) pairs into a shared
+        queue, followed by one EpochEndSignal per worker. `process_num`
+        worker processes read the raw bytes from disk, decode them, apply
+        the transformers and put the result on the sample queue strictly in
+        `order_id` order. This generator polls the sample queue until every
+        worker has reported EpochEndSignal or a forced exit was requested.
+
+        Yields:
+            tuple: The sample data returned by the last transformer; before
+                   transformation it is (feature_data, label_data).
+        """
+        sample_info_queue = self._manager.Queue(self._sample_info_buffer_size)
+        sample_queue = self._manager.Queue(self._sample_buffer_size)
+        self._order_id = 0
+
+        @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
+        def ordered_feeding_task(sample_info_queue):
+            for sample_info_bucket in self._bucket_list:
+                try:
+                    sample_info_list = \
+                            sample_info_bucket.generate_sample_info_list()
+                except Exception as e:
+                    raise CriticalException(e)
+                else:
+                    self._rng.shuffle(sample_info_list)  # do shuffle here
+                    for sample_info in sample_info_list:
+                        sample_info_queue.put((sample_info, self._order_id))
+                        self._order_id += 1
+
+            # One end marker per worker so that every worker terminates.
+            for i in xrange(self._process_num):
+                sample_info_queue.put(EpochEndSignal())
+
+        feeding_thread = Thread(
+            target=ordered_feeding_task, args=(sample_info_queue, ))
+        feeding_thread.daemon = True
+        feeding_thread.start()
+
+        @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
+        def ordered_processing_task(sample_info_queue, sample_queue, out_order):
+            if self._verbose == 0:
+                signal.signal(signal.SIGTERM, suppress_signal)
+                signal.signal(signal.SIGINT, suppress_signal)
+
+            def read_bytes(fpath, start, size):
+                # The data files are binary, so open them in 'rb' mode; the
+                # context manager closes the file even if seek/read fails.
+                try:
+                    with open(fpath, 'rb') as f:
+                        f.seek(start, 0)
+                        return f.read(size)
+                except Exception as e:
+                    raise CriticalException(e)
+
+            ins = sample_info_queue.get()
+
+            while not isinstance(ins, EpochEndSignal):
+                sample_info, order_id = ins
+
+                feature_bytes = read_bytes(sample_info.feature_bin_path,
+                                           sample_info.feature_start,
+                                           sample_info.feature_size)
+
+                assert sample_info.feature_frame_num * sample_info.feature_dim * 4 \
+                        == len(feature_bytes), \
+                        (sample_info.feature_bin_path,
+                         sample_info.feature_frame_num,
+                         sample_info.feature_dim,
+                         len(feature_bytes))
+
+                label_bytes = read_bytes(sample_info.label_bin_path,
+                                         sample_info.label_start,
+                                         sample_info.label_size)
+
+                # The message reports label_frame_num; SampleInfo has no
+                # `label_array` attribute, so referencing it here would
+                # replace the assertion failure with an AttributeError.
+                assert sample_info.label_frame_num * 4 == len(label_bytes), (
+                    sample_info.label_bin_path, sample_info.label_frame_num,
+                    len(label_bytes))
+
+                label_array = struct.unpack('I' * sample_info.label_frame_num,
+                                            label_bytes)
+                label_data = np.array(
+                    label_array, dtype='int64').reshape(
+                        (sample_info.label_frame_num, 1))
+
+                feature_frame_num = sample_info.feature_frame_num
+                feature_dim = sample_info.feature_dim
+                feature_array = struct.unpack('f' * feature_frame_num *
+                                              feature_dim, feature_bytes)
+                feature_data = np.array(
+                    feature_array, dtype='float32').reshape(
+                        (feature_frame_num, feature_dim))
+
+                sample_data = (feature_data, label_data)
+                for transformer in self._transformers:
+                    # @TODO(pkuyym) to make transfomer only accept feature_data
+                    sample_data = transformer.perform_trans(sample_data)
+
+                # Wait for this sample's turn so that the output order
+                # follows order_id regardless of which worker finishes first.
+                while order_id != out_order[0]:
+                    time.sleep(0.001)
+
+                # drop long sentence
+                if self._drop_frame_len == -1 or \
+                        self._drop_frame_len >= sample_data[0].shape[0]:
+                    sample_queue.put(sample_data)
+
+                # Dropped samples advance the counter too, otherwise the
+                # next sample would wait forever.
+                out_order[0] += 1
+                ins = sample_info_queue.get()
+
+            sample_queue.put(EpochEndSignal())
+
+        out_order = self._manager.list([0])
+        args = (sample_info_queue, sample_queue, out_order)
+        workers = [
+            Process(
+                target=ordered_processing_task, args=args)
+            for _ in xrange(self._process_num)
+        ]
+
+        for w in workers:
+            w.daemon = True
+            w.start()
+
+        finished_process_num = 0
+
+        # `== False` is intentional: _force_exit is a wrapper object whose
+        # __eq__ compares the shared flag, so `not self._force_exit` would
+        # test the wrapper itself instead of the flag.
+        while self._force_exit == False:
+            try:
+                sample = sample_queue.get_nowait()
+            except Queue.Empty:
+                time.sleep(0.001)
+            else:
+                if isinstance(sample, EpochEndSignal):
+                    finished_process_num += 1
+                    if finished_process_num >= self._process_num:
+                        break
+                    else:
+                        continue
+
+                yield sample
+
+    def batch_iterator(self, batch_size, minimum_batch_size):
+        def batch_to_ndarray(batch_samples, lod):
+            # Concatenate the samples of one batch along the frame axis.
+            # `lod` holds the cumulative frame counts (starting with 0), so
+            # lod[-1] is the total number of frames in the batch.
+            # Returns (batch_feature, batch_label) as float32 / int64 arrays.
+            assert len(batch_samples)
+            # Frame dimension is taken from the first sample's feature array.
+            frame_dim = batch_samples[0][0].shape[1]
+            batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32")
+            batch_label = np.zeros((lod[-1], 1), dtype="int64")
+            start = 0
+            for sample in batch_samples:
+                # sample[0] is the feature array, sample[1] the label array.
+                frame_num = sample[0].shape[0]
+                batch_feature[start:start + frame_num, :] = sample[0]
+                batch_label[start:start + frame_num, :] = sample[1]
+                start += frame_num
+            return (batch_feature, batch_label)
+
+        @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
+        def batch_assembling_task(sample_generator, batch_queue):
+            # Group the samples produced by sample_generator() into batches
+            # of `batch_size` and put (feature, label, lod) on batch_queue.
+            batch_samples = []
+            lod = [0]
+            for sample in sample_generator():
+                batch_samples.append(sample)
+                # Cumulative frame count after adding this sample.
+                lod.append(lod[-1] + sample[0].shape[0])
+                if len(batch_samples) == batch_size:
+                    (batch_feature, batch_label) = batch_to_ndarray(
+                        batch_samples, lod)
+                    batch_queue.put((batch_feature, batch_label, lod))
+                    batch_samples = []
+                    lod = [0]
+
+            # The trailing partial batch is emitted only if it holds at
+            # least `minimum_batch_size` samples; otherwise it is discarded.
+            if len(batch_samples) >= minimum_batch_size:
+                (batch_feature, batch_label) = batch_to_ndarray(batch_samples,
+                                                                lod)
+                batch_queue.put((batch_feature, batch_label, lod))
+
+            # Tell the consumer that the epoch is over.
+            batch_queue.put(EpochEndSignal())
+
+        batch_queue = Queue.Queue(self._batch_buffer_size)
+
+        assembling_thread = Thread(
+            target=batch_assembling_task,
+            args=(self._sample_generator, batch_queue))
+        assembling_thread.daemon = True
+        assembling_thread.start()
+
+        while self._force_exit == False:
+            try:
+                batch_data = batch_queue.get_nowait()
+            except Queue.Empty:
+                time.sleep(0.001)
            else:
-                (one_feature, one_label) = self._que_sample.get()
-                samples.append((one_feature, one_label))
-                ncur_len += one_feature.shape[0]
-                lod.append(ncur_len)
-
-        bat_feature = np.zeros((ncur_len, self._nframe_dim), dtype="float32")
-        bat_label = np.zeros((ncur_len, 1), dtype="int64")
-        ncur_len = 0
-        for sample in samples:
-            one_feature = sample[0]
-            one_label = sample[1]
-            nframe_num = one_feature.shape[0]
-            nstart = ncur_len
-            nend = ncur_len + nframe_num
-            bat_feature[nstart:nend, :] = one_feature
-            bat_label[nstart:nend, :] = one_label
-            ncur_len += nframe_num
-        return (bat_feature, bat_label, lod)
-
-    def set_trans(self, ltrans):
-        """ set transform list
-        Args:
-            ltrans(list): data tranform list
-        Returns:
-            None
-        """
-        self._ltrans = ltrans
-
-    def _load_block(self, lblock_id):
-        """read blocks
-        """
-        lsample = []
-        for id in lblock_id:
-            self._load_one_block(lsample, id)
-
-        # transform sample
-        for (nidx, sample) in enumerate(lsample):
-            for trans in self._ltrans:
-                sample = trans.perform_trans(sample)
-            lsample[nidx] = sample
-
-        return lsample
-
-    def load_block(self, lblock_id):
-        """read blocks
-        Args:
-            lblock_id(list):the block list id
-        Returns:
-            None
-        """
-        lsample = []
-        for id in lblock_id:
-            self._load_one_block(lsample, id)
-
-        # transform sample
-        for (nidx, sample) in enumerate(lsample):
-            for trans in self._ltrans:
-                sample = trans.perform_trans(sample)
-            lsample[nidx] = sample
-
-        return lsample
-
-    def _move_sample(self, lsample):
-        """move sample to queue
-        Args:
-            lsample(list): one block of samples read from disk
-        Returns:
-            None
-        """
-        # random
-        random.shuffle(lsample)
-
-        for sample in lsample:
-            self._que_sample.put(sample)
+                if isinstance(batch_data, EpochEndSignal):
+                    break
+                yield batch_data
--- a/fluid/DeepASR/data_utils/util.py
+++ b/fluid/DeepASR/data_utils/util.py
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import sys
+from six import reraise
+from tblib import Traceback
+
+import numpy as np


 def to_lodtensor(data, place):
@@ -28,3 +33,42 @@ def lodtensor_to_ndarray(lod_tensor):
    for i in xrange(np.product(dims)):
        ret.ravel()[i] = lod_tensor.get_float_element(i)
    return ret, lod_tensor.lod()
+
+
+class CriticalException(Exception):
+    pass
+
+
+def suppress_signal(signo, stack_frame):
+    pass
+
+
+def suppress_complaints(verbose, notify=None):
+    def decorator_maker(func):
+        def suppress_warpper(*args, **kwargs):
+            try:
+                func(*args, **kwargs)
+            except:
+                et, ev, tb = sys.exc_info()
+
+                if notify is not None:
+                    notify(except_type=et, except_value=ev, traceback=tb)
+
+                if verbose == 1 or isinstance(ev, CriticalException):
+                    reraise(et, ev, Traceback(tb).as_traceback())
+
+        return suppress_warpper
+
+    return decorator_maker
+
+
+class ForceExitWrapper(object):
+    def __init__(self, exit_flag):
+        self._exit_flag = exit_flag
+
+    @suppress_complaints(verbose=0)
+    def __call__(self, *args, **kwargs):
+        self._exit_flag.value = True
+
+    def __eq__(self, flag):
+        return self._exit_flag.value == flag
--- a/fluid/DeepASR/infer.py
+++ b/fluid/DeepASR/infer.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import argparse
+import paddle.v2.fluid as fluid
+import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
+import data_utils.augmentor.trans_add_delta as trans_add_delta
+import data_utils.augmentor.trans_splice as trans_splice
+import data_utils.data_reader as reader
+from data_utils.util import lodtensor_to_ndarray
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Inference for stacked LSTMP model.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type. (default: %(default)s)')
+    parser.add_argument(
+        '--mean_var',
+        type=str,
+        default='data/global_mean_var_search26kHr',
+        help="The path for feature's global mean and variance. "
+        "(default: %(default)s)")
+    parser.add_argument(
+        '--infer_feature_lst',
+        type=str,
+        default='data/infer_feature.lst',
+        help='The feature list path for inference. (default: %(default)s)')
+    parser.add_argument(
+        '--infer_label_lst',
+        type=str,
+        default='data/infer_label.lst',
+        help='The label list path for inference. (default: %(default)s)')
+    parser.add_argument(
+        '--model_save_path',
+        type=str,
+        default='./checkpoints/deep_asr.pass_0.model/',
+        help='The directory for saving model. (default: %(default)s)')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def split_infer_result(infer_seq, lod):
+    infer_batch = []
+    for i in xrange(0, len(lod[0]) - 1):
+        infer_batch.append(infer_seq[lod[0][i]:lod[0][i + 1]])
+    return infer_batch
+
+
+def infer(args):
+    """ Gets one batch of feature data and predicts labels for each sample.
+    """
+
+    if not os.path.exists(args.model_save_path):
+        raise IOError("Invalid model path!")
+
+    place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # load model
+    [infer_program, feed_dict,
+     fetch_targets] = fluid.io.load_inference_model(args.model_save_path, exe)
+
+    ltrans = [
+        trans_add_delta.TransAddDelta(2, 2),
+        trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
+        trans_splice.TransSplice()
+    ]
+
+    infer_data_reader = reader.DataReader(args.infer_feature_lst,
+                                          args.infer_label_lst)
+    infer_data_reader.set_transformers(ltrans)
+
+    feature_t = fluid.LoDTensor()
+    one_batch = infer_data_reader.batch_iterator(args.batch_size, 1).next()
+    (features, labels, lod) = one_batch
+    feature_t.set(features, place)
+    feature_t.set_lod([lod])
+
+    results = exe.run(infer_program,
+                      feed={feed_dict[0]: feature_t},
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+
+    probs, lod = lodtensor_to_ndarray(results[0])
+    preds = probs.argmax(axis=1)
+    infer_batch = split_infer_result(preds, lod)
+    for index, sample in enumerate(infer_batch):
+        print("result %d: " % index, sample, '\n')
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    infer(args)
--- a/fluid/DeepASR/model_utils/__init__.py
+++ b/fluid/DeepASR/model_utils/__init__.py
--- a/fluid/DeepASR/model_utils/model.py
+++ b/fluid/DeepASR/model_utils/model.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def stacked_lstmp_model(hidden_dim,
+                        proj_dim,
+                        stacked_num,
+                        class_num,
+                        parallel=False,
+                        is_train=True):
+    """ The model for DeepASR. The main structure is composed of stacked 
+        identical LSTMP (LSTM with recurrent projection) layers.
+
+        In the training and validation phases, the feed dictionary has keys
+        'feature' and 'label', fed with LoDTensors holding the feature and
+        label data respectively. For inference, only `feature` is needed.
+
+    Args:
+        hidden_dim(int): The hidden state's dimension of the LSTMP layer.
+        proj_dim(int): The projection size of the LSTMP layer.
+        stacked_num(int): The number of stacked LSTMP layers.
+        parallel(bool): Run in parallel or not, default `False`.
+        is_train(bool): Run in training phase or not, default `True`.
+        class_num(int): The number of output classes.
+    """
+
+    # network configuration
+    def _net_conf(feature, label):
+        seq_conv1 = fluid.layers.sequence_conv(
+            input=feature,
+            num_filters=1024,
+            filter_size=3,
+            filter_stride=1,
+            bias_attr=True)
+        bn1 = fluid.layers.batch_norm(
+            input=seq_conv1,
+            act="sigmoid",
+            is_test=not is_train,
+            momentum=0.9,
+            epsilon=1e-05,
+            data_layout='NCHW')
+
+        stack_input = bn1
+        for i in range(stacked_num):
+            fc = fluid.layers.fc(input=stack_input,
+                                 size=hidden_dim * 4,
+                                 bias_attr=True)
+            proj, cell = fluid.layers.dynamic_lstmp(
+                input=fc,
+                size=hidden_dim * 4,
+                proj_size=proj_dim,
+                bias_attr=True,
+                use_peepholes=True,
+                is_reverse=False,
+                cell_activation="tanh",
+                proj_activation="tanh")
+            bn = fluid.layers.batch_norm(
+                input=proj,
+                act="sigmoid",
+                is_test=not is_train,
+                momentum=0.9,
+                epsilon=1e-05,
+                data_layout='NCHW')
+            stack_input = bn
+
+        prediction = fluid.layers.fc(input=stack_input,
+                                     size=class_num,
+                                     act='softmax')
+
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+        return prediction, avg_cost, acc
+
+    # data feeder
+    feature = fluid.layers.data(
+        name="feature", shape=[-1, 120 * 11], dtype="float32", lod_level=1)
+    label = fluid.layers.data(
+        name="label", shape=[-1, 1], dtype="int64", lod_level=1)
+
+    if parallel:
+        # When the execution place is set to CUDAPlace, the program will run
+        # on all GPUs listed in $CUDA_VISIBLE_DEVICES. Otherwise, the program
+        # will run on all CPU devices.
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            feat_ = pd.read_input(feature)
+            label_ = pd.read_input(label)
+            prediction, avg_cost, acc = _net_conf(feat_, label_)
+            for out in [avg_cost, acc]:
+                pd.write_output(out)
+
+        # Average the loss and accuracy across all devices.
+        avg_cost, acc = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        acc = fluid.layers.mean(x=acc)
+    else:
+        prediction, avg_cost, acc = _net_conf(feature, label)
+
+    return prediction, avg_cost, acc
--- a/fluid/DeepASR/tools/_init_paths.py
+++ b/fluid/DeepASR/tools/_init_paths.py
+"""Add the parent directory (the project path) to sys.path."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os.path
+import sys
+
+
+def add_path(path):
+    if path not in sys.path:
+        sys.path.insert(0, path)
+
+
+this_dir = os.path.dirname(__file__)
+
+# Add the project path to sys.path
+proj_path = os.path.join(this_dir, '..')
+add_path(proj_path)
--- a/fluid/DeepASR/tools/profile.py
+++ b/fluid/DeepASR/tools/profile.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import numpy as np
+import argparse
+import time
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.profiler as profiler
+import _init_paths
+import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
+import data_utils.augmentor.trans_add_delta as trans_add_delta
+import data_utils.augmentor.trans_splice as trans_splice
+import data_utils.data_reader as reader
+from model_utils.model import stacked_lstmp_model
+from data_utils.util import lodtensor_to_ndarray
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Profiling for the stacked LSTMP model.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--minimum_batch_size',
+        type=int,
+        default=1,
+        help='The minimum sequence number of a batch data. '
+        '(default: %(default)d)')
+    parser.add_argument(
+        '--stacked_num',
+        type=int,
+        default=5,
+        help='Number of lstmp layers to stack. (default: %(default)d)')
+    parser.add_argument(
+        '--proj_dim',
+        type=int,
+        default=512,
+        help='Project size of lstmp unit. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=1024,
+        help='Hidden size of lstmp unit. (default: %(default)d)')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.002,
+        help='Learning rate used to train. (default: %(default)f)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type. (default: %(default)s)')
+    parser.add_argument(
+        '--parallel', action='store_true', help='If set, run in parallel.')
+    parser.add_argument(
+        '--mean_var',
+        type=str,
+        default='data/global_mean_var_search26kHr',
+        help='mean var path')
+    parser.add_argument(
+        '--feature_lst',
+        type=str,
+        default='data/feature.lst',
+        help='feature list path.')
+    parser.add_argument(
+        '--label_lst',
+        type=str,
+        default='data/label.lst',
+        help='label list path.')
+    parser.add_argument(
+        '--max_batch_num',
+        type=int,
+        default=10,
+        help='Maximum number of batches for profiling. (default: %(default)d)')
+    parser.add_argument(
+        '--first_batches_to_skip',
+        type=int,
+        default=1,
+        help='Number of first batches to skip for profiling. '
+        '(default: %(default)d)')
+    parser.add_argument(
+        '--print_train_acc',
+        action='store_true',
+        help='If set, output training accuray.')
+    parser.add_argument(
+        '--sorted_key',
+        type=str,
+        default='total',
+        choices=['None', 'total', 'calls', 'min', 'max', 'ave'],
+        help='Different types of time to sort the profiling report. '
+        '(default: %(default)s)')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def profile(args):
+    """profile the training process.
+    """
+
+    if not args.first_batches_to_skip < args.max_batch_num:
+        raise ValueError("arg 'first_batches_to_skip' must be smaller than "
+                         "'max_batch_num'.")
+    if not args.first_batches_to_skip >= 0:
+        raise ValueError(
+            "arg 'first_batches_to_skip' must not be smaller than 0.")
+
+    _, avg_cost, accuracy = stacked_lstmp_model(
+        hidden_dim=args.hidden_dim,
+        proj_dim=args.proj_dim,
+        stacked_num=args.stacked_num,
+        class_num=1749,
+        parallel=args.parallel)
+
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    adam_optimizer.minimize(avg_cost)
+
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    ltrans = [
+        trans_add_delta.TransAddDelta(2, 2),
+        trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
+        trans_splice.TransSplice()
+    ]
+
+    data_reader = reader.DataReader(args.feature_lst, args.label_lst)
+    data_reader.set_transformers(ltrans)
+
+    feature_t = fluid.LoDTensor()
+    label_t = fluid.LoDTensor()
+
+    sorted_key = None if args.sorted_key is 'None' else args.sorted_key
+    with profiler.profiler(args.device, sorted_key) as prof:
+        frames_seen, start_time = 0, 0.0
+        for batch_id, batch_data in enumerate(
+                data_reader.batch_iterator(args.batch_size,
+                                           args.minimum_batch_size)):
+            if batch_id >= args.max_batch_num:
+                break
+            if args.first_batches_to_skip == batch_id:
+                profiler.reset_profiler()
+                start_time = time.time()
+                frames_seen = 0
+            # load_data
+            (features, labels, lod) = batch_data
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
+
+            frames_seen += lod[-1]
+
+            outs = exe.run(fluid.default_main_program(),
+                           feed={"feature": feature_t,
+                                 "label": label_t},
+                           fetch_list=[avg_cost, accuracy],
+                           return_numpy=False)
+
+            if args.print_train_acc:
+                print("Batch %d acc: %f" %
+                      (batch_id, lodtensor_to_ndarray(outs[1])[0]))
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        time_consumed = time.time() - start_time
+        frames_per_sec = frames_seen / time_consumed
+        print("\nTime consumed: %f s, performance: %f frames/s." %
+              (time_consumed, frames_per_sec))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    profile(args)
--- a/fluid/DeepASR/stacked_dynamic_lstm.py
+++ b/fluid/DeepASR/stacked_dynamic_lstm.py
@@ -2,26 +2,34 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import sys
+import os
 import numpy as np
 import argparse
 import time

-import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import paddle.v2.fluid.profiler as profiler
-import data_utils.trans_mean_variance_norm as trans_mean_variance_norm
-import data_utils.trans_add_delta as trans_add_delta
-import data_utils.trans_splice as trans_splice
+import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
+import data_utils.augmentor.trans_add_delta as trans_add_delta
+import data_utils.augmentor.trans_splice as trans_splice
 import data_utils.data_reader as reader
+from data_utils.util import lodtensor_to_ndarray
+from model_utils.model import stacked_lstmp_model


 def parse_args():
-    parser = argparse.ArgumentParser("LSTM model benchmark.")
+    parser = argparse.ArgumentParser("Training for stacked LSTMP model.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--minimum_batch_size',
+        type=int,
+        default=1,
+        help='The minimum sequence number of a batch data. '
+        '(default: %(default)d)')
    parser.add_argument(
        '--stacked_num',
        type=int,
@@ -42,6 +50,11 @@ def parse_args():
        type=int,
        default=100,
        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--print_per_batches',
+        type=int,
+        default=100,
+        help='Interval to print training accuracy. (default: %(default)d)')
    parser.add_argument(
        '--learning_rate',
        type=float,
@@ -54,107 +67,68 @@ def parse_args():
        choices=['CPU', 'GPU'],
        help='The device type. (default: %(default)s)')
    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
+        '--parallel', action='store_true', help='If set, run in parallel.')
+    parser.add_argument(
+        '--mean_var',
+        type=str,
+        default='data/global_mean_var_search26kHr',
+        help="The path for feature's global mean and variance. "
+        "(default: %(default)s)")
+    parser.add_argument(
+        '--train_feature_lst',
+        type=str,
+        default='data/feature.lst',
+        help='The feature list path for training. (default: %(default)s)')
+    parser.add_argument(
+        '--train_label_lst',
+        type=str,
+        default='data/label.lst',
+        help='The label list path for training. (default: %(default)s)')
+    parser.add_argument(
+        '--val_feature_lst',
+        type=str,
+        default='data/val_feature.lst',
+        help='The feature list path for validation. (default: %(default)s)')
    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
+        '--val_label_lst',
+        type=str,
+        default='data/val_label.lst',
+        help='The label list path for validation. (default: %(default)s)')
    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument('--mean_var', type=str, help='mean var path')
-    parser.add_argument('--feature_lst', type=str, help='mean var path')
-    parser.add_argument('--label_lst', type=str, help='mean var path')
+        '--model_save_dir',
+        type=str,
+        default='./checkpoints',
+        help="The directory for saving model. Do not save model if set to "
+        "''. (default: %(default)s)")
    args = parser.parse_args()
    return args


 def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
    print('-----------  Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


-def dynamic_lstmp_model(hidden_dim,
-                        proj_dim,
-                        stacked_num,
-                        class_num=1749,
-                        is_train=True):
-    feature = fluid.layers.data(
-        name="feature", shape=[-1, 120 * 11], dtype="float32", lod_level=1)
-
-    seq_conv1 = fluid.layers.sequence_conv(
-        input=feature,
-        num_filters=1024,
-        filter_size=3,
-        filter_stride=1,
-        bias_attr=True)
-    bn1 = fluid.layers.batch_norm(
-        input=seq_conv1,
-        act="sigmoid",
-        is_test=False,
-        momentum=0.9,
-        epsilon=1e-05,
-        data_layout='NCHW')
-
-    stack_input = bn1
-    for i in range(stacked_num):
-        fc = fluid.layers.fc(input=stack_input,
-                             size=hidden_dim * 4,
-                             bias_attr=True)
-        proj, cell = fluid.layers.dynamic_lstmp(
-            input=fc,
-            size=hidden_dim * 4,
-            proj_size=proj_dim,
-            bias_attr=True,
-            use_peepholes=True,
-            is_reverse=False,
-            cell_activation="tanh",
-            proj_activation="tanh")
-        bn = fluid.layers.batch_norm(
-            input=proj,
-            act="sigmoid",
-            is_test=False,
-            momentum=0.9,
-            epsilon=1e-05,
-            data_layout='NCHW')
-        stack_input = bn
-
-    prediction = fluid.layers.fc(input=stack_input,
-                                 size=class_num,
-                                 act='softmax')
-
-    if not is_train: return feature, prediction
-
-    label = fluid.layers.data(
-        name="label", shape=[-1, 1], dtype="int64", lod_level=1)
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    return prediction, label, avg_cost
-
-
 def train(args):
-    if args.use_cprof:
-        pr = cProfile.Profile()
-        pr.enable()
+    """Train pass by pass, running validation after each pass.
+    """

-    prediction, label, avg_cost = dynamic_lstmp_model(
-        args.hidden_dim, args.proj_dim, args.stacked_num)
+    prediction, avg_cost, accuracy = stacked_lstmp_model(
+        hidden_dim=args.hidden_dim,
+        proj_dim=args.proj_dim,
+        stacked_num=args.stacked_num,
+        class_num=1749,
+        parallel=args.parallel)

    adam_optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    adam_optimizer.minimize(avg_cost)

-    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
-
-    # clone from default main program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        test_accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
-        test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
+    # program for test
+    test_program = fluid.default_main_program().clone()
+    with fluid.program_guard(test_program):
+        test_program = fluid.io.get_inference_program([avg_cost, accuracy])

    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
@@ -166,62 +140,90 @@ def train(args):
        trans_splice.TransSplice()
    ]

-    data_reader = reader.DataRead(args.feature_lst, args.label_lst)
-    data_reader.set_trans(ltrans)
-
-    res_feature = fluid.LoDTensor()
-    res_label = fluid.LoDTensor()
+    feature_t = fluid.LoDTensor()
+    label_t = fluid.LoDTensor()
+
+    # validation
+    def test(exe):
+        # If test data not found, return invalid cost and accuracy
+        if not (os.path.exists(args.val_feature_lst) and
+                os.path.exists(args.val_label_lst)):
+            return -1.0, -1.0
+        # test data reader
+        test_data_reader = reader.DataReader(args.val_feature_lst,
+                                             args.val_label_lst)
+        test_data_reader.set_transformers(ltrans)
+        test_costs, test_accs = [], []
+        for batch_id, batch_data in enumerate(
+                test_data_reader.batch_iterator(args.batch_size,
+                                                args.minimum_batch_size)):
+            # load_data
+            (features, labels, lod) = batch_data
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
+
+            cost, acc = exe.run(test_program,
+                                feed={"feature": feature_t,
+                                      "label": label_t},
+                                fetch_list=[avg_cost, accuracy],
+                                return_numpy=False)
+            test_costs.append(lodtensor_to_ndarray(cost)[0])
+            test_accs.append(lodtensor_to_ndarray(acc)[0])
+        return np.mean(test_costs), np.mean(test_accs)
+
+    # train data reader
+    train_data_reader = reader.DataReader(args.train_feature_lst,
+                                          args.train_label_lst, -1)
+    train_data_reader.set_transformers(ltrans)
+    # train
    for pass_id in xrange(args.pass_num):
        pass_start_time = time.time()
-        words_seen = 0
-        accuracy.reset(exe)
-        batch_id = 0
-        while True:
+        for batch_id, batch_data in enumerate(
+                train_data_reader.batch_iterator(args.batch_size,
+                                                 args.minimum_batch_size)):
            # load_data
-            one_batch = data_reader.get_one_batch(args.batch_size)
-            if one_batch == None:
-                break
-            (bat_feature, bat_label, lod) = one_batch
-            res_feature.set(bat_feature, place)
-            res_feature.set_lod([lod])
-            res_label.set(bat_label, place)
-            res_label.set_lod([lod])
-
-            batch_id += 1
-
-            words_seen += lod[-1]
-
-            loss, acc = exe.run(
-                fluid.default_main_program(),
-                feed={"feature": res_feature,
-                      "label": res_label},
-                fetch_list=[avg_cost] + accuracy.metrics,
-                return_numpy=False)
-            train_acc = accuracy.eval(exe)
-            print("acc:", lodtensor_to_ndarray(loss))
-
+            (features, labels, lod) = batch_data
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
+
+            cost, acc = exe.run(fluid.default_main_program(),
+                                feed={"feature": feature_t,
+                                      "label": label_t},
+                                fetch_list=[avg_cost, accuracy],
+                                return_numpy=False)
+
+            if batch_id > 0 and (batch_id % args.print_per_batches == 0):
+                print("\nBatch %d, train cost: %f, train acc: %f" %
+                      (batch_id, lodtensor_to_ndarray(cost)[0],
+                       lodtensor_to_ndarray(acc)[0]))
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        # run test
+        val_cost, val_acc = test(exe)
+        # save model
+        if args.model_save_dir != '':
+            model_path = os.path.join(
+                args.model_save_dir, "deep_asr.pass_" + str(pass_id) + ".model")
+            fluid.io.save_inference_model(model_path, ["feature"],
+                                          [prediction], exe)
+        # calculate the time consumed by this pass
        pass_end_time = time.time()
        time_consumed = pass_end_time - pass_start_time
-        words_per_sec = words_seen / time_consumed
-
-
-def lodtensor_to_ndarray(lod_tensor):
-    dims = lod_tensor.get_dims()
-    ret = np.zeros(shape=dims).astype('float32')
-    for i in xrange(np.product(dims)):
-        ret.ravel()[i] = lod_tensor.get_float_element(i)
-    return ret, lod_tensor.lod()
+        # print info at pass end
+        print("\nPass %d, time consumed: %f s, val cost: %f, val acc: %f\n" %
+              (pass_id, time_consumed, val_cost, val_acc))


 if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)

-    if args.infer_only:
-        pass
-    else:
-        if args.use_nvprof and args.device == 'GPU':
-            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-                train(args)
-        else:
-            train(args)
+    if args.model_save_dir != '' and not os.path.exists(args.model_save_dir):
+        os.mkdir(args.model_save_dir)
+
+    train(args)
--- a/fluid/adversarial/advbox/attacks/saliency.py
+++ b/fluid/adversarial/advbox/attacks/saliency.py
+"""
+This module provides the attack method that implements JSMA.
+"""
+from __future__ import division
+
+import logging
+import random
+import numpy as np
+
+from .base import Attack
+
+
+class SaliencyMapAttack(Attack):
+    """
+    Implements the Saliency Map Attack.
+    The Jacobian-based Saliency Map Approach (Papernot et al. 2016).
+    Paper link: https://arxiv.org/pdf/1511.07528.pdf
+    """
+
+    def _apply(self,
+               adversary,
+               max_iter=2000,
+               fast=True,
+               theta=0.1,
+               max_perturbations_per_pixel=7):
+        """
+        Apply the JSMA attack.
+        Args:
+            adversary(Adversary): The Adversary object.
+            max_iter(int): The max iterations.
+            fast(bool): If True, skip evaluating the pixel influence on the sum of residual classes.
+            theta(float): Perturbation per pixel relative to [min, max] range.
+            max_perturbations_per_pixel(int): The max count of perturbation per pixel.
+        Return:
+            adversary: The Adversary object.
+        """
+        assert adversary is not None
+
+        if not adversary.is_targeted_attack or (adversary.target_label is None):
+            target_labels = self._generate_random_target(
+                adversary.original_label)
+        else:
+            target_labels = [adversary.target_label]
+
+        for target in target_labels:
+            original_image = adversary.original
+
+            # the mask defines the search domain
+            # each modified pixel with border value is set to zero in mask
+            mask = np.ones_like(original_image)
+
+            # count tracks how often each pixel was changed
+            counts = np.zeros_like(original_image)
+
+            labels = range(self.model.num_classes())
+            adv_img = original_image.copy()
+            min_, max_ = self.model.bounds()
+
+            for step in range(max_iter):
+                adv_img = np.clip(adv_img, min_, max_)
+                adv_label = np.argmax(self.model.predict(adv_img))
+                if adversary.try_accept_the_example(adv_img, adv_label):
+                    return adversary
+
+                # stop if mask is all zero
+                if not any(mask.flatten()):
+                    return adversary
+
+                logging.info('step = {}, original_label = {}, adv_label={}'.
+                             format(step, adversary.original_label, adv_label))
+
+                # get pixel location with highest influence on class
+                idx, p_sign = self._saliency_map(
+                    adv_img, target, labels, mask, fast=fast)
+
+                # apply perturbation
+                adv_img[idx] += -p_sign * theta * (max_ - min_)
+
+                # tracks number of updates for each pixel
+                counts[idx] += 1
+
+                # remove pixel from search domain if it hits the bound
+                if adv_img[idx] <= min_ or adv_img[idx] >= max_:
+                    mask[idx] = 0
+
+                # remove pixel if it was changed too often
+                if counts[idx] >= max_perturbations_per_pixel:
+                    mask[idx] = 0
+
+                adv_img = np.clip(adv_img, min_, max_)
+
+    def _generate_random_target(self, original_label):
+        """
+        Pick random target label(s) that differ from the original label.
+        Args:
+            original_label(int): Label that must not be chosen as a target.
+        Return:
+            list: Randomly drawn target labels (currently exactly one).
+        """
+        wanted = 1
+        class_count = self.model.num_classes()
+        assert wanted <= class_count - 1
+
+        # Draw one candidate more than needed so that enough candidates
+        # remain even if the original label is among the draws.
+        candidates = random.sample(range(class_count), wanted + 1)
+        return [c for c in candidates if c != original_label][:wanted]
+
+    def _saliency_map(self, image, target, labels, mask, fast=False):
+        """
+        Select the pixel with the most salient influence on the target class.
+        Args:
+            image(numpy.ndarray): Image the model gradients are computed for.
+            target(int): The target label.
+            labels(iterable): Class labels whose gradients are summed when
+                fast is False.
+            mask(numpy.ndarray): Search-domain mask multiplied into every
+                gradient; a zero entry removes that pixel's influence.
+            fast(bool): If True, skip the per-class gradients and use a
+                constant -1 as the influence on the residual classes.
+        Return:
+            idx: Index of the selected pixel, unravelled to mask's shape.
+            pix_sign: Sign of the target-class gradient at that pixel.
+        """
+        # influence of every pixel on the target class
+        target_grad = self.model.gradient(image, target) * mask
+
+        # influence of every pixel on the sum of the residual classes
+        if not fast:
+            residual_grad = np.sum(
+                [
+                    self.model.gradient(image, lbl) * mask - target_grad
+                    for lbl in labels
+                ],
+                axis=0)
+        else:
+            residual_grad = -np.ones_like(target_grad)
+
+        # signed saliency: magnitude product carrying the sign of the product,
+        # so both positive and negative perturbations are considered
+        saliency = (np.abs(target_grad) * np.abs(residual_grad) *
+                    np.sign(target_grad * residual_grad))
+
+        # the smallest saliency value marks the pixel to perturb
+        idx = np.unravel_index(np.argmin(saliency), mask.shape)
+        return idx, np.sign(target_grad)[idx]
+
+
+# JSMA is a shorter alias bound to the same class object as SaliencyMapAttack.
+JSMA = SaliencyMapAttack
--- a/fluid/adversarial/mnist_tutorial_jsma.py
+++ b/fluid/adversarial/mnist_tutorial_jsma.py
+"""
+JSMA (saliency map attack) demo on MNIST using the advbox tool.
+"""
+import matplotlib.pyplot as plt
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import numpy as np
+
+from advbox import Adversary
+from advbox.attacks.saliency import SaliencyMapAttack
+from advbox.models.paddle import PaddleModel
+
+
+def cnn_model(img):
+    """
+    Build the MNIST CNN: two conv-pool stages followed by a softmax layer.
+    Args:
+        img(Variable): the input image to be recognized
+    Returns:
+        Variable: the label prediction
+    """
+    features = img
+    # Both conv-pool stages share every setting except the filter count.
+    for filter_count in (20, 50):
+        features = fluid.nets.simple_img_conv_pool(
+            input=features,
+            num_filters=filter_count,
+            filter_size=5,
+            pool_size=2,
+            pool_stride=2,
+            act='relu')
+
+    return fluid.layers.fc(input=features, size=10, act='softmax')
+
+
+def main():
+    """
+    Advbox demo: run the saliency map attack on MNIST training samples.
+
+    Builds the CNN, loads its parameters from "./mnist/", attacks samples
+    one at a time and prints a running success count, stopping after 100
+    samples.
+    """
+    img_name = 'img'
+    label_name = 'label'
+
+    image_var = fluid.layers.data(
+        name=img_name, shape=[1, 28, 28], dtype='float32')
+    # gradient should flow to the input image
+    image_var.stop_gradient = False
+    label_var = fluid.layers.data(name=label_name, shape=[1], dtype='int64')
+    logits = cnn_model(image_var)
+    avg_cost = fluid.layers.mean(
+        x=fluid.layers.cross_entropy(input=logits, label=label_var))
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    main_program = fluid.default_main_program()
+
+    # one sample per batch, drawn from a shuffled MNIST training reader
+    sample_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=1)
+    # NOTE: the feeder is constructed here but not used further below.
+    feeder = fluid.DataFeeder(
+        feed_list=[img_name, label_name], place=place, program=main_program)
+
+    fluid.io.load_params(exe, "./mnist/", main_program=main_program)
+
+    # advbox demo
+    model = PaddleModel(main_program, img_name, label_name, logits.name,
+                        avg_cost.name, (-1, 1))
+    attack = SaliencyMapAttack(model)
+
+    success_num = 0
+    for total_num, batch in enumerate(sample_reader(), 1):
+        image, label = batch[0][0], batch[0][1]
+        result = attack(Adversary(image, label))
+        if result is not None and result.is_successful():
+            success_num += 1
+            print('original_label=%d, adversary examples label =%d' %
+                  (label, result.adversarial_label))
+        print('total num = %d, success num = %d ' % (total_num, success_num))
+        if total_num == 100:
+            break
+
+if __name__ == '__main__':
+    main()
--- a/fluid/image_classification/mobilenet.py
+++ b/fluid/image_classification/mobilenet.py
+import os
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+from paddle.v2.fluid.initializer import MSRA
+from paddle.v2.fluid.param_attr import ParamAttr
+
+# Parameter attribute using the MSRA initializer; passed as param_attr to the
+# conv2d and fc layers built in this file.
+parameter_attr = ParamAttr(initializer=MSRA())
+
+
+def conv_bn_layer(input,
+                  filter_size,
+                  num_filters,
+                  stride,
+                  padding,
+                  channels=None,
+                  num_groups=1,
+                  act='relu',
+                  use_cudnn=True):
+    """
+    Apply a bias-free 2-D convolution followed by batch normalization.
+
+    The convolution is created without an activation; `act` is applied by
+    the batch-norm layer instead. `channels` is accepted but not used here.
+    """
+    conv_out = fluid.layers.conv2d(
+        input=input,
+        filter_size=filter_size,
+        num_filters=num_filters,
+        groups=num_groups,
+        stride=stride,
+        padding=padding,
+        use_cudnn=use_cudnn,
+        param_attr=parameter_attr,
+        bias_attr=False,
+        act=None)
+    normalized = fluid.layers.batch_norm(input=conv_out, act=act)
+    return normalized
+
+
+def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride,
+                        scale):
+    """
+    Stack a grouped 3x3 conv_bn_layer and a 1x1 conv_bn_layer.
+
+    num_filters1, num_filters2 and num_groups are each multiplied by `scale`
+    and truncated to int. `stride` applies to the 3x3 stage only; the 1x1
+    stage always uses stride 1.
+    """
+
+    def scaled(width):
+        return int(width * scale)
+
+    # grouped 3x3 convolution (cudnn disabled for this stage)
+    grouped = conv_bn_layer(
+        input=input,
+        filter_size=3,
+        num_filters=scaled(num_filters1),
+        stride=stride,
+        padding=1,
+        num_groups=scaled(num_groups),
+        use_cudnn=False)
+
+    # 1x1 convolution that sets the output filter count
+    return conv_bn_layer(
+        input=grouped,
+        filter_size=1,
+        num_filters=scaled(num_filters2),
+        stride=1,
+        padding=0)
+
+
+def mobile_net(img, class_dim, scale=1.0):
+    """
+    Build the network: a stem convolution, 13 depthwise-separable blocks,
+    global average pooling and a softmax fully-connected layer.
+
+    Args:
+        img: Input image variable.
+        class_dim: Size of the final softmax layer.
+        scale: Multiplier applied to the filter and group counts.
+    Returns:
+        The output of the final softmax fc layer.
+    """
+    # stem: 3x3 convolution with stride 2
+    net = conv_bn_layer(
+        img,
+        filter_size=3,
+        channels=3,
+        num_filters=int(32 * scale),
+        stride=2,
+        padding=1)
+
+    # (num_filters1, num_filters2, num_groups, stride) per block, in order;
+    # every stride-2 block halves the spatial resolution.
+    block_specs = (
+        (32, 64, 32, 1),
+        (64, 128, 64, 2),
+        (128, 128, 128, 1),
+        (128, 256, 128, 2),
+        (256, 256, 256, 1),
+        (256, 512, 256, 2),
+    ) + ((512, 512, 512, 1), ) * 5 + (
+        (512, 1024, 512, 2),
+        (1024, 1024, 1024, 1), )
+
+    for filters_in, filters_out, groups, block_stride in block_specs:
+        net = depthwise_separable(
+            net,
+            num_filters1=filters_in,
+            num_filters2=filters_out,
+            num_groups=groups,
+            stride=block_stride,
+            scale=scale)
+
+    # global average pooling over the remaining spatial extent
+    net = fluid.layers.pool2d(
+        input=net,
+        pool_size=0,
+        pool_stride=1,
+        pool_type='avg',
+        global_pooling=True)
+
+    return fluid.layers.fc(input=net,
+                           size=class_dim,
+                           act='softmax',
+                           param_attr=parameter_attr)
+
+
+def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
+    """
+    Train mobile_net on the paddle flowers dataset and save it periodically.
+
+    Args:
+        learning_rate: Learning rate of the Momentum optimizer.
+        batch_size: Batch size used for both the train and the test reader.
+        num_passes: Number of passes over the training reader.
+        model_save_dir: Directory under which an inference model is saved,
+            in a sub-directory named after the pass id, every 10th pass.
+    """
+    class_dim = 102
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    out = mobile_net(image, class_dim=class_dim)
+
+    cost = fluid.layers.cross_entropy(input=out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=learning_rate,
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(5 * 1e-5))
+    opts = optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=out, label=label)
+
+    # Clone the main program and attach a separate accuracy evaluator to it
+    # for evaluation on the test reader.
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_accuracy = fluid.evaluator.Accuracy(input=out, label=label)
+        test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = paddle.batch(
+        paddle.dataset.flowers.train(), batch_size=batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.flowers.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    for pass_id in range(num_passes):
+        # training: the evaluator accumulates accuracy over the whole pass
+        accuracy.reset(exe)
+        for batch_id, data in enumerate(train_reader()):
+            loss, acc = exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            print("Pass {0}, batch {1}, loss {2}, acc {3}".format(
+                pass_id, batch_id, loss[0], acc[0]))
+        pass_acc = accuracy.eval(exe)
+
+        # evaluation: per-batch values are fetched but only the accumulated
+        # pass accuracy is reported
+        test_accuracy.reset(exe)
+        for data in test_reader():
+            loss, acc = exe.run(inference_program,
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + test_accuracy.metrics)
+        test_pass_acc = test_accuracy.eval(exe)
+        print("End pass {0}, train_acc {1}, test_acc {2}".format(
+            pass_id, pass_acc, test_pass_acc))
+        if pass_id % 10 == 0:
+            model_path = os.path.join(model_save_dir, str(pass_id))
+            # call syntax prints the same text on Python 2 and is also valid
+            # on Python 3, matching the other print calls in this function
+            print('save models to %s' % (model_path))
+            fluid.io.save_inference_model(model_path, ['image'], [out], exe)
+
+
+if __name__ == '__main__':
+    train(learning_rate=0.005, batch_size=40, num_passes=300)
--- a/fluid/image_classification/se_resnext.py
+++ b/fluid/image_classification/se_resnext.py
@@ -103,66 +103,87 @@ def train(learning_rate,
          batch_size,
          num_passes,
          init_model=None,
-          model_save_dir='model'):
+          model_save_dir='model',
+          parallel=True):
    class_dim = 1000
    image_shape = [3, 224, 224]

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

-    out = SE_ResNeXt(input=image, class_dim=class_dim)
-
-    cost = fluid.layers.cross_entropy(input=out, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+
+        with pd.do():
+            image_ = pd.read_input(image)
+            label_ = pd.read_input(label)
+            out = SE_ResNeXt(input=image_, class_dim=class_dim)
+            cost = fluid.layers.cross_entropy(input=out, label=label_)
+            avg_cost = fluid.layers.mean(x=cost)
+            accuracy = fluid.layers.accuracy(input=out, label=label_)
+            pd.write_output(avg_cost)
+            pd.write_output(accuracy)
+
+        avg_cost, accuracy = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        accuracy = fluid.layers.mean(x=accuracy)
+    else:
+        out = SE_ResNeXt(input=image, class_dim=class_dim)
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        accuracy = fluid.layers.accuracy(input=out, label=label)

    optimizer = fluid.optimizer.Momentum(
        learning_rate=learning_rate,
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))
    opts = optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=out, label=label)

    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
-        test_accuracy = fluid.evaluator.Accuracy(input=out, label=label)
-        test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
+        inference_program = fluid.io.get_inference_program([avg_cost, accuracy])

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if init_model is not None:
-        fluid.io.load_persistables_if_exist(exe, init_model)
+        fluid.io.load_persistables(exe, init_model)

    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    for pass_id in range(num_passes):
-        accuracy.reset(exe)
        for batch_id, data in enumerate(train_reader()):
-            loss, acc = exe.run(fluid.default_main_program(),
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost] + accuracy.metrics)
-            print("Pass {0}, batch {1}, loss {2}, acc {3}".format(
-                pass_id, batch_id, loss[0], acc[0]))
-        pass_acc = accuracy.eval(exe)
-
-        test_accuracy.reset(exe)
+            loss = exe.run(fluid.default_main_program(),
+                           feed=feeder.feed(data),
+                           fetch_list=[avg_cost])
+            print("Pass {0}, batch {1}, loss {2}".format(pass_id, batch_id,
+                                                         float(loss[0])))
+
+        total_loss = 0.0
+        total_acc = 0.0
+        total_batch = 0
        for data in test_reader():
            loss, acc = exe.run(inference_program,
                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost] + test_accuracy.metrics)
-        test_pass_acc = test_accuracy.eval(exe)
-        print("End pass {0}, train_acc {1}, test_acc {2}".format(
-            pass_id, pass_acc, test_pass_acc))
+                                fetch_list=[avg_cost, accuracy])
+            total_loss += float(loss)
+            total_acc += float(acc)
+            total_batch += 1
+        print("End pass {0}, test_loss {1}, test_acc {2}".format(
+            pass_id, total_loss / total_batch, total_acc / total_batch))

        model_path = os.path.join(model_save_dir, str(pass_id))
-        if not os.path.isdir(model_path):
-            os.makedirs(model_path)
-        fluid.io.save_persistables(exe, model_path)
+        fluid.io.save_inference_model(model_path, ['image'], [out], exe)


 if __name__ == '__main__':
-    train(learning_rate=0.1, batch_size=8, num_passes=100, init_model=None)
+    train(
+        learning_rate=0.1,
+        batch_size=8,
+        num_passes=100,
+        init_model=None,
+        parallel=False)
--- a/fluid/ocr_recognition/ctc_reader.py
+++ b/fluid/ocr_recognition/ctc_reader.py
+import os
+import cv2
+import numpy as np
+from PIL import Image
+
+from paddle.v2.image import load_image
+
+
+class DataGenerator(object):
+    def __init__(self):
+        """Create a generator; no instance state is set up."""
+
+    def train_reader(self, img_root_dir, img_label_list, batchsize):
+        '''
+        Reader interface for training.
+
+        The label list is shuffled by a shell pipeline into "tmp.txt" in the
+        current working directory; batches are then built from that file.
+        Each line is split on spaces: the third field is used as the image
+        name and the last field as a comma-separated list of integer labels.
+
+        :param img_root_dir: The root path of the image for training.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the <image_name, label> file for training.
+        :type img_label_list: str
+
+        :param batchsize: Number of samples in every yielded batch.
+        :type batchsize: int
+
+        :return: A reader function yielding lists of [image, label] pairs.
+        '''
+
+        to_file = "tmp.txt"
+        # NOTE(review): img_label_list is concatenated into a shell command
+        # without quoting, so it must be a trusted path free of shell
+        # metacharacters.
+        if batchsize == 1:
+            cmd = "cat " + img_label_list + " | awk '{print $1,$2,$3,$4;}' | shuf > " + to_file
+        else:
+            batch_str = str(batchsize)
+            #cmd1: partial shuffle (sort on first field plus a random key,
+            # then drop a random number of leading lines)
+            cmd = "cat " + img_label_list + " | awk '{printf(\"%04d%.4f %s\\n\", $1, rand(), $0)}' | sort | sed 1,$((1 + RANDOM % 100))d | "
+            #cmd2: batch merge and shuffle
+            cmd += "awk '{printf $2\" \"$3\" \"$4\" \"$5\" \"; if(NR % " + batch_str + " == 0) print \"\";}' | shuf | "
+            #cmd3: batch split
+            cmd += "awk '{if(NF == " + batch_str + " * 4) {for(i = 0; i < " + batch_str + "; i++) print $(4*i+1)\" \"$(4*i+2)\" \"$(4*i+3)\" \"$(4*i+4);}}' > " + to_file
+
+        print("cmd: " + cmd)
+        os.system(cmd)
+        print("finish batch shuffle")
+        # read the shuffled list and close the file right away
+        with open(to_file, 'r') as shuffled_file:
+            img_label_lines = shuffled_file.readlines()
+
+        def reader():
+            # floor division keeps the batch count an int on Python 3 as well;
+            # a trailing incomplete batch is dropped
+            sizes = len(img_label_lines) // batchsize
+            for i in range(sizes):
+                result = []
+                sz = [0, 0]
+                for j in range(batchsize):
+                    line = img_label_lines[i * batchsize + j]
+                    # h, w, img_name, labels
+                    items = line.split(' ')
+
+                    label = [int(c) for c in items[-1].split(',')]
+                    # convert to grayscale
+                    img = Image.open(os.path.join(img_root_dir, items[
+                        2])).convert('L')
+                    # every image in a batch is resized to the size of the
+                    # batch's first image
+                    if j == 0:
+                        sz = img.size
+                    img = img.resize((sz[0], sz[1]))
+                    img = np.array(img) - 127.5
+                    # add a leading axis in front of the image array
+                    img = img[np.newaxis, ...]
+                    result.append([img, label])
+                yield result
+
+        return reader
+
+    def test_reader(self, img_root_dir, img_label_list):
+        '''
+        Reader interface for inference.
+
+        Each line of the label file is split on spaces: the third field is
+        used as the image name and the last field as a comma-separated list
+        of integer labels.
+
+        :param img_root_dir: The root path of the images for testing.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the <image_name, label> file for testing.
+        :type img_label_list: str
+
+        :return: A reader function yielding (image, label) pairs.
+        '''
+
+        def reader():
+            # the context manager closes the label file once the generator
+            # is exhausted or closed
+            with open(img_label_list) as label_file:
+                for line in label_file:
+                    # h, w, img_name, labels
+                    items = line.split(' ')
+
+                    label = [int(c) for c in items[-1].split(',')]
+                    # convert to grayscale
+                    img = Image.open(os.path.join(img_root_dir, items[
+                        2])).convert('L')
+                    img = np.array(img) - 127.5
+                    # add a leading axis in front of the image array
+                    img = img[np.newaxis, ...]
+                    yield img, label
+
+        return reader